From 413f1aed27baee6e543ad2992f7a829deaef670a Mon Sep 17 00:00:00 2001
From: Mick Semb Wever
Date: Sun, 1 May 2022 23:22:08 +0200
Subject: [PATCH 001/159] Create release branch cassandra-4.1, increment trunk
 version to 4.2

---
 CHANGES.txt      |  6 ++++++
 NEWS.txt         | 14 ++++++++++++++
 build.xml        |  2 +-
 debian/changelog |  2 +-
 4 files changed, 22 insertions(+), 2 deletions(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index 050767a9438c..dc7f81eec759 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,3 +1,9 @@
+4.2
+Merged from 4.0:
+Merged from 3.11:
+Merged from 3.0:
+
+
 4.1
  * Add auto_snapshot_ttl configuration (CASSANDRA-16790)
  * List snapshots of dropped tables (CASSANDRA-16843)

diff --git a/NEWS.txt b/NEWS.txt
index 1280d0237922..5695e16f4bea 100644
--- a/NEWS.txt
+++ b/NEWS.txt
@@ -51,6 +51,20 @@ restore snapshots created with the previous major version using the
 'sstableloader' tool. You can upgrade the file format of your snapshots
 using the provided 'sstableupgrade' tool.
 
+
+4.2
+===
+
+New features
+------------
+
+Upgrading
+---------
+
+Deprecation
+-----------
+
+
 4.1
 ===

diff --git a/build.xml b/build.xml
index 343f7e87a055..d0466578ac06 100644
--- a/build.xml
+++ b/build.xml
@@ -33,7 +33,7 @@
-    <property name="base.version" value="4.1"/>
+    <property name="base.version" value="4.2"/>

diff --git a/debian/changelog b/debian/changelog
index 53ea78889049..b2397e52fb16 100644
--- a/debian/changelog
+++ b/debian/changelog
@@ -1,4 +1,4 @@
-cassandra (4.1) UNRELEASED; urgency=medium
+cassandra (4.2) UNRELEASED; urgency=medium
 
  * New release

From c48906394e01460382f4070ecc34f6f9754fc567 Mon Sep 17 00:00:00 2001
From: Yifan Cai
Date: Thu, 5 May 2022 14:15:49 -0700
Subject: [PATCH 002/159] Fix testCDCIndexFileWriteOnSync and document cdc
 index file read edge case

Patch by Yifan Cai; reviewed by Josh McKenzie for CASSANDRA-17416
---
 .../cassandra/pages/operating/cdc.adoc        |  7 ++++-
 .../CommitLogSegmentManagerCDCTest.java       | 29 +++++++++++++++----
 2 files changed, 30 insertions(+), 6 deletions(-)

diff --git a/doc/modules/cassandra/pages/operating/cdc.adoc b/doc/modules/cassandra/pages/operating/cdc.adoc
index 4ce9b0f52a6e..e12decd696be 100644
--- a/doc/modules/cassandra/pages/operating/cdc.adoc
+++ b/doc/modules/cassandra/pages/operating/cdc.adoc
@@ -16,12 +16,17 @@ persisted to disk. Upon final segment flush, a second line with the
 human-readable word "COMPLETED" will be added to the _cdc.idx file
 indicating that Cassandra has completed all processing on the file.
 
-We we use an index file rather than just encouraging clients to parse
+We use an index file rather than just encouraging clients to parse
 the log realtime off a memory mapped handle as data can be reflected in
 a kernel buffer that is not yet persisted to disk. Parsing only up to
 the listed offset in the _cdc.idx file will ensure that you only parse
 CDC data for data that is durable.
+
+Please note that in rare cases, e.g. with a slow disk, it is possible for the
+consumer to read an empty value from the _cdc.idx file, because the update is
+performed by first truncating the file and then writing to it. In such a
+case, the consumer should retry reading the index file.
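For consumers of the _cdc.idx format described above, a minimal client-side sketch of the
read-then-retry protocol (a hypothetical helper, not part of this patch or of the Cassandra
tree; the retry bounds simply mirror the 5-attempt, 10 ms loop in the test change below):

    import java.io.IOException;
    import java.nio.file.Files;
    import java.nio.file.Path;
    import java.util.List;

    /** Hypothetical CDC consumer helper for reading a _cdc.idx file. */
    final class CdcIndexReader
    {
        /**
         * Reads the durable offset (first line of the index file), retrying because
         * the file is updated by truncate-then-write and may transiently be empty.
         */
        static int readDurableOffset(Path idxFile) throws IOException, InterruptedException
        {
            for (int attempt = 0; attempt < 5; attempt++)
            {
                List<String> lines = Files.readAllLines(idxFile);
                if (!lines.isEmpty() && !lines.get(0).isEmpty())
                    return Integer.parseInt(lines.get(0)); // parse CDC data only up to this offset
                Thread.sleep(10); // small pause before re-reading
            }
            throw new IOException("cdc index file still empty after retries: " + idxFile);
        }

        /** True once Cassandra has appended the "COMPLETED" marker as a second line. */
        static boolean isCompleted(Path idxFile) throws IOException
        {
            List<String> lines = Files.readAllLines(idxFile);
            return lines.size() > 1 && "COMPLETED".equals(lines.get(1));
        }
    }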
+ A threshold of total disk space allowed is specified in the yaml at which time newly allocated CommitLogSegments will not allow CDC data until a consumer parses and removes files from the specified cdc_raw diff --git a/test/unit/org/apache/cassandra/db/commitlog/CommitLogSegmentManagerCDCTest.java b/test/unit/org/apache/cassandra/db/commitlog/CommitLogSegmentManagerCDCTest.java index 076f2fbad24f..da1586db6c62 100644 --- a/test/unit/org/apache/cassandra/db/commitlog/CommitLogSegmentManagerCDCTest.java +++ b/test/unit/org/apache/cassandra/db/commitlog/CommitLogSegmentManagerCDCTest.java @@ -24,7 +24,9 @@ import java.nio.file.Files; import java.nio.file.Path; import java.util.*; +import java.util.concurrent.TimeUnit; +import com.google.monitoring.runtime.instrumentation.common.util.concurrent.Uninterruptibles; import org.apache.cassandra.io.util.File; import org.apache.cassandra.io.util.FileReader; import org.junit.Assert; @@ -157,11 +159,28 @@ public void testCDCIndexFileWriteOnSync() throws IOException Assert.assertTrue("Index file not written: " + cdcIndexFile, cdcIndexFile.exists()); // Read index value and confirm it's == end from last sync - BufferedReader in = new BufferedReader(new FileReader(cdcIndexFile)); - String input = in.readLine(); - Integer offset = Integer.parseInt(input); - Assert.assertEquals(syncOffset, (long)offset); - in.close(); + String input = null; + // There could be a race between index file update (truncate & write) and read. See CASSANDRA-17416 + // It is possible to read an empty line. In this case, re-try at most 5 times. + for (int i = 0; input == null && i < 5; i++) + { + if (i != 0) // add a little pause between each attempt + Uninterruptibles.sleepUninterruptibly(10, TimeUnit.MILLISECONDS); + + try (BufferedReader in = new BufferedReader(new FileReader(cdcIndexFile))) + { + input = in.readLine(); + } + } + + if (input == null) + { + Assert.fail("Unable to read the CDC index file after several attempts"); + } + + int indexOffset = Integer.parseInt(input); + Assert.assertTrue("The offset read from CDC index file should be equal or larger than the offset after sync. 
See CASSANDRA-17416", + syncOffset <= indexOffset); } @Test From 013acc641c5d487b07be5c082af1e85d26bd127f Mon Sep 17 00:00:00 2001 From: Bernardo Botella Corbi Date: Mon, 18 Apr 2022 16:09:21 -0700 Subject: [PATCH 003/159] Rename DisableFlag class to EnableFlag on guardrails MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit patch by Bernardo Botella Corbi; reviewed by Andrés de la Peña, Josh McKenzie and Yifan Cai for CASSANDRA-17544 --- CHANGES.txt | 1 + .../cassandra/db/guardrails/DisableFlag.java | 86 ------------------- .../cassandra/db/guardrails/EnableFlag.java | 86 +++++++++++++++++++ .../cassandra/db/guardrails/Guardrails.java | 64 +++++++------- .../db/guardrails/GuardrailsTest.java | 32 +++---- 5 files changed, 135 insertions(+), 134 deletions(-) delete mode 100644 src/java/org/apache/cassandra/db/guardrails/DisableFlag.java create mode 100644 src/java/org/apache/cassandra/db/guardrails/EnableFlag.java diff --git a/CHANGES.txt b/CHANGES.txt index d39b8cbc3127..306acc1ec435 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -1,4 +1,5 @@ 4.2 + * Rename DisableFlag class to EnableFlag on guardrails (CASSANDRA-17544) Merged from 4.1: Merged from 4.0: Merged from 3.11: diff --git a/src/java/org/apache/cassandra/db/guardrails/DisableFlag.java b/src/java/org/apache/cassandra/db/guardrails/DisableFlag.java deleted file mode 100644 index 9ec1951d2782..000000000000 --- a/src/java/org/apache/cassandra/db/guardrails/DisableFlag.java +++ /dev/null @@ -1,86 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.cassandra.db.guardrails; - -import java.util.function.Predicate; -import javax.annotation.Nullable; - -import org.apache.cassandra.service.ClientState; - -/** - * A guardrail that completely disables the use of a particular feature. - * - *

Note that this guardrail only aborts operations (if the feature is disabled) so is only meant for - * query-based guardrails (we're happy to reject queries deemed dangerous, but we don't want to create a guardrail - * that breaks compaction for instance). - */ -public class DisableFlag extends Guardrail -{ - private final Predicate disabled; - private final String what; - - /** - * Creates a new {@link DisableFlag} guardrail. - * - * @param name the identifying name of the guardrail - * @param disabled a {@link ClientState}-based supplier of boolean indicating whether the feature guarded by this - * guardrail must be disabled. - * @param what The feature that is guarded by this guardrail (for reporting in error messages), - * {@link DisableFlag#ensureEnabled(String, ClientState)} can specify a different {@code what}. - */ - public DisableFlag(String name, Predicate disabled, String what) - { - super(name); - this.disabled = disabled; - this.what = what; - } - - /** - * Aborts the operation if this guardrail is disabled. - * - *

This must be called when the feature guarded by this guardrail is used to ensure such use is in fact - * allowed. - * - * @param state The client state, used to skip the check if the query is internal or is done by a superuser. - * A {@code null} value means that the check should be done regardless of the query. - */ - public void ensureEnabled(@Nullable ClientState state) - { - ensureEnabled(what, state); - } - - /** - * Aborts the operation if this guardrail is disabled. - * - *

This must be called when the feature guarded by this guardrail is used to ensure such use is in fact - * allowed. - * - * @param what The feature that is guarded by this guardrail (for reporting in error messages). - * @param state The client state, used to skip the check if the query is internal or is done by a superuser. - * A {@code null} value means that the check should be done regardless of the query, although it won't - * throw any exception if the failure threshold is exceeded. This is so because checks without an - * associated client come from asynchronous processes such as compaction, and we don't want to - * interrupt such processes. - */ - public void ensureEnabled(String what, @Nullable ClientState state) - { - if (enabled(state) && disabled.test(state)) - fail(what + " is not allowed", state); - } -} diff --git a/src/java/org/apache/cassandra/db/guardrails/EnableFlag.java b/src/java/org/apache/cassandra/db/guardrails/EnableFlag.java new file mode 100644 index 000000000000..aba013a65ff3 --- /dev/null +++ b/src/java/org/apache/cassandra/db/guardrails/EnableFlag.java @@ -0,0 +1,86 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.db.guardrails; + +import java.util.function.Predicate; +import javax.annotation.Nullable; + +import org.apache.cassandra.service.ClientState; + +/** + * A guardrail that enables the use of a particular feature. + * + *

Note that this guardrail only aborts operations (if the feature is not enabled) so is only meant for query-based
+ * guardrails (we're happy to reject queries deemed dangerous, but we don't want to create a guardrail that breaks
+ * compaction for instance).
+ */
+public class EnableFlag extends Guardrail
+{
+    private final Predicate<ClientState> enabled;
+    private final String featureName;
+
+    /**
+     * Creates a new {@link EnableFlag} guardrail.
+     *
+     * @param name        the identifying name of the guardrail
+     * @param enabled     a {@link ClientState}-based supplier of boolean indicating whether the feature guarded by this
+     *                    guardrail is enabled.
+     * @param featureName The feature that is guarded by this guardrail (for reporting in error messages), {@link
+     *                    EnableFlag#ensureEnabled(String, ClientState)} can specify a different {@code featureName}.
+     */
+    public EnableFlag(String name, Predicate<ClientState> enabled, String featureName)
+    {
+        super(name);
+        this.enabled = enabled;
+        this.featureName = featureName;
+    }
+
+    /**
+     * Aborts the operation if this guardrail is not enabled.
+     *
+     *

This must be called when the feature guarded by this guardrail is used to ensure such use is in fact + * allowed. + * + * @param state The client state, used to skip the check if the query is internal or is done by a superuser. + * A {@code null} value means that the check should be done regardless of the query. + */ + public void ensureEnabled(@Nullable ClientState state) + { + ensureEnabled(featureName, state); + } + + /** + * Aborts the operation if this guardrail is not enabled. + * + *

This must be called when the feature guarded by this guardrail is used to ensure such use is in fact + * allowed. + * + * @param featureName The feature that is guarded by this guardrail (for reporting in error messages). + * @param state The client state, used to skip the check if the query is internal or is done by a superuser. A + * {@code null} value means that the check should be done regardless of the query, although it + * won't throw any exception if the failure threshold is exceeded. This is so because checks + * without an associated client come from asynchronous processes such as compaction, and we don't + * want to interrupt such processes. + */ + public void ensureEnabled(String featureName, @Nullable ClientState state) + { + if (enabled(state) && !enabled.test(state)) + fail(featureName + " is not allowed", state); + } +} diff --git a/src/java/org/apache/cassandra/db/guardrails/Guardrails.java b/src/java/org/apache/cassandra/db/guardrails/Guardrails.java index b670f87cd766..a4811ac7f6b2 100644 --- a/src/java/org/apache/cassandra/db/guardrails/Guardrails.java +++ b/src/java/org/apache/cassandra/db/guardrails/Guardrails.java @@ -104,10 +104,10 @@ public final class Guardrails implements GuardrailsMBean /** * Guardrail disabling user's ability to create secondary indexes */ - public static final DisableFlag createSecondaryIndexesEnabled = - new DisableFlag("secondary_indexes", - state -> !CONFIG_PROVIDER.getOrCreate(state).getSecondaryIndexesEnabled(), - "User creation of secondary indexes"); + public static final EnableFlag createSecondaryIndexesEnabled = + new EnableFlag("secondary_indexes", + state -> CONFIG_PROVIDER.getOrCreate(state).getSecondaryIndexesEnabled(), + "User creation of secondary indexes"); /** * Guardrail on the number of materialized views per table. @@ -135,36 +135,36 @@ public final class Guardrails implements GuardrailsMBean /** * Guardrail disabling user-provided timestamps. 
*/ - public static final DisableFlag userTimestampsEnabled = - new DisableFlag("user_timestamps", - state -> !CONFIG_PROVIDER.getOrCreate(state).getUserTimestampsEnabled(), - "User provided timestamps (USING TIMESTAMP)"); + public static final EnableFlag userTimestampsEnabled = + new EnableFlag("user_timestamps", + state -> CONFIG_PROVIDER.getOrCreate(state).getUserTimestampsEnabled(), + "User provided timestamps (USING TIMESTAMP)"); - public static final DisableFlag groupByEnabled = - new DisableFlag("group_by", - state -> !CONFIG_PROVIDER.getOrCreate(state).getGroupByEnabled(), - "GROUP BY functionality"); + public static final EnableFlag groupByEnabled = + new EnableFlag("group_by", + state -> CONFIG_PROVIDER.getOrCreate(state).getGroupByEnabled(), + "GROUP BY functionality"); - public static final DisableFlag dropTruncateTableEnabled = - new DisableFlag("drop_truncate_table_enabled", - state -> !CONFIG_PROVIDER.getOrCreate(state).getDropTruncateTableEnabled(), - "DROP and TRUNCATE TABLE functionality"); + public static final EnableFlag dropTruncateTableEnabled = + new EnableFlag("drop_truncate_table_enabled", + state -> CONFIG_PROVIDER.getOrCreate(state).getDropTruncateTableEnabled(), + "DROP and TRUNCATE TABLE functionality"); /** * Guardrail disabling user's ability to turn off compression */ - public static final DisableFlag uncompressedTablesEnabled = - new DisableFlag("uncompressed_tables_enabled", - state -> !CONFIG_PROVIDER.getOrCreate(state).getUncompressedTablesEnabled(), - "Uncompressed table"); + public static final EnableFlag uncompressedTablesEnabled = + new EnableFlag("uncompressed_tables_enabled", + state -> CONFIG_PROVIDER.getOrCreate(state).getUncompressedTablesEnabled(), + "Uncompressed table"); /** * Guardrail disabling the creation of new COMPACT STORAGE tables */ - public static final DisableFlag compactTablesEnabled = - new DisableFlag("compact_tables", - state -> !CONFIG_PROVIDER.getOrCreate(state).getCompactTablesEnabled(), - "Creation of new COMPACT STORAGE tables"); + public static final EnableFlag compactTablesEnabled = + new EnableFlag("compact_tables", + state -> CONFIG_PROVIDER.getOrCreate(state).getCompactTablesEnabled(), + "Creation of new COMPACT STORAGE tables"); /** * Guardrail on the number of elements returned within page. @@ -197,18 +197,18 @@ public final class Guardrails implements GuardrailsMBean /** * Guardrail disabling operations on lists that require read before write. 
*/ - public static final DisableFlag readBeforeWriteListOperationsEnabled = - new DisableFlag("read_before_write_list_operations", - state -> !CONFIG_PROVIDER.getOrCreate(state).getReadBeforeWriteListOperationsEnabled(), - "List operation requiring read before write"); + public static final EnableFlag readBeforeWriteListOperationsEnabled = + new EnableFlag("read_before_write_list_operations", + state -> CONFIG_PROVIDER.getOrCreate(state).getReadBeforeWriteListOperationsEnabled(), + "List operation requiring read before write"); /** * Guardrail disabling ALLOW FILTERING statement within a query */ - public static final DisableFlag allowFilteringEnabled = - new DisableFlag("allow_filtering", - state -> !CONFIG_PROVIDER.getOrCreate(state).getAllowFilteringEnabled(), - "Querying with ALLOW FILTERING"); + public static final EnableFlag allowFilteringEnabled = + new EnableFlag("allow_filtering", + state -> CONFIG_PROVIDER.getOrCreate(state).getAllowFilteringEnabled(), + "Querying with ALLOW FILTERING"); /** * Guardrail on the number of restrictions created by a cartesian product of a CQL's {@code IN} query. diff --git a/test/unit/org/apache/cassandra/db/guardrails/GuardrailsTest.java b/test/unit/org/apache/cassandra/db/guardrails/GuardrailsTest.java index a0a5823b018b..5c7e724abb10 100644 --- a/test/unit/org/apache/cassandra/db/guardrails/GuardrailsTest.java +++ b/test/unit/org/apache/cassandra/db/guardrails/GuardrailsTest.java @@ -67,8 +67,8 @@ public void testMaxThreshold() throws Throwable MaxThreshold guard = new MaxThreshold("x", state -> 10, state -> 100, - (isWarn, what, v, t) -> format("%s: for %s, %s > %s", - isWarn ? "Warning" : "Aborting", what, v, t)); + (isWarn, featureName, v, t) -> format("%s: for %s, %s > %s", + isWarn ? "Warning" : "Aborting", featureName, v, t)); assertTrue(guard.enabled(userClientState)); @@ -93,8 +93,8 @@ public void testWarnOnlyMaxThreshold() throws Throwable MaxThreshold guard = new MaxThreshold("x", state -> 10, state -> DISABLED, - (isWarn, what, v, t) -> format("%s: for %s, %s > %s", - isWarn ? "Warning" : "Aborting", what, v, t)); + (isWarn, featureName, v, t) -> format("%s: for %s, %s > %s", + isWarn ? "Warning" : "Aborting", featureName, v, t)); assertTrue(guard.enabled(userClientState)); @@ -111,8 +111,8 @@ public void testFailOnlyMaxThreshold() throws Throwable MaxThreshold guard = new MaxThreshold("x", state -> DISABLED, state -> 10, - (isWarn, what, v, t) -> format("%s: for %s, %s > %s", - isWarn ? "Warning" : "Aborting", what, v, t)); + (isWarn, featureName, v, t) -> format("%s: for %s, %s > %s", + isWarn ? "Warning" : "Aborting", featureName, v, t)); assertTrue(guard.enabled(userClientState)); @@ -129,8 +129,8 @@ public void testMaxThresholdUsers() throws Throwable MaxThreshold guard = new MaxThreshold("x", state -> 10, state -> 100, - (isWarn, what, v, t) -> format("%s: for %s, %s > %s", - isWarn ? "Warning" : "Failure", what, v, t)); + (isWarn, featureName, v, t) -> format("%s: for %s, %s > %s", + isWarn ? 
"Warning" : "Failure", featureName, v, t)); // value under both thresholds assertValid(() -> guard.guard(5, "x", false, null)); @@ -251,25 +251,25 @@ public void testMinThresholdUsers() throws Throwable } @Test - public void testDisableFlag() throws Throwable + public void testEnableFlag() throws Throwable { - assertFails(() -> new DisableFlag("x", state -> true, "X").ensureEnabled(userClientState), "X is not allowed"); - assertValid(() -> new DisableFlag("x", state -> false, "X").ensureEnabled(userClientState)); + assertFails(() -> new EnableFlag("x", state -> false, "X").ensureEnabled(userClientState), "X is not allowed"); + assertValid(() -> new EnableFlag("x", state -> true, "X").ensureEnabled(userClientState)); - assertFails(() -> new DisableFlag("x", state -> true, "X").ensureEnabled("Y", userClientState), "Y is not allowed"); - assertValid(() -> new DisableFlag("x", state -> false, "X").ensureEnabled("Y", userClientState)); + assertFails(() -> new EnableFlag("x", state -> false, "X").ensureEnabled("Y", userClientState), "Y is not allowed"); + assertValid(() -> new EnableFlag("x", state -> true, "X").ensureEnabled("Y", userClientState)); } @Test - public void testDisableFlagUsers() throws Throwable + public void testEnableFlagUsers() throws Throwable { - DisableFlag enabled = new DisableFlag("x", state -> false, "X"); + EnableFlag enabled = new EnableFlag("x", state -> true, "X"); assertValid(() -> enabled.ensureEnabled(null)); assertValid(() -> enabled.ensureEnabled(userClientState)); assertValid(() -> enabled.ensureEnabled(systemClientState)); assertValid(() -> enabled.ensureEnabled(superClientState)); - DisableFlag disabled = new DisableFlag("x", state -> true, "X"); + EnableFlag disabled = new EnableFlag("x", state -> false, "X"); assertFails(() -> disabled.ensureEnabled(userClientState), "X is not allowed"); assertValid(() -> disabled.ensureEnabled(systemClientState)); assertValid(() -> disabled.ensureEnabled(superClientState)); From ce515a3d77c2042575827e8bdb1da639fc957491 Mon Sep 17 00:00:00 2001 From: Josh McKenzie Date: Wed, 23 Mar 2022 12:42:36 -0400 Subject: [PATCH 004/159] Add guardrail for ALTER TABLE ADD / DROP / REMOVE column operations Patch by Josh McKenzie; reviewed by Jon Meredith for CASSANDRA-17495 --- CHANGES.txt | 1 + NEWS.txt | 2 + conf/cassandra.yaml | 28 +++- .../org/apache/cassandra/config/Config.java | 1 + .../cassandra/config/GuardrailsOptions.java | 14 ++ .../schema/AlterTableStatement.java | 13 ++ .../cassandra/db/guardrails/Guardrails.java | 20 +++ .../db/guardrails/GuardrailsConfig.java | 7 + .../db/guardrails/GuardrailsMBean.java | 14 ++ .../cassandra/service/StorageService.java | 1 - .../guardrails/GuardrailAlterTableTest.java | 133 ++++++++++++++++++ 11 files changed, 232 insertions(+), 2 deletions(-) create mode 100644 test/unit/org/apache/cassandra/db/guardrails/GuardrailAlterTableTest.java diff --git a/CHANGES.txt b/CHANGES.txt index 306acc1ec435..f3c79ce3dafe 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -1,4 +1,5 @@ 4.2 + * Add guardrail for ALTER TABLE ADD / DROP / REMOVE column operations (CASSANDRA-17495) * Rename DisableFlag class to EnableFlag on guardrails (CASSANDRA-17544) Merged from 4.1: Merged from 4.0: diff --git a/NEWS.txt b/NEWS.txt index fcc5d612b8d5..d42d03afe6d2 100644 --- a/NEWS.txt +++ b/NEWS.txt @@ -57,6 +57,8 @@ using the provided 'sstableupgrade' tool. 
New features ------------ + - New Guardrails added: + - Whether ALTER TABLE commands are allowed to mutate columns Upgrading --------- diff --git a/conf/cassandra.yaml b/conf/cassandra.yaml index ccc941da7653..6932a5fbcf39 100644 --- a/conf/cassandra.yaml +++ b/conf/cassandra.yaml @@ -1601,58 +1601,75 @@ drop_compact_storage_enabled: false # The two thresholds default to -1 to disable. # keyspaces_warn_threshold: -1 # keyspaces_fail_threshold: -1 +# # Guardrail to warn or fail when creating more user tables than threshold. # The two thresholds default to -1 to disable. # tables_warn_threshold: -1 # tables_fail_threshold: -1 +# # Guardrail to enable or disable the ability to create uncompressed tables # uncompressed_tables_enabled: true +# # Guardrail to warn or fail when creating/altering a table with more columns per table than threshold. # The two thresholds default to -1 to disable. # columns_per_table_warn_threshold: -1 # columns_per_table_fail_threshold: -1 +# # Guardrail to warn or fail when creating more secondary indexes per table than threshold. # The two thresholds default to -1 to disable. # secondary_indexes_per_table_warn_threshold: -1 # secondary_indexes_per_table_fail_threshold: -1 +# # Guardrail to enable or disable the creation of secondary indexes # secondary_indexes_enabled: true +# # Guardrail to warn or fail when creating more materialized views per table than threshold. # The two thresholds default to -1 to disable. # materialized_views_per_table_warn_threshold: -1 # materialized_views_per_table_fail_threshold: -1 +# # Guardrail to warn about, ignore or reject properties when creating tables. By default all properties are allowed. # table_properties_warned: [] # table_properties_ignored: [] # table_properties_disallowed: [] +# # Guardrail to allow/disallow user-provided timestamps. Defaults to true. # user_timestamps_enabled: true +# # Guardrail to allow/disallow GROUP BY functionality. # group_by_enabled: true +# # Guardrail to allow/disallow TRUNCATE and DROP TABLE statements # drop_truncate_table_enabled: true +# # Guardrail to warn or fail when using a page size greater than threshold. # The two thresholds default to -1 to disable. # page_size_warn_threshold: -1 # page_size_fail_threshold: -1 +# # Guardrail to allow/disallow list operations that require read before write, i.e. setting list element by index and # removing list elements by either index or value. Defaults to true. # read_before_write_list_operations_enabled: true +# # Guardrail to warn or fail when querying with an IN restriction selecting more partition keys than threshold. # The two thresholds default to -1 to disable. # partition_keys_in_select_warn_threshold: -1 # partition_keys_in_select_fail_threshold: -1 +# # Guardrail to warn or fail when an IN query creates a cartesian product with a size exceeding threshold, # eg. "a in (1,2,...10) and b in (1,2...10)" results in cartesian product of 100. # The two thresholds default to -1 to disable. # in_select_cartesian_product_warn_threshold: -1 # in_select_cartesian_product_fail_threshold: -1 +# # Guardrail to warn about or reject read consistency levels. By default, all consistency levels are allowed. # read_consistency_levels_warned: [] # read_consistency_levels_disallowed: [] +# # Guardrail to warn about or reject write consistency levels. By default, all consistency levels are allowed. 
# write_consistency_levels_warned: [] # write_consistency_levels_disallowed: [] +# # Guardrail to warn or fail when encountering larger size of collection data than threshold. # At query time this guardrail is applied only to the collection fragment that is being writen, even though in the case # of non-frozen collections there could be unaccounted parts of the collection on the sstables. This is done this way to @@ -1661,6 +1678,7 @@ drop_compact_storage_enabled: false # The two thresholds default to null to disable. # collection_size_warn_threshold: # collection_size_fail_threshold: +# # Guardrail to warn or fail when encountering more elements in collection than threshold. # At query time this guardrail is applied only to the collection fragment that is being writen, even though in the case # of non-frozen collections there could be unaccounted parts of the collection on the sstables. This is done this way to @@ -1669,12 +1687,18 @@ drop_compact_storage_enabled: false # The two thresholds default to -1 to disable. # items_per_collection_warn_threshold: -1 # items_per_collection_fail_threshold: -1 +# # Guardrail to allow/disallow querying with ALLOW FILTERING. Defaults to true. # allow_filtering_enabled: true +# # Guardrail to warn or fail when creating a user-defined-type with more fields in than threshold. # Default -1 to disable. # fields_per_udt_warn_threshold: -1 # fields_per_udt_fail_threshold: -1 +# +# Guardrail to indicate whether or not users are allowed to use ALTER TABLE commands to make column changes to tables +# alter_table_enabled: true +# # Guardrail to warn or fail when local data disk usage percentage exceeds threshold. Valid values are in [1, 100]. # This is only used for the disks storing data directories, so it won't count any separate disks used for storing # the commitlog, hints nor saved caches. The disk usage is the ratio between the amount of space used by the data @@ -1686,13 +1710,15 @@ drop_compact_storage_enabled: false # The two thresholds default to -1 to disable. # data_disk_usage_percentage_warn_threshold: -1 # data_disk_usage_percentage_fail_threshold: -1 -# Allows defining the max disk size of the data directories when calculating thresholds for +# +# Guardrail that allows users to define the max disk size of the data directories when calculating thresholds for # disk_usage_percentage_warn_threshold and disk_usage_percentage_fail_threshold, so if this is greater than zero they # become percentages of a fixed size on disk instead of percentages of the physically available disk size. This should # be useful when we have a large disk and we only want to use a part of it for Cassandra's data directories. # Valid values are in [1, max available disk size of all data directories]. # Defaults to null to disable and use the physically available disk size of data directories during calculations. # data_disk_usage_max_disk_size: +# # Guardrail to warn or fail when the minimum replication factor is lesser than threshold. # This would also apply to system keyspaces. 
# Suggested value for use in production: 2 or higher diff --git a/src/java/org/apache/cassandra/config/Config.java b/src/java/org/apache/cassandra/config/Config.java index 24b902978ef8..af0efbefca28 100644 --- a/src/java/org/apache/cassandra/config/Config.java +++ b/src/java/org/apache/cassandra/config/Config.java @@ -825,6 +825,7 @@ public static void setClientMode(boolean clientMode) public volatile Set write_consistency_levels_warned = Collections.emptySet(); public volatile Set write_consistency_levels_disallowed = Collections.emptySet(); public volatile boolean user_timestamps_enabled = true; + public volatile boolean alter_table_enabled = true; public volatile boolean group_by_enabled = true; public volatile boolean drop_truncate_table_enabled = true; public volatile boolean secondary_indexes_enabled = true; diff --git a/src/java/org/apache/cassandra/config/GuardrailsOptions.java b/src/java/org/apache/cassandra/config/GuardrailsOptions.java index a5144b941ba5..301a2527ee5c 100644 --- a/src/java/org/apache/cassandra/config/GuardrailsOptions.java +++ b/src/java/org/apache/cassandra/config/GuardrailsOptions.java @@ -385,6 +385,20 @@ public void setCompactTablesEnabled(boolean enabled) x -> config.compact_tables_enabled = x); } + @Override + public boolean getAlterTableEnabled() + { + return config.alter_table_enabled; + } + + public void setAlterTableEnabled(boolean enabled) + { + updatePropertyWithLogging("alter_table_enabled", + enabled, + () -> config.alter_table_enabled, + x -> config.alter_table_enabled = x); + } + @Override public boolean getReadBeforeWriteListOperationsEnabled() { diff --git a/src/java/org/apache/cassandra/cql3/statements/schema/AlterTableStatement.java b/src/java/org/apache/cassandra/cql3/statements/schema/AlterTableStatement.java index 27362526884a..94f72d427a6a 100644 --- a/src/java/org/apache/cassandra/cql3/statements/schema/AlterTableStatement.java +++ b/src/java/org/apache/cassandra/cql3/statements/schema/AlterTableStatement.java @@ -79,6 +79,7 @@ public abstract class AlterTableStatement extends AlterSchemaStatement { protected final String tableName; private final boolean ifExists; + protected ClientState state; public AlterTableStatement(String keyspaceName, String tableName, boolean ifExists) { @@ -87,6 +88,15 @@ public AlterTableStatement(String keyspaceName, String tableName, boolean ifExis this.ifExists = ifExists; } + @Override + public void validate(ClientState state) + { + super.validate(state); + + // save the query state to use it for guardrails validation in #apply + this.state = state; + } + public Keyspaces apply(Keyspaces schema) { KeyspaceMetadata keyspace = schema.getNullable(keyspaceName); @@ -187,6 +197,7 @@ public void validate(ClientState state) public KeyspaceMetadata apply(KeyspaceMetadata keyspace, TableMetadata table) { + Guardrails.alterTableEnabled.ensureEnabled("ALTER TABLE changing columns", state); TableMetadata.Builder tableBuilder = table.unbuild(); Views.Builder viewsBuilder = keyspace.views.unbuild(); newColumns.forEach(c -> addColumn(keyspace, table, c, ifColumnNotExists, tableBuilder, viewsBuilder)); @@ -289,6 +300,7 @@ private DropColumns(String keyspaceName, String tableName, Set public KeyspaceMetadata apply(KeyspaceMetadata keyspace, TableMetadata table) { + Guardrails.alterTableEnabled.ensureEnabled("ALTER TABLE changing columns", state); TableMetadata.Builder builder = table.unbuild(); removedColumns.forEach(c -> dropColumn(keyspace, table, c, ifColumnExists, builder)); return 
keyspace.withSwapped(keyspace.tables.withSwapped(builder.build())); @@ -356,6 +368,7 @@ private RenameColumns(String keyspaceName, String tableName, Map renameColumn(keyspace, table, o, n, ifColumnsExists, tableBuilder, viewsBuilder)); diff --git a/src/java/org/apache/cassandra/db/guardrails/Guardrails.java b/src/java/org/apache/cassandra/db/guardrails/Guardrails.java index a4811ac7f6b2..a991f69f6e3f 100644 --- a/src/java/org/apache/cassandra/db/guardrails/Guardrails.java +++ b/src/java/org/apache/cassandra/db/guardrails/Guardrails.java @@ -145,6 +145,14 @@ public final class Guardrails implements GuardrailsMBean state -> CONFIG_PROVIDER.getOrCreate(state).getGroupByEnabled(), "GROUP BY functionality"); + /** + * Guardrail disabling ALTER TABLE column mutation access. + */ + public static final EnableFlag alterTableEnabled = + new EnableFlag("alter_table", + state -> CONFIG_PROVIDER.getOrCreate(state).getAlterTableEnabled(), + "User access to ALTER TABLE statement for column mutation"); + public static final EnableFlag dropTruncateTableEnabled = new EnableFlag("drop_truncate_table_enabled", state -> CONFIG_PROVIDER.getOrCreate(state).getDropTruncateTableEnabled(), @@ -539,6 +547,18 @@ public void setUserTimestampsEnabled(boolean enabled) DEFAULT_CONFIG.setUserTimestampsEnabled(enabled); } + @Override + public boolean getAlterTableEnabled() + { + return DEFAULT_CONFIG.getAlterTableEnabled(); + } + + @Override + public void setAlterTableEnabled(boolean enabled) + { + DEFAULT_CONFIG.setAlterTableEnabled(enabled); + } + @Override public boolean getAllowFilteringEnabled() { diff --git a/src/java/org/apache/cassandra/db/guardrails/GuardrailsConfig.java b/src/java/org/apache/cassandra/db/guardrails/GuardrailsConfig.java index e33ea5ba41b2..4fcfb1882309 100644 --- a/src/java/org/apache/cassandra/db/guardrails/GuardrailsConfig.java +++ b/src/java/org/apache/cassandra/db/guardrails/GuardrailsConfig.java @@ -132,6 +132,13 @@ public interface GuardrailsConfig */ boolean getUserTimestampsEnabled(); + /** + * Returns whether users are allowed access to the ALTER TABLE statement to mutate columns or not + * + * @return {@code true} if ALTER TABLE ADD/REMOVE/RENAME is allowed, {@code false} otherwise. + */ + boolean getAlterTableEnabled(); + /** * Returns whether tables can be uncompressed * diff --git a/src/java/org/apache/cassandra/db/guardrails/GuardrailsMBean.java b/src/java/org/apache/cassandra/db/guardrails/GuardrailsMBean.java index ad2eddaec023..30be464d023c 100644 --- a/src/java/org/apache/cassandra/db/guardrails/GuardrailsMBean.java +++ b/src/java/org/apache/cassandra/db/guardrails/GuardrailsMBean.java @@ -250,6 +250,20 @@ public interface GuardrailsMBean */ void setCompactTablesEnabled(boolean enabled); + /** + * Gets whether users can use the ALTER TABLE statement to change columns + * + * @return {@code true} if ALTER TABLE is allowed, {@code false} otherwise. + */ + boolean getAlterTableEnabled(); + + /** + * Sets whether users can use the ALTER TABLE statement to change columns + * + * @param enabled {@code true} if changing columns is allowed, {@code false} otherwise. + */ + void setAlterTableEnabled(boolean enabled); + /** * Returns whether GROUP BY queries are allowed. 
* diff --git a/src/java/org/apache/cassandra/service/StorageService.java b/src/java/org/apache/cassandra/service/StorageService.java index dd35c70fdd2a..eb662493f036 100644 --- a/src/java/org/apache/cassandra/service/StorageService.java +++ b/src/java/org/apache/cassandra/service/StorageService.java @@ -17,7 +17,6 @@ */ package org.apache.cassandra.service; - import java.io.ByteArrayInputStream; import java.io.DataInputStream; import java.io.IOError; diff --git a/test/unit/org/apache/cassandra/db/guardrails/GuardrailAlterTableTest.java b/test/unit/org/apache/cassandra/db/guardrails/GuardrailAlterTableTest.java new file mode 100644 index 000000000000..1aa7095e191f --- /dev/null +++ b/test/unit/org/apache/cassandra/db/guardrails/GuardrailAlterTableTest.java @@ -0,0 +1,133 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.db.guardrails; + +import org.junit.After; +import org.junit.Before; +import org.junit.Test; + + +/** + * Tests the guardrail for disabling user access to the ALTER TABLE statement, {@link Guardrails#alterTableEnabled}. + * + * NOTE: This test class depends on {@link #currentTable()} method for setup, cleanup, and execution of tests. You'll + * need to refactor this if you add tests that make changes to the current table as the test teardown will no longer match + * setup. 
+ */ +public class GuardrailAlterTableTest extends GuardrailTester +{ + public GuardrailAlterTableTest() + { + super(Guardrails.alterTableEnabled); + } + + @Before + public void setupTest() throws Throwable + { + createTable("CREATE TABLE IF NOT EXISTS %s (k INT, c INT, v TEXT, PRIMARY KEY(k, c))"); + } + + @After + public void afterTest() throws Throwable + { + dropTable("DROP TABLE %s"); + setGuardrail(true); + } + + private void setGuardrail(boolean alterTableEnabled) + { + guardrails().setAlterTableEnabled(alterTableEnabled); + } + + /** + * Confirm that ALTER TABLE queries either work (guardrail enabled) or fail (guardrail disabled) appropriately + * @throws Throwable + */ + @Test + public void testGuardrailEnabledAndDisabled() throws Throwable + { + setGuardrail(false); + assertFails("ALTER TABLE %s ADD test_one text;", "changing columns"); + + setGuardrail(true); + assertValid("ALTER TABLE %s ADD test_two text;"); + + setGuardrail(false); + assertFails("ALTER TABLE %s ADD test_three text;", "changing columns"); + } + + /** + * Confirm the guardrail appropriately catches the ALTER DROP case on a column + * @throws Throwable + */ + @Test + public void testAppliesToAlterDropColumn() throws Throwable + { + setGuardrail(true); + assertValid("ALTER TABLE %s ADD test_one text;"); + + setGuardrail(false); + assertFails("ALTER TABLE %s DROP test_one", "changing columns"); + + setGuardrail(true); + assertValid("ALTER TABLE %s DROP test_one"); + } + + /** + * Confirm the guardrail appropriately catches the ALTER RENAME case on a column + * @throws Throwable + */ + @Test + public void testAppliesToAlterRenameColumn() throws Throwable + { + setGuardrail(false); + assertFails("ALTER TABLE %s RENAME c TO renamed_c", "changing columns"); + + setGuardrail(true); + assertValid("ALTER TABLE %s RENAME c TO renamed_c"); + } + + /** + * Confirm we can always alter properties via the options map regardless of guardrail state + * @throws Throwable + */ + @Test + public void testAlterViaMapAlwaysWorks() throws Throwable + { + setGuardrail(false); + assertValid("ALTER TABLE %s WITH compression = { 'class' : 'SnappyCompressor', 'chunk_length_in_kb' : 32 };"); + + setGuardrail(true); + assertValid("ALTER TABLE %s WITH compression = { 'class' : 'SnappyCompressor', 'chunk_length_in_kb' : 32 };"); + } + + /** + * Confirm the other form of ALTER TABLE property map changing always works regardless of guardrail state + * @throws Throwable + */ + @Test + public void testAlterOptionsAlwaysWorks() throws Throwable + { + setGuardrail(true); + assertValid("ALTER TABLE %s WITH GC_GRACE_SECONDS = 456; "); + + setGuardrail(false); + assertValid("ALTER TABLE %s WITH GC_GRACE_SECONDS = 123; "); + } +} From 26dd119679605bf61ad3caa24a70509e5be5aac9 Mon Sep 17 00:00:00 2001 From: Yifan Cai Date: Wed, 18 May 2022 10:27:16 -0700 Subject: [PATCH 005/159] Add new CQL function maxWritetime MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit patch by Yifan Cai; reviewed by Andres de la Peña, Francisco Guerrero for CASSANDRA-17425 --- CHANGES.txt | 1 + NEWS.txt | 3 + doc/cql3/CQL.textile | 6 +- .../cassandra/pages/cql/appendices.adoc | 1 + .../cassandra/pages/cql/cql_singlefile.adoc | 9 +- doc/modules/cassandra/pages/cql/dml.adoc | 13 +-- pylib/cqlshlib/cql3handling.py | 1 + src/antlr/Lexer.g | 1 + src/antlr/Parser.g | 7 +- .../cql3/selection/ResultSetBuilder.java | 25 ++++++ .../cassandra/cql3/selection/Selectable.java | 55 +++++++++--- .../cassandra/cql3/selection/Selection.java | 22 ++++- 
.../cassandra/cql3/selection/Selector.java    | 24 +++-
 .../cql3/selection/SelectorFactories.java     | 25 +++++-
 .../selection/WritetimeOrTTLSelector.java     | 53 +++++++-----
 .../cql3/statements/SelectStatement.java      | 16 ++--
 .../selection/SelectorSerializationTest.java  |  5 +-
 .../validation/entities/TimestampTest.java    | 83 +++++++++++++++++++
 18 files changed, 283 insertions(+), 67 deletions(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index 0fc3cd431c54..76bd58e977ac 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,4 +1,5 @@
 4.2
+ * Add new CQL function maxWritetime (CASSANDRA-17425)
  * Add guardrail for ALTER TABLE ADD / DROP / REMOVE column operations (CASSANDRA-17495)
  * Rename DisableFlag class to EnableFlag on guardrails (CASSANDRA-17544)
 Merged from 4.1:

diff --git a/NEWS.txt b/NEWS.txt
index d42d03afe6d2..4d9b9f70b326 100644
--- a/NEWS.txt
+++ b/NEWS.txt
@@ -57,6 +57,9 @@ using the provided 'sstableupgrade' tool.
 
 New features
 ------------
+ - Added a new CQL function, maxwritetime. It shows the largest unix timestamp at which the data was written, similar
+   to its sibling CQL function, writetime. Unlike writetime, maxwritetime can be applied to multi-cell data types,
+   e.g. non-frozen collections and UDTs, and returns the largest timestamp. One should not use it while upgrading to 4.2.
 - New Guardrails added:
   - Whether ALTER TABLE commands are allowed to mutate columns

diff --git a/doc/cql3/CQL.textile b/doc/cql3/CQL.textile
index 8bedf19a7801..5fef1a9a2745 100644
--- a/doc/cql3/CQL.textile
+++ b/doc/cql3/CQL.textile
@@ -1083,6 +1083,7 @@ bc(syntax)..
 <selector> ::= <identifier>
              | <term>
              | WRITETIME '(' <identifier> ')'
+             | MAXWRITETIME '(' <identifier> ')'
              | COUNT '(' '*' ')'
              | TTL '(' <identifier> ')'
              | CAST '(' <selector> AS <type> ')'
@@ -1131,7 +1132,7 @@ h4(#selectSelection). @<selection>@
 
 The @<selection>@ determines which columns needs to be queried and returned in the result-set. It consists of either the comma-separated list of <selector> or the wildcard character (@*@) to select all the columns defined for the table. Please note that for wildcard @SELECT@ queries the order of columns returned is not specified and is not guaranteed to be stable between Cassandra versions.
 
-A @<selector>@ is either a column name to retrieve or a @<function>@ of one or more @<term>@s. The function allowed are the same as for @<term>@ and are described in the "function section":#functions. In addition to these generic functions, the @WRITETIME@ (resp. @TTL@) function allows to select the timestamp of when the column was inserted (resp. the time to live (in seconds) for the column (or null if the column has no expiration set)) and the "@CAST@":#castFun function can be used to convert one data type to another.
+A @<selector>@ is either a column name to retrieve or a @<function>@ of one or more @<term>@s. The function allowed are the same as for @<term>@ and are described in the "function section":#functions. In addition to these generic functions, the @WRITETIME@ and @MAXWRITETIME@ (resp. @TTL@) function allows to select the timestamp of when the column was inserted (resp. the time to live (in seconds) for the column (or null if the column has no expiration set)) and the "@CAST@":#castFun function can be used to convert one data type to another.
 
 Additionally, individual values of maps and sets can be selected using @[ <term> ]@. For maps, this will return the value corresponding to the key, if such entry exists. For sets, this will return the key that is selected if it exists and is thus mainly a way to check element existence. It is also possible to select a slice of a set or map with @[ <term> ... <term> @], where both bound can be omitted.
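To make the new function's behaviour concrete, a sketch in the style of the project's
CQLTester-based unit tests (the patch's real coverage lives in TimestampTest.java, which
this excerpt only lists in the diffstat; the table layout and timestamps below are
invented for illustration):

    createTable("CREATE TABLE %s (k int PRIMARY KEY, v int, s set<int>)");
    execute("INSERT INTO %s (k, v, s) VALUES (0, 0, {1, 2}) USING TIMESTAMP 1000");
    // a later write leaves the set with cells at timestamps 1000 and 2000
    execute("UPDATE %s USING TIMESTAMP 2000 SET s = s + {3} WHERE k = 0");

    // on a single-cell column, maxwritetime equals writetime
    assertRows(execute("SELECT writetime(v), maxwritetime(v) FROM %s WHERE k = 0"),
               row(1000L, 1000L));

    // writetime(s) is rejected for the multi-cell set, but maxwritetime
    // returns the largest cell timestamp it contains
    assertRows(execute("SELECT maxwritetime(s) FROM %s WHERE k = 0"), row(2000L));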
@@ -2052,7 +2053,7 @@ A number of functions are provided to "convert" the native types into binary dat h2(#aggregates). Aggregates Aggregate functions work on a set of rows. They receive values for each row and returns one value for the whole set. -If @normal@ columns, @scalar functions@, @UDT@ fields, @writetime@ or @ttl@ are selected together with aggregate functions, the values returned for them will be the ones of the first row matching the query. +If @normal@ columns, @scalar functions@, @UDT@ fields, @writetime@, @maxwritetime@ or @ttl@ are selected together with aggregate functions, the values returned for them will be the ones of the first row matching the query. CQL3 distinguishes between built-in aggregates (so called 'native aggregates') and "user-defined aggregates":#udas. CQL3 includes several native aggregates, described below: @@ -2433,6 +2434,7 @@ CQL distinguishes between _reserved_ and _non-reserved_ keywords. Reserved keywo | @WHERE@ | yes | | @WITH@ | yes | | @WRITETIME@ | no | +| @MAXWRITETIME@ | no | h2(#appendixB). Appendix B: CQL Reserved Types diff --git a/doc/modules/cassandra/pages/cql/appendices.adoc b/doc/modules/cassandra/pages/cql/appendices.adoc index 7e17266a3f7e..544afc009f15 100644 --- a/doc/modules/cassandra/pages/cql/appendices.adoc +++ b/doc/modules/cassandra/pages/cql/appendices.adoc @@ -139,6 +139,7 @@ or not. |`WHERE` |yes |`WITH` |yes |`WRITETIME` |no +|`MAXWRITETIME` |no |=== == Appendix B: CQL Reserved Types diff --git a/doc/modules/cassandra/pages/cql/cql_singlefile.adoc b/doc/modules/cassandra/pages/cql/cql_singlefile.adoc index 4fe8c10a65b5..207856a6d689 100644 --- a/doc/modules/cassandra/pages/cql/cql_singlefile.adoc +++ b/doc/modules/cassandra/pages/cql/cql_singlefile.adoc @@ -1645,6 +1645,7 @@ FROM + ::= + | + | WRITETIME `(' `)' + +| MAXWRITETIME `(' `)' + | COUNT `(' `*' `)' + | TTL `(' `)' + | CAST `(' AS `)' + @@ -1706,8 +1707,8 @@ be stable between Cassandra versions. A `` is either a column name to retrieve or a `` of one or more ``s. The function allowed are the same as for `` and are described in the link:#functions[function section]. In addition -to these generic functions, the `WRITETIME` (resp. `TTL`) function -allows to select the timestamp of when the column was inserted (resp. +to these generic functions, the `WRITETIME` and `MAXWRITETIME` (resp. `TTL`) +function allows to select the timestamp of when the column was inserted (resp. the time to live (in seconds) for the column (or null if the column has no expiration set)) and the link:#castFun[`CAST`] function can be used to convert one data type to another. @@ -3148,8 +3149,8 @@ is `0x0000000000000003` and `blobAsBigint(0x0000000000000003)` is `3`. Aggregate functions work on a set of rows. They receive values for each row and returns one value for the whole set. + -If `normal` columns, `scalar functions`, `UDT` fields, `writetime` or -`ttl` are selected together with aggregate functions, the values +If `normal` columns, `scalar functions`, `UDT` fields, `writetime`, `maxwritetime` +or `ttl` are selected together with aggregate functions, the values returned for them will be the ones of the first row matching the query. CQL3 distinguishes between built-in aggregates (so called `native diff --git a/doc/modules/cassandra/pages/cql/dml.adoc b/doc/modules/cassandra/pages/cql/dml.adoc index 8a4df2fecb30..7989d4447a3d 100644 --- a/doc/modules/cassandra/pages/cql/dml.adoc +++ b/doc/modules/cassandra/pages/cql/dml.adoc @@ -75,14 +75,17 @@ You must use the orignal column name instead. 
====
 
 [[writetime-and-ttl-function]]
-==== `WRITETIME` and `TTL` function
+==== `WRITETIME`, `MAXWRITETIME` and `TTL` functions
 
-Selection supports two special functions that aren't allowed anywhere
-else: `WRITETIME` and `TTL`.
-Both functions take only one argument, a column name.
+Selection supports three special functions that aren't allowed anywhere
+else: `WRITETIME`, `MAXWRITETIME` and `TTL`.
+All functions take only one argument, a column name.
 These functions retrieve meta-information that is stored internally for each column:
 
-* `WRITETIME` stores the timestamp of the value of the column
+* `WRITETIME` stores the timestamp of the value of the column. Note that this function cannot be applied to non-frozen collections
+and UDTs.
+* `MAXWRITETIME` stores the largest timestamp of the value of the column. For non-collection and non-UDT columns, `MAXWRITETIME`
+is equivalent to `WRITETIME`. In other cases, it returns the largest timestamp of the values in the column.
 * `TTL` stores the remaining time to live (in seconds) for the value of the column if it is set to expire; otherwise the value is `null`.
 
 [[where-clause]]

diff --git a/pylib/cqlshlib/cql3handling.py b/pylib/cqlshlib/cql3handling.py
index 7de95cf24c80..7e123bd67a89 100644
--- a/pylib/cqlshlib/cql3handling.py
+++ b/pylib/cqlshlib/cql3handling.py
@@ -731,6 +731,7 @@ def working_on_keyspace(ctxt):
 <selector> ::= [colname]=<cident> ( "[" ( <term> ( ".." <term> "]" )? | <term> ".." ) )?
             | <term>
             | "WRITETIME" "(" [colname]=<cident> ")"
+            | "MAXWRITETIME" "(" [colname]=<cident> ")"
             | "TTL" "(" [colname]=<cident> ")"
             | "COUNT" "(" star=( "*" | "1" ) ")"
             | "CAST" "(" <selector> "AS" <wholenativetype> ")"

diff --git a/src/antlr/Lexer.g b/src/antlr/Lexer.g
index 34c7e2ed2fdf..84dd0361f111 100644
--- a/src/antlr/Lexer.g
+++ b/src/antlr/Lexer.g
@@ -178,6 +178,7 @@ K_VARINT:      V A R I N T;
 K_TIMEUUID:    T I M E U U I D;
 K_TOKEN:       T O K E N;
 K_WRITETIME:   W R I T E T I M E;
+K_MAXWRITETIME:M A X W R I T E T I M E;
 
 K_DATE:        D A T E;
 K_TIME:        T I M E;

diff --git a/src/antlr/Parser.g b/src/antlr/Parser.g
index d061ee4df35a..2643e0a6b567 100644
--- a/src/antlr/Parser.g
+++ b/src/antlr/Parser.g
@@ -415,8 +415,9 @@ simpleUnaliasedSelector returns [Selectable.Raw s]
 
 selectionFunction returns [Selectable.Raw s]
     : K_COUNT '(' '\*' ')'                      { $s = Selectable.WithFunction.Raw.newCountRowsFunction(); }
-    | K_WRITETIME '(' c=sident ')'              { $s = new Selectable.WritetimeOrTTL.Raw(c, true); }
-    | K_TTL '(' c=sident ')'                    { $s = new Selectable.WritetimeOrTTL.Raw(c, false); }
+    | K_MAXWRITETIME '(' c=sident ')'           { $s = new Selectable.WritetimeOrTTL.Raw(c, Selectable.WritetimeOrTTL.Kind.MAX_WRITE_TIME); }
+    | K_WRITETIME '(' c=sident ')'              { $s = new Selectable.WritetimeOrTTL.Raw(c, Selectable.WritetimeOrTTL.Kind.WRITE_TIME); }
+    | K_TTL '(' c=sident ')'                    { $s = new Selectable.WritetimeOrTTL.Raw(c, Selectable.WritetimeOrTTL.Kind.TTL); }
     | K_CAST '(' sn=unaliasedSelector K_AS t=native_type ')' {$s = new Selectable.WithCast.Raw(sn, t);}
     | f=functionName args=selectionFunctionArgs { $s = new Selectable.WithFunction.Raw(f, args); }
     ;
@@ -1870,7 +1871,7 @@ non_type_ident returns [ColumnIdentifier id]
 
 unreserved_keyword returns [String str]
     : u=unreserved_function_keyword           { $str = u; }
-    | k=(K_TTL | K_COUNT | K_WRITETIME | K_KEY | K_CAST | K_JSON | K_DISTINCT) { $str = $k.text; }
+    | k=(K_TTL | K_COUNT | K_WRITETIME | K_MAXWRITETIME | K_KEY | K_CAST | K_JSON | K_DISTINCT) { $str = $k.text; }
     ;
 
 unreserved_function_keyword returns [String str]

diff --git a/src/java/org/apache/cassandra/cql3/selection/ResultSetBuilder.java
b/src/java/org/apache/cassandra/cql3/selection/ResultSetBuilder.java index 22566b26d792..3e652dfeb4ec 100644 --- a/src/java/org/apache/cassandra/cql3/selection/ResultSetBuilder.java +++ b/src/java/org/apache/cassandra/cql3/selection/ResultSetBuilder.java @@ -19,7 +19,10 @@ import java.nio.ByteBuffer; import java.util.ArrayList; +import java.util.Iterator; import java.util.List; +import java.util.function.BiFunction; +import java.util.function.Function; import org.apache.cassandra.cql3.ResultSet; import org.apache.cassandra.cql3.ResultSet.ResultMetadata; @@ -28,6 +31,7 @@ import org.apache.cassandra.db.DecoratedKey; import org.apache.cassandra.db.aggregation.GroupMaker; import org.apache.cassandra.db.rows.Cell; +import org.apache.cassandra.db.rows.ComplexColumnData; public final class ResultSetBuilder { @@ -98,6 +102,27 @@ public void add(ByteBuffer v) inputRow.add(v); } + public void add(ComplexColumnData complexColumnData, Function>, ByteBuffer> serializer) + { + if (complexColumnData == null) + { + inputRow.add(null); + return; + } + + long timestamp = -1L; + if (selectors.collectMaxTimestamps()) + { + Iterator> cells = complexColumnData.iterator(); + while (cells.hasNext()) + { + timestamp = Math.max(timestamp, cells.next().timestamp()); + } + } + + inputRow.add(serializer.apply(complexColumnData.iterator()), timestamp, -1); + } + public void add(Cell c, int nowInSec) { inputRow.add(c, nowInSec); diff --git a/src/java/org/apache/cassandra/cql3/selection/Selectable.java b/src/java/org/apache/cassandra/cql3/selection/Selectable.java index de5360f52529..afa86b12a09e 100644 --- a/src/java/org/apache/cassandra/cql3/selection/Selectable.java +++ b/src/java/org/apache/cassandra/cql3/selection/Selectable.java @@ -222,19 +222,46 @@ public Selectable prepare(TableMetadata table) public static class WritetimeOrTTL implements Selectable { + // The order of the variants in the Kind enum matters as they are used in ser/deser + public enum Kind + { + TTL("ttl", Int32Type.instance), + WRITE_TIME("writetime", LongType.instance), + MAX_WRITE_TIME("maxwritetime", LongType.instance); // maxwritetime is available after Cassandra 4.1 (exclusive) + + public final String name; + public final AbstractType returnType; + + public static Kind fromOrdinal(int ordinal) + { + return values()[ordinal]; + } + + Kind(String name, AbstractType returnType) + { + this.name = name; + this.returnType = returnType; + } + + public boolean allowedForMultiCell() + { + return this == MAX_WRITE_TIME; + } + } + public final ColumnMetadata column; - public final boolean isWritetime; + public final Kind kind; - public WritetimeOrTTL(ColumnMetadata column, boolean isWritetime) + public WritetimeOrTTL(ColumnMetadata column, Kind kind) { this.column = column; - this.isWritetime = isWritetime; + this.kind = kind; } @Override public String toString() { - return (isWritetime ? "writetime" : "ttl") + "(" + column.name + ")"; + return kind.name + "(" + column.name + ")"; } public Selector.Factory newSelectorFactory(TableMetadata table, @@ -245,18 +272,20 @@ public Selector.Factory newSelectorFactory(TableMetadata table, if (column.isPrimaryKeyColumn()) throw new InvalidRequestException( String.format("Cannot use selection function %s on PRIMARY KEY part %s", - isWritetime ? 
"writeTime" : "ttl", + kind.name, column.name)); - if (column.type.isCollection()) + + // only maxwritetime is allowed for collection + if (column.type.isCollection() && !kind.allowedForMultiCell()) throw new InvalidRequestException(String.format("Cannot use selection function %s on collections", - isWritetime ? "writeTime" : "ttl")); + kind.name)); - return WritetimeOrTTLSelector.newFactory(column, addAndGetIndex(column, defs), isWritetime); + return WritetimeOrTTLSelector.newFactory(column, addAndGetIndex(column, defs), kind); } public AbstractType getExactTypeIfKnown(String keyspace) { - return isWritetime ? LongType.instance : Int32Type.instance; + return kind.returnType; } @Override @@ -268,18 +297,18 @@ public boolean selectColumns(Predicate predicate) public static class Raw implements Selectable.Raw { private final Selectable.RawIdentifier id; - private final boolean isWritetime; + private final Kind kind; - public Raw(Selectable.RawIdentifier id, boolean isWritetime) + public Raw(Selectable.RawIdentifier id, Kind kind) { this.id = id; - this.isWritetime = isWritetime; + this.kind = kind; } @Override public WritetimeOrTTL prepare(TableMetadata table) { - return new WritetimeOrTTL(id.prepare(table), isWritetime); + return new WritetimeOrTTL(id.prepare(table), kind); } } } diff --git a/src/java/org/apache/cassandra/cql3/selection/Selection.java b/src/java/org/apache/cassandra/cql3/selection/Selection.java index f07184a59986..2f41192c373e 100644 --- a/src/java/org/apache/cassandra/cql3/selection/Selection.java +++ b/src/java/org/apache/cassandra/cql3/selection/Selection.java @@ -376,6 +376,12 @@ public static interface Selectors */ public boolean collectTimestamps(); + /** + * Checks if one of the selectors collects maxTimestamps. + * @return {@code true} if one of the selectors collect maxTimestamps, {@code false} otherwise. + */ + public boolean collectMaxTimestamps(); + /** * Adds the current row of the specified ResultSetBuilder. 
* @@ -506,6 +512,11 @@ public boolean collectTimestamps() return false; } + @Override + public boolean collectMaxTimestamps() { + return false; + } + @Override public ColumnFilter getColumnFilter() { @@ -521,6 +532,7 @@ private static class SelectionWithProcessing extends Selection { private final SelectorFactories factories; private final boolean collectTimestamps; + private final boolean collectMaxTimestamps; private final boolean collectTTLs; public SelectionWithProcessing(TableMetadata table, @@ -541,7 +553,8 @@ public SelectionWithProcessing(TableMetadata table, this.factories = factories; this.collectTimestamps = factories.containsWritetimeSelectorFactory(); - this.collectTTLs = factories.containsTTLSelectorFactory();; + this.collectMaxTimestamps = factories.containsMaxWritetimeSelectorFactory(); + this.collectTTLs = factories.containsTTLSelectorFactory(); for (ColumnMetadata orderingColumn : orderingColumns) { @@ -619,7 +632,12 @@ public boolean collectTTLs() @Override public boolean collectTimestamps() { - return collectTimestamps; + return collectTimestamps || collectMaxTimestamps; + } + + @Override + public boolean collectMaxTimestamps() { + return collectMaxTimestamps; } @Override diff --git a/src/java/org/apache/cassandra/cql3/selection/Selector.java b/src/java/org/apache/cassandra/cql3/selection/Selector.java index 463382d584c7..8226c2d5d26a 100644 --- a/src/java/org/apache/cassandra/cql3/selection/Selector.java +++ b/src/java/org/apache/cassandra/cql3/selection/Selector.java @@ -20,8 +20,10 @@ import java.io.IOException; import java.nio.ByteBuffer; import java.util.Arrays; +import java.util.Iterator; import java.util.List; +import org.apache.cassandra.db.rows.ComplexColumnData; import org.apache.cassandra.schema.CQLTypeParser; import org.apache.cassandra.schema.KeyspaceMetadata; import org.apache.cassandra.schema.Schema; @@ -71,7 +73,7 @@ protected final AbstractType readType(KeyspaceMetadata keyspace, DataInputPlu /** * The Selector kinds. */ - public static enum Kind + public enum Kind { SIMPLE_SELECTOR(SimpleSelector.deserializer), TERM_SELECTOR(TermSelector.deserializer), @@ -151,6 +153,17 @@ public boolean isWritetimeSelectorFactory() return false; } + /** + * Checks if this factory creates maxwritetime selector instances. + * + * @return true if this factory creates maxwritetime selectors instances, + * false otherwise + */ + public boolean isMaxWritetimeSelectorFactory() + { + return false; + } + /** * Checks if this factory creates TTL selectors instances. * @@ -321,14 +334,19 @@ public InputRow(int size, boolean collectTimestamps, boolean collectTTLs) } public void add(ByteBuffer v) + { + add(v, Long.MIN_VALUE, -1); + } + + public void add(ByteBuffer v, long timestamp, int ttl) { values[index] = v; if (timestamps != null) - timestamps[index] = Long.MIN_VALUE; + timestamps[index] = timestamp; if (ttls != null) - ttls[index] = -1; + ttls[index] = ttl; index++; } diff --git a/src/java/org/apache/cassandra/cql3/selection/SelectorFactories.java b/src/java/org/apache/cassandra/cql3/selection/SelectorFactories.java index 7f4bcb301244..1b275a8c9048 100644 --- a/src/java/org/apache/cassandra/cql3/selection/SelectorFactories.java +++ b/src/java/org/apache/cassandra/cql3/selection/SelectorFactories.java @@ -32,22 +32,27 @@ import org.apache.cassandra.exceptions.InvalidRequestException; /** - * A set of Selector factories. + * A set of {@code Selector} factories. */ final class SelectorFactories implements Iterable { /** - * The Selector factories. 
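The add(ByteBuffer, long, int) overload above preserves the old single-argument behaviour through sentinel values: Long.MIN_VALUE stands for "no timestamp" and -1 for "no TTL", and WritetimeOrTTLSelector (later in this patch) maps those sentinels to nulls in the result set. A compact sketch of the sentinel handling, using plain Java only; the class and method names are made up for illustration.

    final class SentinelSketch
    {
        static final long NO_TIMESTAMP = Long.MIN_VALUE; // InputRow's default timestamp
        static final int NO_TTL = -1;                    // InputRow's default ttl

        // Mirrors WritetimeOrTTLSelector#addInput: sentinel timestamps become nulls.
        static Long writetimeOrNull(long ts)
        {
            return ts != NO_TIMESTAMP ? ts : null;
        }

        // TTLs are only meaningful when positive, as in the selector's ttl > 0 check.
        static Integer ttlOrNull(int ttl)
        {
            return ttl > 0 ? ttl : null;
        }

        public static void main(String[] args)
        {
            System.out.println(writetimeOrNull(NO_TIMESTAMP)); // null
            System.out.println(writetimeOrNull(42L));          // 42
            System.out.println(ttlOrNull(NO_TTL));             // null
            System.out.println(ttlOrNull(3600));               // 3600
        }
    }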
+ * The {@code Selector} factories. */ private final List factories; /** - * true if one of the factory creates writetime selectors. + * {@code true} if one of the factories creates writetime selectors. */ private boolean containsWritetimeFactory; /** - * true if one of the factory creates TTL selectors. + * {@code true} if one of the factories creates maxWritetime selectors. + */ + private boolean containsMaxWritetimeFactory; + + /** + * {@code true} if one of the factories creates TTL selectors. */ private boolean containsTTLFactory; @@ -96,6 +101,7 @@ private SelectorFactories(List selectables, Factory factory = selectable.newSelectorFactory(table, expectedType, defs, boundNames); containsWritetimeFactory |= factory.isWritetimeSelectorFactory(); containsTTLFactory |= factory.isTTLSelectorFactory(); + containsMaxWritetimeFactory |= factory.isMaxWritetimeSelectorFactory(); if (factory.isAggregateSelectorFactory()) ++numberOfAggregateFactories; factories.add(factory); @@ -165,6 +171,17 @@ public boolean containsWritetimeSelectorFactory() return containsWritetimeFactory; } + /** + * Checks if this {@code SelectorFactories} contains at least one factory for maxWritetime selectors. + * + * @return {@code true} if this {@link SelectorFactories} contains at least one factory for maxWritetime + * selectors, {@code false} otherwise. + */ + public boolean containsMaxWritetimeSelectorFactory() + { + return containsMaxWritetimeFactory; + } + /** * Checks if this SelectorFactories contains at least one factory for TTL selectors. * diff --git a/src/java/org/apache/cassandra/cql3/selection/WritetimeOrTTLSelector.java b/src/java/org/apache/cassandra/cql3/selection/WritetimeOrTTLSelector.java index 2c56f5ced2ed..29ebfbbdf6ad 100644 --- a/src/java/org/apache/cassandra/cql3/selection/WritetimeOrTTLSelector.java +++ b/src/java/org/apache/cassandra/cql3/selection/WritetimeOrTTLSelector.java @@ -29,8 +29,6 @@ import org.apache.cassandra.db.TypeSizes; import org.apache.cassandra.db.filter.ColumnFilter; import org.apache.cassandra.db.marshal.AbstractType; -import org.apache.cassandra.db.marshal.Int32Type; -import org.apache.cassandra.db.marshal.LongType; import org.apache.cassandra.io.util.DataInputPlus; import org.apache.cassandra.io.util.DataOutputPlus; import org.apache.cassandra.transport.ProtocolVersion; @@ -45,29 +43,30 @@ protected Selector deserialize(DataInputPlus in, int version, TableMetadata meta ByteBuffer columnName = ByteBufferUtil.readWithVIntLength(in); ColumnMetadata column = metadata.getColumn(columnName); int idx = in.readInt(); - boolean isWritetime = in.readBoolean(); - return new WritetimeOrTTLSelector(column, idx, isWritetime); + int ordinal = in.readByte(); + Selectable.WritetimeOrTTL.Kind k = Selectable.WritetimeOrTTL.Kind.fromOrdinal(ordinal); + return new WritetimeOrTTLSelector(column, idx, k); } }; private final ColumnMetadata column; private final int idx; - private final boolean isWritetime; + private final Selectable.WritetimeOrTTL.Kind kind; private ByteBuffer current; private boolean isSet; - public static Factory newFactory(final ColumnMetadata def, final int idx, final boolean isWritetime) + public static Factory newFactory(final ColumnMetadata def, final int idx, final Selectable.WritetimeOrTTL.Kind kind) { return new Factory() { protected String getColumnName() { - return String.format("%s(%s)", isWritetime ? 
"writetime" : "ttl", def.name.toString()); + return String.format("%s(%s)", kind.name, def.name.toString()); } protected AbstractType getReturnType() { - return isWritetime ? LongType.instance : Int32Type.instance; + return kind.returnType; } protected void addColumnMapping(SelectionColumnMapping mapping, ColumnSpecification resultsColumn) @@ -77,17 +76,25 @@ protected void addColumnMapping(SelectionColumnMapping mapping, ColumnSpecificat public Selector newInstance(QueryOptions options) { - return new WritetimeOrTTLSelector(def, idx, isWritetime); + return new WritetimeOrTTLSelector(def, idx, kind); } + @Override public boolean isWritetimeSelectorFactory() { - return isWritetime; + return kind != Selectable.WritetimeOrTTL.Kind.TTL; } + @Override public boolean isTTLSelectorFactory() { - return !isWritetime; + return kind == Selectable.WritetimeOrTTL.Kind.TTL; + } + + @Override + public boolean isMaxWritetimeSelectorFactory() + { + return kind == Selectable.WritetimeOrTTL.Kind.MAX_WRITE_TIME; } public boolean areAllFetchedColumnsKnown() @@ -114,15 +121,15 @@ public void addInput(ProtocolVersion protocolVersion, InputRow input) isSet = true; - if (isWritetime) + if (kind == Selectable.WritetimeOrTTL.Kind.TTL) { - long ts = input.getTimestamp(idx); - current = ts != Long.MIN_VALUE ? ByteBufferUtil.bytes(ts) : null; + int ttl = input.getTtl(idx); + current = ttl > 0 ? ByteBufferUtil.bytes(ttl) : null; } else { - int ttl = input.getTtl(idx); - current = ttl > 0 ? ByteBufferUtil.bytes(ttl) : null; + long ts = input.getTimestamp(idx); + current = ts != Long.MIN_VALUE ? ByteBufferUtil.bytes(ts) : null; } } @@ -139,7 +146,7 @@ public void reset() public AbstractType getType() { - return isWritetime ? LongType.instance : Int32Type.instance; + return kind.returnType; } @Override @@ -148,12 +155,12 @@ public String toString() return column.name.toString(); } - private WritetimeOrTTLSelector(ColumnMetadata column, int idx, boolean isWritetime) + private WritetimeOrTTLSelector(ColumnMetadata column, int idx, Selectable.WritetimeOrTTL.Kind kind) { super(Kind.WRITETIME_OR_TTL_SELECTOR); this.column = column; this.idx = idx; - this.isWritetime = isWritetime; + this.kind = kind; } @Override @@ -169,13 +176,13 @@ public boolean equals(Object o) return Objects.equal(column, s.column) && Objects.equal(idx, s.idx) - && Objects.equal(isWritetime, s.isWritetime); + && kind == s.kind; } @Override public int hashCode() { - return Objects.hashCode(column, idx, isWritetime); + return Objects.hashCode(column, idx, kind); } @Override @@ -183,7 +190,7 @@ protected int serializedSize(int version) { return ByteBufferUtil.serializedSizeWithVIntLength(column.name.bytes) + TypeSizes.sizeof(idx) - + TypeSizes.sizeof(isWritetime); + + TypeSizes.sizeofUnsignedVInt(kind.ordinal()); } @Override @@ -191,6 +198,6 @@ protected void serialize(DataOutputPlus out, int version) throws IOException { ByteBufferUtil.writeWithVIntLength(column.name.bytes, out); out.writeInt(idx); - out.writeBoolean(isWritetime); + out.writeByte(kind.ordinal()); } } diff --git a/src/java/org/apache/cassandra/cql3/statements/SelectStatement.java b/src/java/org/apache/cassandra/cql3/statements/SelectStatement.java index 030b4cd785ea..0d43313e532f 100644 --- a/src/java/org/apache/cassandra/cql3/statements/SelectStatement.java +++ b/src/java/org/apache/cassandra/cql3/statements/SelectStatement.java @@ -1039,12 +1039,16 @@ private static void addValue(ResultSetBuilder result, ColumnMetadata def, Row ro { assert def.type.isMultiCell(); ComplexColumnData 
complexData = row.getComplexColumnData(def); - if (complexData == null) - result.add(null); - else if (def.type.isCollection()) - result.add(((CollectionType) def.type).serializeForNativeProtocol(complexData.iterator(), protocolVersion)); - else - result.add(((UserType) def.type).serializeForNativeProtocol(complexData.iterator(), protocolVersion)); + result.add(complexData, iterator -> { + if (def.type.isCollection()) + { + return ((CollectionType) def.type).serializeForNativeProtocol(iterator, protocolVersion); + } + else + { + return ((UserType) def.type).serializeForNativeProtocol(iterator, protocolVersion); + } + }); } else { diff --git a/test/unit/org/apache/cassandra/cql3/selection/SelectorSerializationTest.java b/test/unit/org/apache/cassandra/cql3/selection/SelectorSerializationTest.java index ee4dd356e1ed..4eadb95d6d26 100644 --- a/test/unit/org/apache/cassandra/cql3/selection/SelectorSerializationTest.java +++ b/test/unit/org/apache/cassandra/cql3/selection/SelectorSerializationTest.java @@ -60,8 +60,9 @@ public void testSerDes() throws IOException checkSerialization(table.getColumn(new ColumnIdentifier("c1", false)), table); // Test WritetimeOrTTLSelector serialization - checkSerialization(new Selectable.WritetimeOrTTL(table.getColumn(new ColumnIdentifier("v", false)), true), table); - checkSerialization(new Selectable.WritetimeOrTTL(table.getColumn(new ColumnIdentifier("v", false)), false), table); + checkSerialization(new Selectable.WritetimeOrTTL(table.getColumn(new ColumnIdentifier("v", false)), Selectable.WritetimeOrTTL.Kind.WRITE_TIME), table); + checkSerialization(new Selectable.WritetimeOrTTL(table.getColumn(new ColumnIdentifier("v", false)), Selectable.WritetimeOrTTL.Kind.TTL), table); + checkSerialization(new Selectable.WritetimeOrTTL(table.getColumn(new ColumnIdentifier("v", false)), Selectable.WritetimeOrTTL.Kind.MAX_WRITE_TIME), table); // Test ListSelector serialization checkSerialization(new Selectable.WithList(asList(table.getColumn(new ColumnIdentifier("v", false)), diff --git a/test/unit/org/apache/cassandra/cql3/validation/entities/TimestampTest.java b/test/unit/org/apache/cassandra/cql3/validation/entities/TimestampTest.java index 13090a6b6271..7c6cd8f9eafa 100644 --- a/test/unit/org/apache/cassandra/cql3/validation/entities/TimestampTest.java +++ b/test/unit/org/apache/cassandra/cql3/validation/entities/TimestampTest.java @@ -17,11 +17,16 @@ */ package org.apache.cassandra.cql3.validation.entities; +import java.util.Arrays; +import java.util.List; + import org.junit.Test; import org.junit.Assert; import org.apache.cassandra.cql3.CQLTester; +import org.apache.cassandra.utils.Pair; +import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertNull; import static org.junit.Assert.assertTrue; @@ -97,6 +102,84 @@ public void testTimestampTTL() throws Throwable row(1, null, null)); } + private void setupSchemaForMaxTimestamp() + { + String myType = createType("CREATE TYPE %s (a int, b int)"); + createTable("CREATE TABLE %s (k int PRIMARY KEY, a text, " + + "l list, fl frozen>," + + "s set, fs frozen>," + + "m map, fm frozen>," + + "t " + myType + ", ft frozen<" + myType + ">)"); + } + + @Test + public void testCallMaxTimestampOnEmptyCollectionReturnsNull() throws Throwable + { + setupSchemaForMaxTimestamp(); + + execute("INSERT INTO %s (k) VALUES (1)"); + Object[][] res = getRows(execute("SELECT maxwritetime(a), maxwritetime(l), maxwritetime(fl)," + + "maxwritetime(s), maxwritetime(fs), maxwritetime(m), maxwritetime(fm)," + + "maxwritetime(t), 
maxwritetime(ft) FROM %s WHERE k=1")); + + assertEquals(1, res.length); + for (Object v : res[0]) + { + assertNull("All the multi-cell data are empty (we did not insert), calling maxwritetime should return null", + v); + } + } + + @Test + public void testMaxTimestamp() throws Throwable + { + setupSchemaForMaxTimestamp(); + + execute("INSERT INTO %s (k, a, l, fl, s, fs, m, fm, t, ft) VALUES " + + "(1, 'test', [1], [2], {1}, {2}, {1 : 'a'}, {2 : 'b'}, {a : 1, b : 1 }, {a : 2, b : 2}) USING TIMESTAMP 1"); + + // enumerate through all multi-cell types and make sure maxwritetime reflects the expected result + testMaxTimestampWithColumnUpdate(Arrays.asList( + Pair.create(1, "UPDATE %s USING TIMESTAMP 10 SET l = l + [10] WHERE k = 1"), + Pair.create(3, "UPDATE %s USING TIMESTAMP 11 SET s = s + {10} WHERE k = 1"), + Pair.create(5, "UPDATE %s USING TIMESTAMP 12 SET m = m + {10 : 'c'} WHERE k = 1"), + Pair.create(7, "UPDATE %s USING TIMESTAMP 13 SET t.a = 10 WHERE k = 1") + )); + } + + private void testMaxTimestampWithColumnUpdate(List> updateStatements) throws Throwable + { + for (Pair update : updateStatements) + { + int fieldPos = update.left(); + String statement = update.right(); + + // run the update statement and update the timestamp of the column + execute(statement); + + Object[][] res = getRows(execute("SELECT maxwritetime(a), maxwritetime(l), maxwritetime(fl)," + + "maxwritetime(s), maxwritetime(fs), maxwritetime(m), maxwritetime(fm)," + + "maxwritetime(t), maxwritetime(ft) FROM %s WHERE k=1")); + Assert.assertEquals(1, res.length); + Assert.assertEquals("maxwritetime should work on both single cell and complex columns", + 9, res[0].length); + for (Object ts : res[0]) + { + assertTrue(ts instanceof Long); // all the result fields are timestamps + } + + long updatedTs = (long) res[0][fieldPos]; // maxwritetime the updated column + + for (int i = 0; i < res[0].length; i++) + { + long ts = (long) res[0][i]; + if (i != fieldPos) + assertTrue("The updated column should have a large maxwritetime since it is updated later", + ts < updatedTs); + } + } + } + /** * Migrated from cql_tests.py:TestCQL.invalid_custom_timestamp_test() */ From ed3901823a5fe9f8838d8b592a1b7703b12e810b Mon Sep 17 00:00:00 2001 From: Jyothsna Konisa Date: Tue, 24 May 2022 10:21:16 -0700 Subject: [PATCH 006/159] Adding support for TLS client authentication for internode communication patch by Jyothsna Konisa; reviewed by Bernardo Botella, Francisco Guerrero, Jon Meredith, Maulin Vasavada, Yifan Cai for CASSANDRA-17513 --- CHANGES.txt | 1 + conf/cassandra.yaml | 6 + .../cassandra/config/EncryptionOptions.java | 112 ++++++++---- .../net/OutboundConnectionSettings.java | 8 +- .../security/AbstractSslContextFactory.java | 11 +- .../security/DisableSslContextFactory.java | 12 ++ .../security/FileBasedSslContextFactory.java | 125 +++++++++----- .../security/ISslContextFactory.java | 10 ++ .../security/PEMBasedSslContextFactory.java | 160 ++++++++++++------ .../conf/cassandra_ssl_test_outbound.keystore | Bin 0 -> 2286 bytes .../config/EncryptionOptionsTest.java | 2 +- .../DefaultSslContextFactoryTest.java | 86 ++++++++-- .../PEMBasedSslContextFactoryTest.java | 45 +++-- .../cassandra/security/SSLFactoryTest.java | 98 ++++++++++- 14 files changed, 513 insertions(+), 163 deletions(-) create mode 100644 test/conf/cassandra_ssl_test_outbound.keystore diff --git a/CHANGES.txt b/CHANGES.txt index f0041514bb3a..0f64ce2612c1 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -1,4 +1,5 @@ 4.2 + * Adding support for TLS client authentication for 
internode communication (CASSANDRA-17513) * Add new CQL function maxWritetime (CASSANDRA-17425) * Add guardrail for ALTER TABLE ADD / DROP / REMOVE column operations (CASSANDRA-17495) * Rename DisableFlag class to EnableFlag on guardrails (CASSANDRA-17544) diff --git a/conf/cassandra.yaml b/conf/cassandra.yaml index d11e5e64bd5b..409110a02430 100644 --- a/conf/cassandra.yaml +++ b/conf/cassandra.yaml @@ -1316,6 +1316,12 @@ server_encryption_options: # Set to a valid keystore if internode_encryption is dc, rack or all keystore: conf/.keystore keystore_password: cassandra + # For internode mTLS authentication, inbound connections (acting as servers) build their SSLContext from + # keystore and keystore_password, which hold the server certificate, while outbound connections (acting as + # clients) build theirs from outbound_keystore and outbound_keystore_password, which hold the client certificate. + # By default, outbound_keystore is the same as keystore, which means mTLS is not enabled. +# outbound_keystore: conf/.keystore +# outbound_keystore_password: cassandra # Verify peer server certificates require_client_auth: false # Set to a valid trustore if require_client_auth is true diff --git a/src/java/org/apache/cassandra/config/EncryptionOptions.java b/src/java/org/apache/cassandra/config/EncryptionOptions.java index eb6724f96d87..0ab653f08885 100644 --- a/src/java/org/apache/cassandra/config/EncryptionOptions.java +++ b/src/java/org/apache/cassandra/config/EncryptionOptions.java @@ -25,20 +25,14 @@ import java.util.Objects; import java.util.Set; -import javax.net.ssl.KeyManagerFactory; -import javax.net.ssl.SSLException; -import javax.net.ssl.TrustManagerFactory; - import com.google.common.annotations.VisibleForTesting; import com.google.common.collect.ImmutableList; import org.slf4j.Logger; import org.slf4j.LoggerFactory; -import com.fasterxml.jackson.annotation.JsonIgnore; import org.apache.cassandra.locator.IEndpointSnitch; import org.apache.cassandra.locator.InetAddressAndPort; -import org.apache.cassandra.security.AbstractSslContextFactory; import org.apache.cassandra.security.DisableSslContextFactory; import org.apache.cassandra.security.ISslContextFactory; import org.apache.cassandra.utils.FBUtilities; @@ -111,6 +105,8 @@ public enum ConfigKey { KEYSTORE("keystore"), KEYSTORE_PASSWORD("keystore_password"), + OUTBOUND_KEYSTORE("outbound_keystore"), + OUTBOUND_KEYSTORE_PASSWORD("outbound_keystore_password"), TRUSTSTORE("truststore"), TRUSTSTORE_PASSWORD("truststore_password"), CIPHER_SUITES("cipher_suites"), @@ -263,11 +259,8 @@ private void prepareSslContextFactoryParameterizedKeys(Map sslCon } } - private void initializeSslContextFactory() + protected void fillSslContextParams(Map sslContextFactoryParameters) { - Map sslContextFactoryParameters = new HashMap<>(); - prepareSslContextFactoryParameterizedKeys(sslContextFactoryParameters); - /* * Copy all configs to the Map to pass it on to the ISslContextFactory's implementation */ @@ -284,6 +277,13 @@ private void initializeSslContextFactory() putSslContextFactoryParameter(sslContextFactoryParameters, ConfigKey.REQUIRE_ENDPOINT_VERIFICATION, this.require_endpoint_verification); putSslContextFactoryParameter(sslContextFactoryParameters, ConfigKey.ENABLED, this.enabled); putSslContextFactoryParameter(sslContextFactoryParameters, ConfigKey.OPTIONAL, this.optional); + } + + private void initializeSslContextFactory() + { + Map sslContextFactoryParameters = new HashMap<>(); + prepareSslContextFactoryParameterizedKeys(sslContextFactoryParameters); 
fillSslContextParams(sslContextFactoryParameters); if (CassandraRelevantProperties.TEST_JVM_DTEST_DISABLE_SSL.getBoolean()) { @@ -296,8 +296,7 @@ private void initializeSslContextFactory() } } - private void putSslContextFactoryParameter(Map existingParameters, ConfigKey configKey, - Object value) + protected static void putSslContextFactoryParameter(Map existingParameters, ConfigKey configKey, Object value) { if (value != null) { existingParameters.put(configKey.getKeyName(), value); @@ -608,15 +607,20 @@ public enum InternodeEncryption public final InternodeEncryption internode_encryption; @Replaces(oldName = "enable_legacy_ssl_storage_port", deprecated = true) public final boolean legacy_ssl_storage_port_enabled; + public final String outbound_keystore; + public final String outbound_keystore_password; public ServerEncryptionOptions() { this.internode_encryption = InternodeEncryption.none; this.legacy_ssl_storage_port_enabled = false; + this.outbound_keystore = null; + this.outbound_keystore_password = null; } public ServerEncryptionOptions(ParameterizedClass sslContextFactoryClass, String keystore, - String keystore_password, String truststore, String truststore_password, + String keystore_password,String outbound_keystore, + String outbound_keystore_password, String truststore, String truststore_password, List cipher_suites, String protocol, List accepted_protocols, String algorithm, String store_type, boolean require_client_auth, boolean require_endpoint_verification, Boolean optional, @@ -627,6 +631,8 @@ public ServerEncryptionOptions(ParameterizedClass sslContextFactoryClass, String null, optional); this.internode_encryption = internode_encryption; this.legacy_ssl_storage_port_enabled = legacy_ssl_storage_port_enabled; + this.outbound_keystore = outbound_keystore; + this.outbound_keystore_password = outbound_keystore_password; } public ServerEncryptionOptions(ServerEncryptionOptions options) @@ -634,6 +640,16 @@ public ServerEncryptionOptions(ServerEncryptionOptions options) super(options); this.internode_encryption = options.internode_encryption; this.legacy_ssl_storage_port_enabled = options.legacy_ssl_storage_port_enabled; + this.outbound_keystore = options.outbound_keystore; + this.outbound_keystore_password = options.outbound_keystore_password; + } + + @Override + protected void fillSslContextParams(Map sslContextFactoryParameters) + { + super.fillSslContextParams(sslContextFactoryParameters); + putSslContextFactoryParameter(sslContextFactoryParameters, ConfigKey.OUTBOUND_KEYSTORE, this.outbound_keystore); + putSslContextFactoryParameter(sslContextFactoryParameters, ConfigKey.OUTBOUND_KEYSTORE_PASSWORD, this.outbound_keystore_password); } @Override @@ -697,7 +713,6 @@ public boolean shouldEncrypt(InetAddressAndPort endpoint) * values of "dc" and "all". This method returns the explicit, raw value of {@link #optional} * as set by the user (if set at all). 
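Taken together, the new fields and the fillSslContextParams() override let operators give outbound internode connections a client certificate of their own while inbound connections keep the server certificate. A usage sketch built only from the with* methods this patch adds; the paths and passwords are made-up placeholders, not recommended values.

    import org.apache.cassandra.config.EncryptionOptions.ServerEncryptionOptions;
    import org.apache.cassandra.config.EncryptionOptions.ServerEncryptionOptions.InternodeEncryption;

    final class MtlsOptionsSketch
    {
        static ServerEncryptionOptions build()
        {
            // Inbound (server-side) identity comes from the keystore; outbound
            // (client-side) identity comes from the outbound keystore.
            return new ServerEncryptionOptions()
                   .withKeyStore("conf/server.keystore")
                   .withKeyStorePassword("server-secret")
                   .withOutboundKeystore("conf/client.keystore")
                   .withOutboundKeystorePassword("client-secret")
                   .withTrustStore("conf/.truststore")
                   .withTrustStorePassword("trust-secret")
                   .withInternodeEncryption(InternodeEncryption.all);
        }
    }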
*/ - @JsonIgnore public boolean isExplicitlyOptional() { return optional != null && optional; @@ -705,7 +720,8 @@ public boolean isExplicitlyOptional() public ServerEncryptionOptions withSslContextFactory(ParameterizedClass sslContextFactoryClass) { - return new ServerEncryptionOptions(sslContextFactoryClass, keystore, keystore_password, truststore, + return new ServerEncryptionOptions(sslContextFactoryClass, keystore, keystore_password, + outbound_keystore, outbound_keystore_password, truststore, truststore_password, cipher_suites, protocol, accepted_protocols, algorithm, store_type, require_client_auth, require_endpoint_verification, optional, internode_encryption, @@ -714,7 +730,8 @@ public ServerEncryptionOptions withSslContextFactory(ParameterizedClass sslConte public ServerEncryptionOptions withKeyStore(String keystore) { - return new ServerEncryptionOptions(ssl_context_factory, keystore, keystore_password, truststore, + return new ServerEncryptionOptions(ssl_context_factory, keystore, keystore_password, + outbound_keystore, outbound_keystore_password, truststore, truststore_password, cipher_suites, protocol, accepted_protocols, algorithm, store_type, require_client_auth, require_endpoint_verification, optional, internode_encryption, @@ -723,7 +740,8 @@ public ServerEncryptionOptions withKeyStore(String keystore) public ServerEncryptionOptions withKeyStorePassword(String keystore_password) { - return new ServerEncryptionOptions(ssl_context_factory, keystore, keystore_password, truststore, + return new ServerEncryptionOptions(ssl_context_factory, keystore, keystore_password, + outbound_keystore, outbound_keystore_password, truststore, truststore_password, cipher_suites, protocol, accepted_protocols, algorithm, store_type, require_client_auth, require_endpoint_verification, optional, internode_encryption, @@ -732,7 +750,8 @@ public ServerEncryptionOptions withKeyStorePassword(String keystore_password) public ServerEncryptionOptions withTrustStore(String truststore) { - return new ServerEncryptionOptions(ssl_context_factory, keystore, keystore_password, truststore, + return new ServerEncryptionOptions(ssl_context_factory, keystore, keystore_password, + outbound_keystore, outbound_keystore_password, truststore, truststore_password, cipher_suites, protocol, accepted_protocols, algorithm, store_type, require_client_auth, require_endpoint_verification, optional, internode_encryption, @@ -741,7 +760,8 @@ public ServerEncryptionOptions withTrustStore(String truststore) public ServerEncryptionOptions withTrustStorePassword(String truststore_password) { - return new ServerEncryptionOptions(ssl_context_factory, keystore, keystore_password, truststore, + return new ServerEncryptionOptions(ssl_context_factory, keystore, keystore_password, + outbound_keystore, outbound_keystore_password, truststore, truststore_password, cipher_suites, protocol, accepted_protocols, algorithm, store_type, require_client_auth, require_endpoint_verification, optional, internode_encryption, @@ -750,16 +770,18 @@ public ServerEncryptionOptions withTrustStorePassword(String truststore_password public ServerEncryptionOptions withCipherSuites(List cipher_suites) { - return new ServerEncryptionOptions(ssl_context_factory, keystore, keystore_password, truststore, + return new ServerEncryptionOptions(ssl_context_factory, keystore, keystore_password, + outbound_keystore, outbound_keystore_password, truststore, truststore_password, cipher_suites, protocol, accepted_protocols, algorithm, store_type, require_client_auth, 
require_endpoint_verification, optional, internode_encryption, legacy_ssl_storage_port_enabled).applyConfigInternal(); } - public ServerEncryptionOptions withCipherSuites(String ... cipher_suites) + public ServerEncryptionOptions withCipherSuites(String... cipher_suites) { - return new ServerEncryptionOptions(ssl_context_factory, keystore, keystore_password, truststore, + return new ServerEncryptionOptions(ssl_context_factory, keystore, keystore_password, + outbound_keystore, outbound_keystore_password, truststore, truststore_password, Arrays.asList(cipher_suites), protocol, accepted_protocols, algorithm, store_type, require_client_auth, require_endpoint_verification, optional, internode_encryption, @@ -768,7 +790,8 @@ public ServerEncryptionOptions withCipherSuites(String ... cipher_suites) public ServerEncryptionOptions withProtocol(String protocol) { - return new ServerEncryptionOptions(ssl_context_factory, keystore, keystore_password, truststore, + return new ServerEncryptionOptions(ssl_context_factory, keystore, keystore_password, + outbound_keystore, outbound_keystore_password, truststore, truststore_password, cipher_suites, protocol, accepted_protocols, algorithm, store_type, require_client_auth, require_endpoint_verification, optional, internode_encryption, @@ -777,7 +800,8 @@ public ServerEncryptionOptions withProtocol(String protocol) public ServerEncryptionOptions withAcceptedProtocols(List accepted_protocols) { - return new ServerEncryptionOptions(ssl_context_factory, keystore, keystore_password, truststore, + return new ServerEncryptionOptions(ssl_context_factory, keystore, keystore_password, + outbound_keystore, outbound_keystore_password, truststore, truststore_password, cipher_suites, protocol, accepted_protocols, algorithm, store_type, require_client_auth, require_endpoint_verification, optional, internode_encryption, @@ -786,7 +810,8 @@ public ServerEncryptionOptions withAcceptedProtocols(List accepted_proto public ServerEncryptionOptions withAlgorithm(String algorithm) { - return new ServerEncryptionOptions(ssl_context_factory, keystore, keystore_password, truststore, + return new ServerEncryptionOptions(ssl_context_factory, keystore, keystore_password, + outbound_keystore, outbound_keystore_password, truststore, truststore_password, cipher_suites, protocol, accepted_protocols, algorithm, store_type, require_client_auth, require_endpoint_verification, optional, internode_encryption, @@ -795,7 +820,8 @@ public ServerEncryptionOptions withAlgorithm(String algorithm) public ServerEncryptionOptions withStoreType(String store_type) { - return new ServerEncryptionOptions(ssl_context_factory, keystore, keystore_password, truststore, + return new ServerEncryptionOptions(ssl_context_factory, keystore, keystore_password, + outbound_keystore, outbound_keystore_password, truststore, truststore_password, cipher_suites, protocol, accepted_protocols, algorithm, store_type, require_client_auth, require_endpoint_verification, optional, internode_encryption, @@ -804,7 +830,8 @@ public ServerEncryptionOptions withStoreType(String store_type) public ServerEncryptionOptions withRequireClientAuth(boolean require_client_auth) { - return new ServerEncryptionOptions(ssl_context_factory, keystore, keystore_password, truststore, + return new ServerEncryptionOptions(ssl_context_factory, keystore, keystore_password, + outbound_keystore, outbound_keystore_password, truststore, truststore_password, cipher_suites, protocol, accepted_protocols, algorithm, store_type, require_client_auth, 
require_endpoint_verification, optional, internode_encryption, @@ -813,7 +840,8 @@ public ServerEncryptionOptions withRequireClientAuth(boolean require_client_auth public ServerEncryptionOptions withRequireEndpointVerification(boolean require_endpoint_verification) { - return new ServerEncryptionOptions(ssl_context_factory, keystore, keystore_password, truststore, + return new ServerEncryptionOptions(ssl_context_factory, keystore, keystore_password, + outbound_keystore, outbound_keystore_password, truststore, truststore_password, cipher_suites, protocol, accepted_protocols, algorithm, store_type, require_client_auth, require_endpoint_verification, optional, internode_encryption, @@ -822,7 +850,8 @@ public ServerEncryptionOptions withRequireEndpointVerification(boolean require_e public ServerEncryptionOptions withOptional(boolean optional) { - return new ServerEncryptionOptions(ssl_context_factory, keystore, keystore_password, truststore, + return new ServerEncryptionOptions(ssl_context_factory, keystore, keystore_password, + outbound_keystore, outbound_keystore_password, truststore, truststore_password, cipher_suites, protocol, accepted_protocols, algorithm, store_type, require_client_auth, require_endpoint_verification, optional, internode_encryption, @@ -831,7 +860,8 @@ public ServerEncryptionOptions withOptional(boolean optional) public ServerEncryptionOptions withInternodeEncryption(InternodeEncryption internode_encryption) { - return new ServerEncryptionOptions(ssl_context_factory, keystore, keystore_password, truststore, + return new ServerEncryptionOptions(ssl_context_factory, keystore, keystore_password, + outbound_keystore, outbound_keystore_password, truststore, truststore_password, cipher_suites, protocol, accepted_protocols, algorithm, store_type, require_client_auth, require_endpoint_verification, optional, internode_encryption, @@ -840,12 +870,32 @@ public ServerEncryptionOptions withInternodeEncryption(InternodeEncryption inter public ServerEncryptionOptions withLegacySslStoragePort(boolean enable_legacy_ssl_storage_port) { - return new ServerEncryptionOptions(ssl_context_factory, keystore, keystore_password, truststore, + return new ServerEncryptionOptions(ssl_context_factory, keystore, keystore_password, + outbound_keystore, outbound_keystore_password, truststore, truststore_password, cipher_suites, protocol, accepted_protocols, algorithm, store_type, require_client_auth, require_endpoint_verification, optional, internode_encryption, enable_legacy_ssl_storage_port).applyConfigInternal(); } + public ServerEncryptionOptions withOutboundKeystore(String outboundKeystore) + { + return new ServerEncryptionOptions(ssl_context_factory, keystore, keystore_password, + outboundKeystore, outbound_keystore_password, truststore, + truststore_password, cipher_suites, protocol, accepted_protocols, + algorithm, store_type, require_client_auth, + require_endpoint_verification, optional, internode_encryption, + legacy_ssl_storage_port_enabled).applyConfigInternal(); + } + + public ServerEncryptionOptions withOutboundKeystorePassword(String outboundKeystorePassword) + { + return new ServerEncryptionOptions(ssl_context_factory, keystore, keystore_password, + outbound_keystore, outboundKeystorePassword, truststore, + truststore_password, cipher_suites, protocol, accepted_protocols, + algorithm, store_type, require_client_auth, + require_endpoint_verification, optional, internode_encryption, + legacy_ssl_storage_port_enabled).applyConfigInternal(); + } } } diff --git 
a/src/java/org/apache/cassandra/net/OutboundConnectionSettings.java b/src/java/org/apache/cassandra/net/OutboundConnectionSettings.java index 599e7178b5a8..db2873d93461 100644 --- a/src/java/org/apache/cassandra/net/OutboundConnectionSettings.java +++ b/src/java/org/apache/cassandra/net/OutboundConnectionSettings.java @@ -82,7 +82,7 @@ public static Framing forId(int id) public final IInternodeAuthenticator authenticator; public final InetAddressAndPort to; public final InetAddressAndPort connectTo; // may be represented by a different IP address on this node's local network - public final EncryptionOptions encryption; + public final ServerEncryptionOptions encryption; public final Framing framing; public final Integer socketSendBufferSizeInBytes; public final Integer applicationSendQueueCapacityInBytes; @@ -112,7 +112,7 @@ public OutboundConnectionSettings(InetAddressAndPort to, InetAddressAndPort pref private OutboundConnectionSettings(IInternodeAuthenticator authenticator, InetAddressAndPort to, InetAddressAndPort connectTo, - EncryptionOptions encryption, + ServerEncryptionOptions encryption, Framing framing, Integer socketSendBufferSizeInBytes, Integer applicationSendQueueCapacityInBytes, @@ -365,7 +365,7 @@ public OutboundDebugCallbacks debug() return debug != null ? debug : OutboundDebugCallbacks.NONE; } - public EncryptionOptions encryption() + public ServerEncryptionOptions encryption() { return encryption != null ? encryption : defaultEncryptionOptions(to); } @@ -499,7 +499,7 @@ private static boolean isInLocalDC(IEndpointSnitch snitch, InetAddressAndPort lo } @VisibleForTesting - static EncryptionOptions defaultEncryptionOptions(InetAddressAndPort endpoint) + static ServerEncryptionOptions defaultEncryptionOptions(InetAddressAndPort endpoint) { ServerEncryptionOptions options = DatabaseDescriptor.getInternodeMessagingEncyptionOptions(); return options.shouldEncrypt(endpoint) ? options : null; diff --git a/src/java/org/apache/cassandra/security/AbstractSslContextFactory.java b/src/java/org/apache/cassandra/security/AbstractSslContextFactory.java index c2ef851bfc51..e4f868f9b107 100644 --- a/src/java/org/apache/cassandra/security/AbstractSslContextFactory.java +++ b/src/java/org/apache/cassandra/security/AbstractSslContextFactory.java @@ -178,15 +178,16 @@ public SslContext createNettySslContext(boolean verifyPeerCertificate, SocketTyp key file in PEM format (see {@link SslContextBuilder#forServer(File, File, String)}). However, we are not supporting that now to keep the config/yaml API simple. */ - KeyManagerFactory kmf = buildKeyManagerFactory(); SslContextBuilder builder; if (socketType == SocketType.SERVER) { + KeyManagerFactory kmf = buildKeyManagerFactory(); builder = SslContextBuilder.forServer(kmf).clientAuth(this.require_client_auth ? ClientAuth.REQUIRE : ClientAuth.NONE); } else { + KeyManagerFactory kmf = buildOutboundKeyManagerFactory(); builder = SslContextBuilder.forClient().keyManager(kmf); } @@ -263,4 +264,12 @@ protected SslProvider getSslProvider() abstract protected KeyManagerFactory buildKeyManagerFactory() throws SSLException; abstract protected TrustManagerFactory buildTrustManagerFactory() throws SSLException; + + /** + * Creates a {@code KeyManagerFactory} for outbound connections. + * It provides a separate keystore for internode mTLS outbound connections. 
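Because buildOutboundKeyManagerFactory() is a new abstract method, every custom factory derived from AbstractSslContextFactory must now provide it. A minimal sketch of a backwards-compatible implementation that reuses the inbound key material; FactorySketch and DelegatingFactorySketch are hypothetical names, reduced to the two methods relevant here.

    import javax.net.ssl.KeyManagerFactory;
    import javax.net.ssl.SSLException;

    // Local stand-in for AbstractSslContextFactory, stripped to the key-manager hooks.
    abstract class FactorySketch
    {
        protected abstract KeyManagerFactory buildKeyManagerFactory() throws SSLException;
        protected abstract KeyManagerFactory buildOutboundKeyManagerFactory() throws SSLException;
    }

    // A factory that needs no distinct client certificate can keep the pre-patch
    // behaviour by delegating the outbound factory to the inbound one.
    final class DelegatingFactorySketch extends FactorySketch
    {
        @Override
        protected KeyManagerFactory buildKeyManagerFactory() throws SSLException
        {
            // Placeholder: a real factory would load its inbound keystore here.
            throw new UnsupportedOperationException("load inbound keystore here");
        }

        @Override
        protected KeyManagerFactory buildOutboundKeyManagerFactory() throws SSLException
        {
            return buildKeyManagerFactory(); // same identity for inbound and outbound
        }
    }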
+ * @return {@code KeyManagerFactory} + * @throws SSLException + */ + abstract protected KeyManagerFactory buildOutboundKeyManagerFactory() throws SSLException; } diff --git a/src/java/org/apache/cassandra/security/DisableSslContextFactory.java b/src/java/org/apache/cassandra/security/DisableSslContextFactory.java index 9dab062f0be3..8058d0aba6d4 100644 --- a/src/java/org/apache/cassandra/security/DisableSslContextFactory.java +++ b/src/java/org/apache/cassandra/security/DisableSslContextFactory.java @@ -36,12 +36,24 @@ protected TrustManagerFactory buildTrustManagerFactory() throws SSLException throw new UnsupportedOperationException(); } + @Override + protected KeyManagerFactory buildOutboundKeyManagerFactory() throws SSLException + { + throw new UnsupportedOperationException(); + } + @Override public boolean hasKeystore() { return false; } + @Override + public boolean hasOutboundKeystore() + { + return false; + } + @Override public void initHotReloading() throws SSLException { diff --git a/src/java/org/apache/cassandra/security/FileBasedSslContextFactory.java b/src/java/org/apache/cassandra/security/FileBasedSslContextFactory.java index 3d47509fd3bb..5b3ca124ff8c 100644 --- a/src/java/org/apache/cassandra/security/FileBasedSslContextFactory.java +++ b/src/java/org/apache/cassandra/security/FileBasedSslContextFactory.java @@ -33,7 +33,7 @@ import javax.net.ssl.SSLException; import javax.net.ssl.TrustManagerFactory; -import com.google.common.annotations.VisibleForTesting; +import org.apache.commons.lang3.StringUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -47,38 +47,32 @@ * {@code CAUTION:} While this is a useful abstraction, please be careful if you need to modify this class * given possible custom implementations out there! */ -abstract public class FileBasedSslContextFactory extends AbstractSslContextFactory +public abstract class FileBasedSslContextFactory extends AbstractSslContextFactory { private static final Logger logger = LoggerFactory.getLogger(FileBasedSslContextFactory.class); - - @VisibleForTesting - protected volatile boolean checkedExpiry = false; + protected FileBasedStoreContext keystoreContext; + protected FileBasedStoreContext outboundKeystoreContext; + protected FileBasedStoreContext trustStoreContext; /** * List of files that trigger hot reloading of SSL certificates */ protected volatile List hotReloadableFiles = new ArrayList<>(); - protected String keystore; - protected String keystore_password; - protected String truststore; - protected String truststore_password; - public FileBasedSslContextFactory() { - keystore = "conf/.keystore"; - keystore_password = "cassandra"; - truststore = "conf/.truststore"; - truststore_password = "cassandra"; + keystoreContext = new FileBasedStoreContext("conf/.keystore", "cassandra"); + outboundKeystoreContext = new FileBasedStoreContext("conf/.keystore", "cassandra"); + trustStoreContext = new FileBasedStoreContext("conf/.truststore", "cassandra"); } public FileBasedSslContextFactory(Map parameters) { super(parameters); - keystore = getString("keystore"); - keystore_password = getString("keystore_password"); - truststore = getString("truststore"); - truststore_password = getString("truststore_password"); + keystoreContext = new FileBasedStoreContext(getString("keystore"), getString("keystore_password")); + outboundKeystoreContext = new FileBasedStoreContext(StringUtils.defaultString(getString("outbound_keystore"), keystoreContext.filePath), + StringUtils.defaultString(getString("outbound_keystore_password"), 
keystoreContext.password)); + trustStoreContext = new FileBasedStoreContext(getString("truststore"), getString("truststore_password")); } @Override @@ -90,30 +84,41 @@ public boolean shouldReload() @Override public boolean hasKeystore() { - return keystore != null && new File(keystore).exists(); + return keystoreContext.hasKeystore(); + } + + @Override + public boolean hasOutboundKeystore() + { + return outboundKeystoreContext.hasKeystore(); } private boolean hasTruststore() { - return truststore != null && new File(truststore).exists(); + return trustStoreContext.filePath != null && new File(trustStoreContext.filePath).exists(); } @Override public synchronized void initHotReloading() { boolean hasKeystore = hasKeystore(); + boolean hasOutboundKeystore = hasOutboundKeystore(); boolean hasTruststore = hasTruststore(); - if (hasKeystore || hasTruststore) + if (hasKeystore || hasOutboundKeystore || hasTruststore) { List fileList = new ArrayList<>(); if (hasKeystore) { - fileList.add(new HotReloadableFile(keystore)); + fileList.add(new HotReloadableFile(keystoreContext.filePath)); + } + if (hasOutboundKeystore) + { + fileList.add(new HotReloadableFile(outboundKeystoreContext.filePath)); } if (hasTruststore) { - fileList.add(new HotReloadableFile(truststore)); + fileList.add(new HotReloadableFile(trustStoreContext.filePath)); } hotReloadableFiles = fileList; } @@ -129,25 +134,13 @@ public synchronized void initHotReloading() @Override protected KeyManagerFactory buildKeyManagerFactory() throws SSLException { + return getKeyManagerFactory(keystoreContext); + } - try (InputStream ksf = Files.newInputStream(Paths.get(keystore))) - { - final String algorithm = this.algorithm == null ? KeyManagerFactory.getDefaultAlgorithm() : this.algorithm; - KeyManagerFactory kmf = KeyManagerFactory.getInstance(algorithm); - KeyStore ks = KeyStore.getInstance(store_type); - ks.load(ksf, keystore_password.toCharArray()); - if (!checkedExpiry) - { - checkExpiredCerts(ks); - checkedExpiry = true; - } - kmf.init(ks, keystore_password.toCharArray()); - return kmf; - } - catch (Exception e) - { - throw new SSLException("failed to build key manager store for secure connections", e); - } + @Override + protected KeyManagerFactory buildOutboundKeyManagerFactory() throws SSLException + { + return getKeyManagerFactory(outboundKeystoreContext); } /** @@ -159,12 +152,12 @@ protected KeyManagerFactory buildKeyManagerFactory() throws SSLException @Override protected TrustManagerFactory buildTrustManagerFactory() throws SSLException { - try (InputStream tsf = Files.newInputStream(Paths.get(truststore))) + try (InputStream tsf = Files.newInputStream(Paths.get(trustStoreContext.filePath))) { final String algorithm = this.algorithm == null ? TrustManagerFactory.getDefaultAlgorithm() : this.algorithm; TrustManagerFactory tmf = TrustManagerFactory.getInstance(algorithm); KeyStore ts = KeyStore.getInstance(store_type); - ts.load(tsf, truststore_password.toCharArray()); + ts.load(tsf, trustStoreContext.password.toCharArray()); tmf.init(ts); return tmf; } @@ -174,6 +167,29 @@ protected TrustManagerFactory buildTrustManagerFactory() throws SSLException } } + private KeyManagerFactory getKeyManagerFactory(final FileBasedStoreContext context) throws SSLException + { + try (InputStream ksf = Files.newInputStream(Paths.get(context.filePath))) + { + final String algorithm = this.algorithm == null ? 
KeyManagerFactory.getDefaultAlgorithm() : this.algorithm; + KeyManagerFactory kmf = KeyManagerFactory.getInstance(algorithm); + KeyStore ks = KeyStore.getInstance(store_type); + ks.load(ksf, context.password.toCharArray()); + + if (!context.checkedExpiry) + { + checkExpiredCerts(ks); + context.checkedExpiry = true; + } + kmf.init(ks, context.password.toCharArray()); + return kmf; + } + catch (Exception e) + { + throw new SSLException("failed to build key manager store for secure connections", e); + } + } + protected boolean checkExpiredCerts(KeyStore ks) throws KeyStoreException { boolean hasExpiredCerts = false; @@ -225,4 +241,27 @@ public String toString() '}'; } } + + protected static class FileBasedStoreContext + { + public volatile boolean checkedExpiry = false; + public String filePath; + public String password; + + public FileBasedStoreContext(String keystore, String keystorePassword) + { + this.filePath = keystore; + this.password = keystorePassword; + } + + protected boolean hasKeystore() + { + return filePath != null && new File(filePath).exists(); + } + + protected boolean passwordMatchesIfPresent(String keyPassword) + { + return StringUtils.isEmpty(password) || keyPassword.equals(password); + } + } } diff --git a/src/java/org/apache/cassandra/security/ISslContextFactory.java b/src/java/org/apache/cassandra/security/ISslContextFactory.java index 579c95e43a40..11c4717b3c1f 100644 --- a/src/java/org/apache/cassandra/security/ISslContextFactory.java +++ b/src/java/org/apache/cassandra/security/ISslContextFactory.java @@ -99,6 +99,16 @@ default boolean hasKeystore() return true; } + /** + * Returns if this factory uses outbound keystore. + * + * @return {@code true} by default unless the implementation overrides this + */ + default boolean hasOutboundKeystore() + { + return false; + } + /** * Returns the prepared list of accepted protocols. 
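Defaulting hasOutboundKeystore() to false in the interface is what keeps existing ISslContextFactory implementations source-compatible: only factories that actually carry a separate outbound identity opt in, and hot reloading then watches the extra file. A small sketch of the default-method pattern; the interface and class names are invented for illustration.

    interface SslFactorySketch
    {
        boolean hasKeystore();

        // Mirrors the new ISslContextFactory default: implementations written
        // before this patch automatically report "no outbound identity".
        default boolean hasOutboundKeystore()
        {
            return false;
        }
    }

    final class LegacyFactorySketch implements SslFactorySketch
    {
        @Override
        public boolean hasKeystore()
        {
            return true;
        }
        // hasOutboundKeystore() is inherited and stays false, so the factory
        // compiles and behaves as before without any change.

        public static void main(String[] args)
        {
            SslFactorySketch factory = new LegacyFactorySketch();
            System.out.println(factory.hasKeystore());         // true
            System.out.println(factory.hasOutboundKeystore()); // false
        }
    }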
* diff --git a/src/java/org/apache/cassandra/security/PEMBasedSslContextFactory.java b/src/java/org/apache/cassandra/security/PEMBasedSslContextFactory.java index 8ecbec59ef51..3d3ecc21fdd4 100644 --- a/src/java/org/apache/cassandra/security/PEMBasedSslContextFactory.java +++ b/src/java/org/apache/cassandra/security/PEMBasedSslContextFactory.java @@ -90,47 +90,55 @@ public final class PEMBasedSslContextFactory extends FileBasedSslContextFactory { public static final String DEFAULT_TARGET_STORETYPE = "PKCS12"; private static final Logger logger = LoggerFactory.getLogger(PEMBasedSslContextFactory.class); - private String pemEncodedKey; - private String keyPassword; - private String pemEncodedCertificates; - private boolean maybeFileBasedPrivateKey; - private boolean maybeFileBasedTrustedCertificates; + private PEMBasedKeyStoreContext pemEncodedTrustCertificates; + private PEMBasedKeyStoreContext pemEncodedKeyContext; + private PEMBasedKeyStoreContext pemEncodedOutboundKeyContext; public PEMBasedSslContextFactory() { } - public PEMBasedSslContextFactory(Map parameters) + private void validatePasswords() { - super(parameters); - pemEncodedKey = getString(ConfigKey.ENCODED_KEY.getKeyName()); - keyPassword = getString(ConfigKey.KEY_PASSWORD.getKeyName()); - if (StringUtils.isEmpty(keyPassword)) - { - keyPassword = keystore_password; - } - else if (!StringUtils.isEmpty(keystore_password) && !keyPassword.equals(keystore_password)) + boolean shouldThrow = !keystoreContext.passwordMatchesIfPresent(pemEncodedKeyContext.password) + || !outboundKeystoreContext.passwordMatchesIfPresent(pemEncodedOutboundKeyContext.password); + boolean outboundPasswordMismatch = !outboundKeystoreContext.passwordMatchesIfPresent(pemEncodedOutboundKeyContext.password); + String keyName = outboundPasswordMismatch ? "outbound_" : ""; + + if (shouldThrow) { - throw new IllegalArgumentException("'keystore_password' and 'key_password' both configurations are given and the " + - "values do not match"); + final String msg = String.format("'%skeystore_password' and '%skey_password' both configurations are given and the values do not match", keyName, keyName); + throw new IllegalArgumentException(msg); } else { - logger.warn("'keystore_password' and 'key_password' both are configured but since the values match it's " + - "okay. Ideally you should only specify one of them."); + logger.warn("'{}keystore_password' and '{}key_password' both are configured but since the values match it's " + + "okay. 
Ideally you should only specify one of them.", keyName, keyName); } + } + + public PEMBasedSslContextFactory(Map parameters) + { + super(parameters); + final String pemEncodedKey = getString(ConfigKey.ENCODED_KEY.getKeyName()); + final String pemEncodedKeyPassword = StringUtils.defaultString(getString(ConfigKey.KEY_PASSWORD.getKeyName()), keystoreContext.password); + pemEncodedKeyContext = new PEMBasedKeyStoreContext(pemEncodedKey, pemEncodedKeyPassword, StringUtils.isEmpty(pemEncodedKey), keystoreContext); + + final String pemEncodedOutboundKey = StringUtils.defaultString(getString(ConfigKey.OUTBOUND_ENCODED_KEY.getKeyName()), pemEncodedKey); + final String outboundKeyPassword = StringUtils.defaultString(StringUtils.defaultString(getString(ConfigKey.OUTBOUND_ENCODED_KEY_PASSWORD.getKeyName()), + outboundKeystoreContext.password), pemEncodedKeyPassword); + pemEncodedOutboundKeyContext = new PEMBasedKeyStoreContext(pemEncodedOutboundKey, outboundKeyPassword, StringUtils.isEmpty(pemEncodedOutboundKey), outboundKeystoreContext); + + validatePasswords(); - if (!StringUtils.isEmpty(truststore_password)) + if (!StringUtils.isEmpty(trustStoreContext.password)) { logger.warn("PEM based truststore should not be using password. Ignoring the given value in " + "'truststore_password' configuration."); } - pemEncodedCertificates = getString(ConfigKey.ENCODED_CERTIFICATES.getKeyName()); - - maybeFileBasedPrivateKey = StringUtils.isEmpty(pemEncodedKey); - maybeFileBasedTrustedCertificates = StringUtils.isEmpty(pemEncodedCertificates); - + final String pemEncodedCerts = getString(ConfigKey.ENCODED_CERTIFICATES.getKeyName()); + pemEncodedTrustCertificates = new PEMBasedKeyStoreContext(pemEncodedCerts, null, StringUtils.isEmpty(pemEncodedCerts), trustStoreContext); enforceSinglePrivateKeySource(); enforceSingleTurstedCertificatesSource(); } @@ -143,18 +151,22 @@ else if (!StringUtils.isEmpty(keystore_password) && !keyPassword.equals(keystore @Override public boolean hasKeystore() { - return maybeFileBasedPrivateKey ? keystoreFileExists() : - !StringUtils.isEmpty(pemEncodedKey); + return pemEncodedKeyContext.maybeFilebasedKey + ? keystoreContext.hasKeystore() + : !StringUtils.isEmpty(pemEncodedKeyContext.key); } /** - * Checks if the keystore file exists. + * Decides if this factory has an outbound keystore defined, i.e. key material specified in files or inline in the configuration. * - * @return {@code true} if keystore file exists; {@code false} otherwise + * @return {@code true} if there is an outbound keystore defined; {@code false} otherwise */ - private boolean keystoreFileExists() + @Override + public boolean hasOutboundKeystore() { - return keystore != null && new File(keystore).exists(); + return pemEncodedOutboundKeyContext.maybeFilebasedKey + ? outboundKeystoreContext.hasKeystore() + : !StringUtils.isEmpty(pemEncodedOutboundKeyContext.key); } /** @@ -165,8 +177,8 @@ private boolean keystoreFileExists() */ private boolean hasTruststore() { - return maybeFileBasedTrustedCertificates ? truststoreFileExists() : - !StringUtils.isEmpty(pemEncodedCertificates); + return pemEncodedTrustCertificates.maybeFilebasedKey ? 
truststoreFileExists() : + !StringUtils.isEmpty(pemEncodedTrustCertificates.key); } /** @@ -176,7 +188,7 @@ private boolean hasTruststore() */ private boolean truststoreFileExists() { - return truststore != null && new File(truststore).exists(); + return trustStoreContext.filePath != null && new File(trustStoreContext.filePath).exists(); } /** @@ -186,13 +198,17 @@ private boolean truststoreFileExists() public synchronized void initHotReloading() { List fileList = new ArrayList<>(); - if (maybeFileBasedPrivateKey && hasKeystore()) + if (pemEncodedKeyContext.maybeFilebasedKey && hasKeystore()) + { + fileList.add(new HotReloadableFile(keystoreContext.filePath)); + } + if (pemEncodedOutboundKeyContext.maybeFilebasedKey && hasOutboundKeystore()) { - fileList.add(new HotReloadableFile(keystore)); + fileList.add(new HotReloadableFile(outboundKeystoreContext.filePath)); } - if (maybeFileBasedTrustedCertificates && hasTruststore()) + if (pemEncodedTrustCertificates.maybeFilebasedKey && hasTruststore()) { - fileList.add(new HotReloadableFile(truststore)); + fileList.add(new HotReloadableFile(trustStoreContext.filePath)); } if (!fileList.isEmpty()) { @@ -209,30 +225,41 @@ public synchronized void initHotReloading() */ @Override protected KeyManagerFactory buildKeyManagerFactory() throws SSLException + { + return buildKeyManagerFactory(pemEncodedKeyContext, keystoreContext); + } + + @Override + protected KeyManagerFactory buildOutboundKeyManagerFactory() throws SSLException + { + return buildKeyManagerFactory(pemEncodedOutboundKeyContext, outboundKeystoreContext); + } + + private KeyManagerFactory buildKeyManagerFactory(PEMBasedKeyStoreContext pemBasedKeyStoreContext, FileBasedStoreContext keyStoreContext) throws SSLException { try { - if (hasKeystore()) + if (pemBasedKeyStoreContext.hasKey()) { - if (maybeFileBasedPrivateKey) + if (pemBasedKeyStoreContext.maybeFilebasedKey) { - pemEncodedKey = readPEMFile(keystore); // read PEM from the file + pemBasedKeyStoreContext.key = readPEMFile(keyStoreContext.filePath); // read PEM from the file } KeyManagerFactory kmf = KeyManagerFactory.getInstance( algorithm == null ? KeyManagerFactory.getDefaultAlgorithm() : algorithm); - KeyStore ks = buildKeyStore(); - if (!checkedExpiry) + KeyStore ks = buildKeyStore(pemBasedKeyStoreContext.key, pemBasedKeyStoreContext.password); + if (!keyStoreContext.checkedExpiry) { checkExpiredCerts(ks); - checkedExpiry = true; + keyStoreContext.checkedExpiry = true; } - kmf.init(ks, keyPassword != null ? keyPassword.toCharArray() : null); + kmf.init(ks, pemBasedKeyStoreContext.password != null ? 
pemBasedKeyStoreContext.password.toCharArray() : null); return kmf; } else { - throw new SSLException("Must provide keystore or private_key in configuration for PEMBasedSSlContextFactory"); + throw new SSLException("Must provide outbound_keystore or outbound_private_key in configuration for PEMBasedSSlContextFactory"); } } catch (Exception e) @@ -254,9 +281,9 @@ protected TrustManagerFactory buildTrustManagerFactory() throws SSLException { if (hasTruststore()) { - if (maybeFileBasedTrustedCertificates) + if (pemEncodedTrustCertificates.maybeFilebasedKey) { - pemEncodedCertificates = readPEMFile(truststore); // read PEM from the file + pemEncodedTrustCertificates.key = readPEMFile(trustStoreContext.filePath); // read PEM from the file } TrustManagerFactory tmf = TrustManagerFactory.getInstance( @@ -286,7 +313,7 @@ private String readPEMFile(String file) throws IOException * Builds KeyStore object given the {@link #DEFAULT_TARGET_STORETYPE} out of the PEM formatted private key material. * It uses {@code cassandra-ssl-keystore} as the alias for the created key-entry. */ - private KeyStore buildKeyStore() throws GeneralSecurityException, IOException + private static KeyStore buildKeyStore(final String pemEncodedKey, final String keyPassword) throws GeneralSecurityException, IOException { char[] keyPasswordArray = keyPassword != null ? keyPassword.toCharArray() : null; PrivateKey privateKey = PEMReader.extractPrivateKey(pemEncodedKey, keyPassword); @@ -310,7 +337,7 @@ private KeyStore buildKeyStore() throws GeneralSecurityException, IOException */ private KeyStore buildTrustStore() throws GeneralSecurityException, IOException { - Certificate[] certChainArray = PEMReader.extractCertificates(pemEncodedCertificates); + Certificate[] certChainArray = PEMReader.extractCertificates(pemEncodedTrustCertificates.key); if (certChainArray == null || certChainArray.length == 0) { throw new SSLException("Could not read any certificates from the given PEM"); @@ -331,11 +358,16 @@ private KeyStore buildTrustStore() throws GeneralSecurityException, IOException */ private void enforceSinglePrivateKeySource() { - if (keystoreFileExists() && !StringUtils.isEmpty(pemEncodedKey)) + if (keystoreContext.hasKeystore() && !StringUtils.isEmpty(pemEncodedKeyContext.key)) { throw new IllegalArgumentException("Configuration must specify value for either keystore or private_key, " + "not both for PEMBasedSSlContextFactory"); } + if (outboundKeystoreContext.hasKeystore() && !StringUtils.isEmpty(pemEncodedOutboundKeyContext.key)) + { + throw new IllegalArgumentException("Configuration must specify value for either outbound_keystore or outbound_private_key, " + + "not both for PEMBasedSSlContextFactory"); + } } /** @@ -344,17 +376,43 @@ private void enforceSinglePrivateKeySource() */ private void enforceSingleTurstedCertificatesSource() { - if (truststoreFileExists() && !StringUtils.isEmpty(pemEncodedCertificates)) + if (truststoreFileExists() && !StringUtils.isEmpty(pemEncodedTrustCertificates.key)) { throw new IllegalArgumentException("Configuration must specify value for either truststore or " + "trusted_certificates, not both for PEMBasedSSlContextFactory"); } } + public static class PEMBasedKeyStoreContext + { + public String key; + public final String password; + public final boolean maybeFilebasedKey; + public final FileBasedStoreContext filebasedKeystoreContext; + + public PEMBasedKeyStoreContext(final String encodedKey, final String getEncodedKeyPassword, + final boolean maybeFilebasedKey, final FileBasedStoreContext 
filebasedKeystoreContext) + { + this.key = encodedKey; + this.password = getEncodedKeyPassword; + this.maybeFilebasedKey = maybeFilebasedKey; + this.filebasedKeystoreContext = filebasedKeystoreContext; + } + + public boolean hasKey() + { + return maybeFilebasedKey + ? filebasedKeystoreContext.hasKeystore() + : !StringUtils.isEmpty(key); + } + } + public enum ConfigKey { ENCODED_KEY("private_key"), KEY_PASSWORD("private_key_password"), + OUTBOUND_ENCODED_KEY("outbound_private_key"), + OUTBOUND_ENCODED_KEY_PASSWORD("outbound_private_key_password"), ENCODED_CERTIFICATES("trusted_certificates"); final String keyName; diff --git a/test/conf/cassandra_ssl_test_outbound.keystore b/test/conf/cassandra_ssl_test_outbound.keystore new file mode 100644 index 0000000000000000000000000000000000000000..7dbf466e5abe7036386dead82c9faabc79d2084e GIT binary patch literal 2286 zcmc(g_fyk(6UXyS2m%2TK_Ey8H4us>AVnz=H5^KBg7i?7-U$NIf&$Wyj&x}UCpoHu z2uc-^A_9gMK~IoF;i!gCa>w0_^Eh*V!1Ke-?7nv9GoN|y&U=|_%ry`Q1U?Sn-{5f} z1_cp)T?2`ZK|$V*!EQmp$E}#9=#v8=5EQ_YS&p#WtQ_*J01F@oJ_!I&5DS^*4|Qu_ zyE%#gg5$xPeaTk%2Cj2p(2q@s;+d(gweX2g)Cgiye*qh z%fGaI$*JL5Ii^PEl?6%VfzzMlvirZv>uOHFMXXOf2T=OaiV&)|0UEJKu z0ueF8iNqHTG)Iu5_1DELMjl_a&MUnWmrq|zc~KV3NAQ}~m*{?&yNFb1yZ6|Vw2eIv zz4D;|DjZS4CYXsTo3h^zM)4DP77?RU3=yggTI6KtFcp@@P2q`MKEG#P#mbf%1-J8^TTWzCH_Yc!e=?@c``)KR)--aYuMD*5@0jOp zn@>vX0|bN~O0$iSbA7&X*z#I0zz7Y7?VhUIMdcHLfQQJo#u1LMH_YxClV1)?Z6R*#)5|TA!wHPH=1tQnqJspI{I!Jatz?WDF z3qpx%?SJiJddH_H8+IC_2p(vw1f={G6lL^_jV1Pxa^L$mOkq(encO@nEZAxo=9^aa zFf6=XRX!OnRY30}(bok%`GE49B3?rzgd5w(xy6{q$>sA0O;nvFKJ{`^hNvm1)z+V#b3ubwGP%&E(&PB`TB z(2CeFZ0?yVU%a}`S4m2-_b!_|LbqscH%HOTF~y#Gb9=C_a@Ie6e4EKNkOeI6+$=}m zd}Jx+V7p38F$^8KQKT(pyL@@{E5`pY^W~6PV0JZ3E$)b76Bi~+Mz0{F$MLcuIclq) zU%2#P_iGwsc=xPkKE`tAMQ`C$Zuf0L>2=}RE*nD1A=9nHj-~~TNww*r(J$lSvNN7C zrt7<;`PNSNVRb(!j<;m0(euefNz5WTq=hf72vuChZ{3xGDyCb;6>qx#``N$ zUjLdT)cTAH_B;p#d4MHDaG5AvCH3W94jiorKE&aQNm%>l#Uad<5KyD^8fiYG9dAzP{+3fLI&U(>&qy6ic`&Q=&1z8&UL?SA7r#om@I>MTk#jdu!6d2u@S&B8RFU#X2; zYWpdJ17)+&vFa83)aAV0XQsxM#R%2G2l=Q^^Wmk4ttws6rWJzwkSZPPpdKJ@*|BW( z`IaKN+ig^XoMdfKS7j0#KjFkpwJRJgP}a6iu?1j&tJWP_VzkNr;sI3X{ROU-)8q2f z$nkVVxLNpFPCeee;RPpnXDe(CZO$Z%U9ni_<^EB`<83<@5df9H_^_AJs^9uJ zX0WV{vlZNFll;kGjLtK&FLwI(PwWc|If&3vv+Rnd!>MM8i4QKR*nk0lgW1CT!)Ds5 z`h9v2sYuhU$Rt0}H&nqQr*ker%X*mKpQ&A(=GS&0q*Er0Y@hogG5AW1&$gMl_yvM0WH3}Uj`XCEN!>Z#3rUXs%H8Ahbk^y>sYn5$5x7YG{XPw$#!c)TPwikf zKCB(RU9`eq=Aly%d=>Lr%5Eu&(4RoxDYxhfk#FdRMOCOOcL7cL(O`c()), keystorePath, "dummypass", "dummytruststore", "dummypass", + new HashMap<>()), keystorePath, "dummypass", keystorePath, "dummypass", "dummytruststore", "dummypass", Collections.emptyList(), null, null, null, "JKS", false, false, optional, internodeEncryption, false) .applyConfig(), expected, diff --git a/test/unit/org/apache/cassandra/security/DefaultSslContextFactoryTest.java b/test/unit/org/apache/cassandra/security/DefaultSslContextFactoryTest.java index 13d1faca04ec..3edf9c188e8e 100644 --- a/test/unit/org/apache/cassandra/security/DefaultSslContextFactoryTest.java +++ b/test/unit/org/apache/cassandra/security/DefaultSslContextFactoryTest.java @@ -53,15 +53,23 @@ private void addKeystoreOptions(Map config) config.put("keystore_password", "cassandra"); } + private void addOutboundKeystoreOptions(Map config) + { + config.put("outbound_keystore", "test/conf/cassandra_ssl_test_outbound.keystore"); + config.put("outbound_keystore_password", "cassandra"); + } + @Test public void getSslContextOpenSSL() throws IOException { - EncryptionOptions options = new 
EncryptionOptions().withTrustStore("test/conf/cassandra_ssl_test.truststore") - .withTrustStorePassword("cassandra") - .withKeyStore("test/conf/cassandra_ssl_test.keystore") - .withKeyStorePassword("cassandra") - .withRequireClientAuth(false) - .withCipherSuites("TLS_RSA_WITH_AES_128_CBC_SHA"); + EncryptionOptions.ServerEncryptionOptions options = new EncryptionOptions.ServerEncryptionOptions().withTrustStore("test/conf/cassandra_ssl_test.truststore") + .withTrustStorePassword("cassandra") + .withKeyStore("test/conf/cassandra_ssl_test.keystore") + .withKeyStorePassword("cassandra") + .withOutboundKeystore("test/conf/cassandra_ssl_test_outbound.keystore") + .withOutboundKeystorePassword("cassandra") + .withRequireClientAuth(false) + .withCipherSuites("TLS_RSA_WITH_AES_128_CBC_SHA"); SslContext sslContext = SSLFactory.getOrCreateSslContext(options, true, ISslContextFactory.SocketType.CLIENT); Assert.assertNotNull(sslContext); if (OpenSsl.isAvailable()) @@ -78,7 +86,7 @@ public void buildTrustManagerFactoryWithInvalidTruststoreFile() throws IOExcepti config.put("truststore", "/this/is/probably/not/a/file/on/your/test/machine"); DefaultSslContextFactory defaultSslContextFactoryImpl = new DefaultSslContextFactory(config); - defaultSslContextFactoryImpl.checkedExpiry = false; + defaultSslContextFactoryImpl.keystoreContext.checkedExpiry = false; defaultSslContextFactoryImpl.buildTrustManagerFactory(); } @@ -90,7 +98,7 @@ public void buildTrustManagerFactoryWithBadPassword() throws IOException config.put("truststore_password", "HomeOfBadPasswords"); DefaultSslContextFactory defaultSslContextFactoryImpl = new DefaultSslContextFactory(config); - defaultSslContextFactoryImpl.checkedExpiry = false; + defaultSslContextFactoryImpl.keystoreContext.checkedExpiry = false; defaultSslContextFactoryImpl.buildTrustManagerFactory(); } @@ -101,7 +109,7 @@ public void buildTrustManagerFactoryHappyPath() throws IOException config.putAll(commonConfig); DefaultSslContextFactory defaultSslContextFactoryImpl = new DefaultSslContextFactory(config); - defaultSslContextFactoryImpl.checkedExpiry = false; + defaultSslContextFactoryImpl.keystoreContext.checkedExpiry = false; TrustManagerFactory trustManagerFactory = defaultSslContextFactoryImpl.buildTrustManagerFactory(); Assert.assertNotNull(trustManagerFactory); } @@ -114,7 +122,7 @@ public void buildKeyManagerFactoryWithInvalidKeystoreFile() throws IOException config.put("keystore", "/this/is/probably/not/a/file/on/your/test/machine"); DefaultSslContextFactory defaultSslContextFactoryImpl = new DefaultSslContextFactory(config); - defaultSslContextFactoryImpl.checkedExpiry = false; + defaultSslContextFactoryImpl.keystoreContext.checkedExpiry = false; defaultSslContextFactoryImpl.buildKeyManagerFactory(); } @@ -138,20 +146,70 @@ public void buildKeyManagerFactoryHappyPath() throws IOException DefaultSslContextFactory defaultSslContextFactoryImpl = new DefaultSslContextFactory(config); // Make sure the exiry check didn't happen so far for the private key - Assert.assertFalse(defaultSslContextFactoryImpl.checkedExpiry); + Assert.assertFalse(defaultSslContextFactoryImpl.keystoreContext.checkedExpiry); addKeystoreOptions(config); DefaultSslContextFactory defaultSslContextFactoryImpl2 = new DefaultSslContextFactory(config); // Trigger the private key loading. 
That will also check for expired private key defaultSslContextFactoryImpl2.buildKeyManagerFactory(); // Now we should have checked the private key's expiry - Assert.assertTrue(defaultSslContextFactoryImpl2.checkedExpiry); + Assert.assertTrue(defaultSslContextFactoryImpl2.keystoreContext.checkedExpiry); // Make sure that new factory object preforms the fresh private key expiry check DefaultSslContextFactory defaultSslContextFactoryImpl3 = new DefaultSslContextFactory(config); - Assert.assertFalse(defaultSslContextFactoryImpl3.checkedExpiry); + Assert.assertFalse(defaultSslContextFactoryImpl3.keystoreContext.checkedExpiry); defaultSslContextFactoryImpl3.buildKeyManagerFactory(); - Assert.assertTrue(defaultSslContextFactoryImpl3.checkedExpiry); + Assert.assertTrue(defaultSslContextFactoryImpl3.keystoreContext.checkedExpiry); + } + + @Test(expected = IOException.class) + public void buildOutboundKeyManagerFactoryWithInvalidKeystoreFile() throws IOException + { + Map config = new HashMap<>(); + config.putAll(commonConfig); + config.put("outbound_keystore", "/this/is/probably/not/a/file/on/your/test/machine"); + + DefaultSslContextFactory defaultSslContextFactoryImpl = new DefaultSslContextFactory(config); + defaultSslContextFactoryImpl.outboundKeystoreContext.checkedExpiry = false; + defaultSslContextFactoryImpl.buildOutboundKeyManagerFactory(); + } + + @Test(expected = IOException.class) + public void buildOutboundKeyManagerFactoryWithBadPassword() throws IOException + { + Map config = new HashMap<>(); + config.putAll(commonConfig); + addOutboundKeystoreOptions(config); + config.put("outbound_keystore_password", "HomeOfBadPasswords"); + + DefaultSslContextFactory defaultSslContextFactoryImpl = new DefaultSslContextFactory(config); + defaultSslContextFactoryImpl.buildKeyManagerFactory(); + } + + @Test + public void buildOutboundKeyManagerFactoryHappyPath() throws IOException + { + Map config = new HashMap<>(); + config.putAll(commonConfig); + + DefaultSslContextFactory defaultSslContextFactoryImpl = new DefaultSslContextFactory(config); + // Make sure the exiry check didn't happen so far for the private key + Assert.assertFalse(defaultSslContextFactoryImpl.outboundKeystoreContext.checkedExpiry); + + addOutboundKeystoreOptions(config); + DefaultSslContextFactory defaultSslContextFactoryImpl2 = new DefaultSslContextFactory(config); + // Trigger the private key loading. 
That will also check for expired private key + defaultSslContextFactoryImpl2.buildOutboundKeyManagerFactory(); + // Now we should have checked the private key's expiry + Assert.assertTrue(defaultSslContextFactoryImpl2.outboundKeystoreContext.checkedExpiry); + Assert.assertFalse(defaultSslContextFactoryImpl2.keystoreContext.checkedExpiry); + + // Make sure that new factory object preforms the fresh private key expiry check + DefaultSslContextFactory defaultSslContextFactoryImpl3 = new DefaultSslContextFactory(config); + Assert.assertFalse(defaultSslContextFactoryImpl3.outboundKeystoreContext.checkedExpiry); + defaultSslContextFactoryImpl3.buildOutboundKeyManagerFactory(); + Assert.assertTrue(defaultSslContextFactoryImpl3.outboundKeystoreContext.checkedExpiry); + Assert.assertFalse(defaultSslContextFactoryImpl2.keystoreContext.checkedExpiry); } @Test diff --git a/test/unit/org/apache/cassandra/security/PEMBasedSslContextFactoryTest.java b/test/unit/org/apache/cassandra/security/PEMBasedSslContextFactoryTest.java index 243d300539c8..f919a1994eea 100644 --- a/test/unit/org/apache/cassandra/security/PEMBasedSslContextFactoryTest.java +++ b/test/unit/org/apache/cassandra/security/PEMBasedSslContextFactoryTest.java @@ -216,6 +216,27 @@ public void getSslContextOpenSSL() throws IOException .withRequireClientAuth(false) .withCipherSuites("TLS_RSA_WITH_AES_128_CBC_SHA") .withSslContextFactory(sslContextFactory); + SslContext sslContext = SSLFactory.getOrCreateSslContext(options, true, ISslContextFactory.SocketType.SERVER); + Assert.assertNotNull(sslContext); + if (OpenSsl.isAvailable()) + Assert.assertTrue(sslContext instanceof OpenSslContext); + else + Assert.assertTrue(sslContext instanceof SslContext); + } + + @Test + public void getSslContextOpenSSLOutboundKeystore() throws IOException + { + ParameterizedClass sslContextFactory = new ParameterizedClass(PEMBasedSslContextFactory.class.getSimpleName() + , new HashMap<>()); + EncryptionOptions.ServerEncryptionOptions options = new EncryptionOptions.ServerEncryptionOptions().withTrustStore("test/conf/cassandra_ssl_test.truststore.pem") + .withKeyStore("test/conf/cassandra_ssl_test.keystore.pem") + .withKeyStorePassword("cassandra") + .withOutboundKeystore("test/conf/cassandra_ssl_test.keystore.pem") + .withOutboundKeystorePassword("cassandra") + .withRequireClientAuth(false) + .withCipherSuites("TLS_RSA_WITH_AES_128_CBC_SHA") + .withSslContextFactory(sslContextFactory); SslContext sslContext = SSLFactory.getOrCreateSslContext(options, true, ISslContextFactory.SocketType.CLIENT); Assert.assertNotNull(sslContext); if (OpenSsl.isAvailable()) @@ -233,7 +254,7 @@ public void buildTrustManagerFactoryWithInvalidTruststoreFile() throws IOExcepti config.put("truststore", "/this/is/probably/not/a/file/on/your/test/machine"); DefaultSslContextFactory defaultSslContextFactoryImpl = new DefaultSslContextFactory(config); - defaultSslContextFactoryImpl.checkedExpiry = false; + defaultSslContextFactoryImpl.keystoreContext.checkedExpiry = false; defaultSslContextFactoryImpl.buildTrustManagerFactory(); } @@ -244,7 +265,7 @@ public void buildTrustManagerFactoryHappyPath() throws IOException config.putAll(commonConfig); PEMBasedSslContextFactory sslContextFactory = new PEMBasedSslContextFactory(config); - sslContextFactory.checkedExpiry = false; + sslContextFactory.keystoreContext.checkedExpiry = false; TrustManagerFactory trustManagerFactory = sslContextFactory.buildTrustManagerFactory(); Assert.assertNotNull(trustManagerFactory); } @@ -258,7 +279,7 @@ public void 
buildFileBasedTrustManagerFactoryHappyPath() throws IOException addFileBaseTrustStoreOptions(config); PEMBasedSslContextFactory sslContextFactory = new PEMBasedSslContextFactory(config); - sslContextFactory.checkedExpiry = false; + sslContextFactory.keystoreContext.checkedExpiry = false; TrustManagerFactory trustManagerFactory = sslContextFactory.buildTrustManagerFactory(); Assert.assertNotNull(trustManagerFactory); } @@ -271,7 +292,7 @@ public void buildKeyManagerFactoryWithInvalidKeystoreFile() throws IOException config.put("keystore", "/this/is/probably/not/a/file/on/your/test/machine"); PEMBasedSslContextFactory sslContextFactory = new PEMBasedSslContextFactory(config); - sslContextFactory.checkedExpiry = false; + sslContextFactory.keystoreContext.checkedExpiry = false; sslContextFactory.buildKeyManagerFactory(); } @@ -295,20 +316,20 @@ public void buildKeyManagerFactoryHappyPath() throws IOException PEMBasedSslContextFactory sslContextFactory1 = new PEMBasedSslContextFactory(config); // Make sure the exiry check didn't happen so far for the private key - Assert.assertFalse(sslContextFactory1.checkedExpiry); + Assert.assertFalse(sslContextFactory1.keystoreContext.checkedExpiry); addKeyStoreOptions(config); PEMBasedSslContextFactory sslContextFactory2 = new PEMBasedSslContextFactory(config); // Trigger the private key loading. That will also check for expired private key sslContextFactory2.buildKeyManagerFactory(); // Now we should have checked the private key's expiry - Assert.assertTrue(sslContextFactory2.checkedExpiry); + Assert.assertTrue(sslContextFactory2.keystoreContext.checkedExpiry); // Make sure that new factory object preforms the fresh private key expiry check PEMBasedSslContextFactory sslContextFactory3 = new PEMBasedSslContextFactory(config); - Assert.assertFalse(sslContextFactory3.checkedExpiry); + Assert.assertFalse(sslContextFactory3.keystoreContext.checkedExpiry); sslContextFactory3.buildKeyManagerFactory(); - Assert.assertTrue(sslContextFactory3.checkedExpiry); + Assert.assertTrue(sslContextFactory3.keystoreContext.checkedExpiry); } @Test(expected = IllegalArgumentException.class) @@ -343,20 +364,20 @@ public void buildFileBasedKeyManagerFactoryHappyPath() throws IOException PEMBasedSslContextFactory sslContextFactory1 = new PEMBasedSslContextFactory(config); // Make sure the expiry check didn't happen so far for the private key - Assert.assertFalse(sslContextFactory1.checkedExpiry); + Assert.assertFalse(sslContextFactory1.keystoreContext.checkedExpiry); addFileBaseKeyStoreOptions(config); PEMBasedSslContextFactory sslContextFactory2 = new PEMBasedSslContextFactory(config); // Trigger the private key loading. 
That will also check for expired private key sslContextFactory2.buildKeyManagerFactory(); // Now we should have checked the private key's expiry - Assert.assertTrue(sslContextFactory2.checkedExpiry); + Assert.assertTrue(sslContextFactory2.keystoreContext.checkedExpiry); // Make sure that new factory object preforms the fresh private key expiry check PEMBasedSslContextFactory sslContextFactory3 = new PEMBasedSslContextFactory(config); - Assert.assertFalse(sslContextFactory3.checkedExpiry); + Assert.assertFalse(sslContextFactory3.keystoreContext.checkedExpiry); sslContextFactory3.buildKeyManagerFactory(); - Assert.assertTrue(sslContextFactory3.checkedExpiry); + Assert.assertTrue(sslContextFactory3.keystoreContext.checkedExpiry); } @Test diff --git a/test/unit/org/apache/cassandra/security/SSLFactoryTest.java b/test/unit/org/apache/cassandra/security/SSLFactoryTest.java index e5aa4b10572d..ff3bab9d2625 100644 --- a/test/unit/org/apache/cassandra/security/SSLFactoryTest.java +++ b/test/unit/org/apache/cassandra/security/SSLFactoryTest.java @@ -18,11 +18,19 @@ */ package org.apache.cassandra.security; -import org.apache.cassandra.io.util.File; +import java.io.FileInputStream; import java.io.IOException; +import java.lang.reflect.Field; +import java.lang.reflect.InvocationTargetException; +import java.lang.reflect.Method; +import java.security.KeyStore; +import java.security.KeyStoreException; +import java.security.NoSuchAlgorithmException; +import java.security.cert.Certificate; import java.security.cert.CertificateException; import java.util.HashMap; import java.util.Map; +import javax.net.ssl.X509KeyManager; import org.apache.commons.io.FileUtils; import org.junit.Assert; @@ -31,12 +39,19 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import io.netty.handler.ssl.OpenSslClientContext; +import io.netty.handler.ssl.OpenSslServerContext; +import io.netty.handler.ssl.OpenSslSessionContext; import io.netty.handler.ssl.SslContext; import io.netty.handler.ssl.util.SelfSignedCertificate; import org.apache.cassandra.config.DatabaseDescriptor; import org.apache.cassandra.config.EncryptionOptions; import org.apache.cassandra.config.EncryptionOptions.ServerEncryptionOptions; import org.apache.cassandra.config.ParameterizedClass; +import org.apache.cassandra.io.util.File; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotNull; public class SSLFactoryTest { @@ -65,13 +80,17 @@ public void setup() .withTrustStore("test/conf/cassandra_ssl_test.truststore") .withTrustStorePassword("cassandra") .withRequireClientAuth(false) - .withCipherSuites("TLS_RSA_WITH_AES_128_CBC_SHA"); + .withCipherSuites("TLS_RSA_WITH_AES_128_CBC_SHA") + .withSslContextFactory(new ParameterizedClass(TestFileBasedSSLContextFactory.class.getName(), + new HashMap<>())); } private ServerEncryptionOptions addKeystoreOptions(ServerEncryptionOptions options) { return options.withKeyStore("test/conf/cassandra_ssl_test.keystore") - .withKeyStorePassword("cassandra"); + .withKeyStorePassword("cassandra") + .withOutboundKeystore("test/conf/cassandra_ssl_test_outbound.keystore") + .withOutboundKeystorePassword("cassandra"); } private ServerEncryptionOptions addPEMKeystoreOptions(ServerEncryptionOptions options) @@ -81,6 +100,8 @@ private ServerEncryptionOptions addPEMKeystoreOptions(ServerEncryptionOptions op return options.withSslContextFactory(sslContextFactoryClass) .withKeyStore("test/conf/cassandra_ssl_test.keystore.pem") .withKeyStorePassword("cassandra") + 
.withOutboundKeystore("test/conf/cassandra_ssl_test.keystore.pem") + .withOutboundKeystorePassword("cassandra") .withTrustStore("test/conf/cassandra_ssl_test.truststore.pem"); } @@ -117,7 +138,41 @@ public void testSslContextReload_HappyPath() throws IOException, InterruptedExce } @Test - public void testPEMSslContextReload_HappyPath() throws IOException, InterruptedException + public void testServerSocketShouldUseKeystore() throws IOException, CertificateException, KeyStoreException, NoSuchAlgorithmException, NoSuchFieldException, IllegalAccessException, ClassNotFoundException, NoSuchMethodException, InvocationTargetException + { + ServerEncryptionOptions options = addKeystoreOptions(encryptionOptions) + .withOutboundKeystore("dummyKeystore") + .withOutboundKeystorePassword("dummyPassword"); + + // Server socket type should create a keystore with keystore & keystore password + final OpenSslServerContext context = (OpenSslServerContext) SSLFactory.createNettySslContext(options, true, ISslContextFactory.SocketType.SERVER); + assertNotNull(context); + + // Verify if right certificate is loaded into SslContext + final Certificate loadedCertificate = getCertificateLoadedInSslContext(context.sessionContext()); + final Certificate certificate = getCertificates("test/conf/cassandra_ssl_test.keystore", "cassandra"); + assertEquals(loadedCertificate, certificate); + } + + @Test + public void testClientSocketShouldUseOutboundKeystore() throws IOException, CertificateException, KeyStoreException, NoSuchAlgorithmException, NoSuchFieldException, ClassNotFoundException, InvocationTargetException, IllegalAccessException, NoSuchMethodException + { + ServerEncryptionOptions options = addKeystoreOptions(encryptionOptions) + .withKeyStore("dummyKeystore") + .withKeyStorePassword("dummyPassword"); + + // Client socket type should create a keystore with outbound Keystore & outbound password + final OpenSslClientContext context = (OpenSslClientContext) SSLFactory.createNettySslContext(options, true, ISslContextFactory.SocketType.CLIENT); + assertNotNull(context); + + // Verify if right certificate is loaded into SslContext + final Certificate loadedCertificate = getCertificateLoadedInSslContext(context.sessionContext()); + final Certificate certificate = getCertificates("test/conf/cassandra_ssl_test_outbound.keystore", "cassandra"); + assertEquals(loadedCertificate, certificate); + } + + @Test + public void testPEMSslContextReload_HappyPath() throws IOException { try { @@ -223,8 +278,7 @@ public void testSslFactoryHotReload_CorruptOrNonExistentFile_DoesNotClearExistin @Test public void getSslContext_ParamChanges() throws IOException { - EncryptionOptions options = addKeystoreOptions(encryptionOptions) - .withEnabled(true) + ServerEncryptionOptions options = addKeystoreOptions(encryptionOptions) .withCipherSuites("TLS_ECDHE_ECDSA_WITH_AES_128_GCM_SHA256"); SslContext ctx1 = SSLFactory.getOrCreateSslContext(options, true, @@ -301,4 +355,36 @@ public void testCacheKeyInequalityForCustomSslContextFactory() { Assert.assertNotEquals(cacheKey1, cacheKey2); } + + public static class TestFileBasedSSLContextFactory extends FileBasedSslContextFactory { + public TestFileBasedSSLContextFactory(Map parameters) + { + super(parameters); + } + } + + private static Certificate getCertificates(final String filename, final String password) throws KeyStoreException, IOException, CertificateException, NoSuchAlgorithmException + { + FileInputStream is = new FileInputStream(filename); + KeyStore keystore = 
KeyStore.getInstance(KeyStore.getDefaultType()); + char[] passwd = password.toCharArray(); + keystore.load(is, passwd); + return keystore.getCertificate("cassandra_ssl_test"); + } + + private static Certificate getCertificateLoadedInSslContext(final OpenSslSessionContext session) + throws ClassNotFoundException, InvocationTargetException, IllegalAccessException, NoSuchMethodException, NoSuchFieldException + { + Field providerField = OpenSslSessionContext.class.getDeclaredField("provider"); + providerField.setAccessible(true); + + Class keyMaterialProvider = Class.forName("io.netty.handler.ssl.OpenSslKeyMaterialProvider"); + Object provider = keyMaterialProvider.cast(providerField.get(session)); + + Method keyManager = provider.getClass().getDeclaredMethod("keyManager"); + keyManager.setAccessible(true); + X509KeyManager keyManager1 = (X509KeyManager) keyManager.invoke(provider); + final Certificate[] certificates = keyManager1.getCertificateChain("cassandra_ssl_test"); + return certificates[0]; + } } From 80d94245be9806f1e8ae0ab3c6d4d4d6898383f8 Mon Sep 17 00:00:00 2001 From: Brandon Williams Date: Thu, 31 Mar 2022 09:33:59 -0500 Subject: [PATCH 007/159] set perms on data/logs to 750 in redhat packaging Patch by brandonwilliams; reivewed by bereng for CASSANDRA-17470 --- redhat/cassandra.spec | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/redhat/cassandra.spec b/redhat/cassandra.spec index a3abaa6eceec..8c04fdb4cf5d 100644 --- a/redhat/cassandra.spec +++ b/redhat/cassandra.spec @@ -161,9 +161,9 @@ exit 0 %{_sysconfdir}/security/limits.d/%{username}.conf /usr/share/%{username}* %config(noreplace) /%{_sysconfdir}/%{username} -%attr(755,%{username},%{username}) %config(noreplace) /var/lib/%{username}/* -%attr(755,%{username},%{username}) /var/log/%{username}* -%attr(755,%{username},%{username}) /var/run/%{username}* +%attr(750,%{username},%{username}) %config(noreplace) /var/lib/%{username}/* +%attr(750,%{username},%{username}) /var/log/%{username}* +%attr(750,%{username},%{username}) /var/run/%{username}* %{python_sitelib}/cqlshlib/ %{python_sitelib}/cassandra_pylib*.egg-info From 39d372539f224eb1eaa3b6209610142abe74712c Mon Sep 17 00:00:00 2001 From: Brandon Williams Date: Wed, 30 Mar 2022 14:35:18 -0500 Subject: [PATCH 008/159] set perms on data/logs to 750 in debian packaging Patch by brandonwilliams; reviewed by bereng for CASSANDRA-17470 --- debian/cassandra.postinst | 2 ++ 1 file changed, 2 insertions(+) diff --git a/debian/cassandra.postinst b/debian/cassandra.postinst index 752ff1f6bbcd..95882e3ae225 100644 --- a/debian/cassandra.postinst +++ b/debian/cassandra.postinst @@ -37,6 +37,8 @@ case "$1" in if [ -z "$2" ]; then chown -R cassandra: /var/lib/cassandra chown -R cassandra: /var/log/cassandra + chmod 750 /var/lib/cassandra/ + chmod 750 /var/log/cassandra/ fi if ! 
sysctl -p /etc/sysctl.d/cassandra.conf; then echo >&2 From e9edf16c7f49cf81a26339470465d46964acb869 Mon Sep 17 00:00:00 2001 From: Brandon Williams Date: Wed, 25 May 2022 06:21:15 -0500 Subject: [PATCH 009/159] Update changes for 17470 --- CHANGES.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGES.txt b/CHANGES.txt index 0f64ce2612c1..c38906747d55 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -1,4 +1,5 @@ 4.2 + * Change default directory permission to 750 in packaging (CASSANDRA-17470) * Adding support for TLS client authentication for internode communication (CASSANDRA-17513) * Add new CQL function maxWritetime (CASSANDRA-17425) * Add guardrail for ALTER TABLE ADD / DROP / REMOVE column operations (CASSANDRA-17495) From 14fbab15bd264dd1cf894bf48170cf4f30ada8a0 Mon Sep 17 00:00:00 2001 From: Josh McKenzie Date: Thu, 19 May 2022 15:48:20 -0400 Subject: [PATCH 010/159] Add guardrail to allow disabling SimpleStrategy Patch by Josh McKenzie; reviewed by Aleksey Yeschenko for CASSANDRA-17647 --- CHANGES.txt | 1 + NEWS.txt | 1 + conf/cassandra.yaml | 3 + .../org/apache/cassandra/config/Config.java | 1 + .../cassandra/config/GuardrailsOptions.java | 14 +++ .../schema/AlterKeyspaceStatement.java | 5 ++ .../schema/CreateKeyspaceStatement.java | 4 + .../statements/schema/KeyspaceAttributes.java | 2 +- .../cassandra/db/guardrails/Guardrails.java | 20 +++++ .../db/guardrails/GuardrailsConfig.java | 7 ++ .../db/guardrails/GuardrailsMBean.java | 14 +++ .../GuardrailSimpleStrategyTest.java | 89 +++++++++++++++++++ 12 files changed, 160 insertions(+), 1 deletion(-) create mode 100644 test/unit/org/apache/cassandra/db/guardrails/GuardrailSimpleStrategyTest.java diff --git a/CHANGES.txt b/CHANGES.txt index c38906747d55..6d9feb52ffbf 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -1,4 +1,5 @@ 4.2 + * Add guardrail to allow disabling of SimpleStrategy (CASSANDRA-17647) * Change default directory permission to 750 in packaging (CASSANDRA-17470) * Adding support for TLS client authentication for internode communication (CASSANDRA-17513) * Add new CQL function maxWritetime (CASSANDRA-17425) diff --git a/NEWS.txt b/NEWS.txt index e44b0e4fe83b..2b81a284adff 100644 --- a/NEWS.txt +++ b/NEWS.txt @@ -62,6 +62,7 @@ New features non-frozen collections and UDT, and returns the largest timestamp. One should not to use it when upgrading to 4.2. - New Guardrails added: - Whether ALTER TABLE commands are allowed to mutate columns + - Whether SimpleStrategy is allowed on keyspace creation or alteration Upgrading --------- diff --git a/conf/cassandra.yaml b/conf/cassandra.yaml index 409110a02430..491740f012c6 100644 --- a/conf/cassandra.yaml +++ b/conf/cassandra.yaml @@ -1763,6 +1763,9 @@ drop_compact_storage_enabled: false # Guardrail to allow/disallow querying with ALLOW FILTERING. Defaults to true. # allow_filtering_enabled: true # +# Guardrail to allow/disallow setting SimpleStrategy via keyspace creation or alteration. Defaults to true. +# simplestrategy_enabled: true +# # Guardrail to warn or fail when creating a user-defined-type with more fields in than threshold. # Default -1 to disable. 
# fields_per_udt_warn_threshold: -1 diff --git a/src/java/org/apache/cassandra/config/Config.java b/src/java/org/apache/cassandra/config/Config.java index 9f75a277bcbc..c3c5b3582cb4 100644 --- a/src/java/org/apache/cassandra/config/Config.java +++ b/src/java/org/apache/cassandra/config/Config.java @@ -833,6 +833,7 @@ public static void setClientMode(boolean clientMode) public volatile boolean compact_tables_enabled = true; public volatile boolean read_before_write_list_operations_enabled = true; public volatile boolean allow_filtering_enabled = true; + public volatile boolean simplestrategy_enabled = true; public volatile DataStorageSpec.LongBytesBound collection_size_warn_threshold = null; public volatile DataStorageSpec.LongBytesBound collection_size_fail_threshold = null; public volatile int items_per_collection_warn_threshold = -1; diff --git a/src/java/org/apache/cassandra/config/GuardrailsOptions.java b/src/java/org/apache/cassandra/config/GuardrailsOptions.java index b14f428ca452..e8d7bda77a16 100644 --- a/src/java/org/apache/cassandra/config/GuardrailsOptions.java +++ b/src/java/org/apache/cassandra/config/GuardrailsOptions.java @@ -427,6 +427,20 @@ public void setAllowFilteringEnabled(boolean enabled) x -> config.allow_filtering_enabled = x); } + @Override + public boolean getSimpleStrategyEnabled() + { + return config.simplestrategy_enabled; + } + + public void setSimpleStrategyEnabled(boolean enabled) + { + updatePropertyWithLogging("simplestrategy_enabled", + enabled, + () -> config.simplestrategy_enabled, + x -> config.simplestrategy_enabled = x); + } + @Override public int getInSelectCartesianProductWarnThreshold() { diff --git a/src/java/org/apache/cassandra/cql3/statements/schema/AlterKeyspaceStatement.java b/src/java/org/apache/cassandra/cql3/statements/schema/AlterKeyspaceStatement.java index 87377d70ec37..dec0655e74a4 100644 --- a/src/java/org/apache/cassandra/cql3/statements/schema/AlterKeyspaceStatement.java +++ b/src/java/org/apache/cassandra/cql3/statements/schema/AlterKeyspaceStatement.java @@ -31,12 +31,14 @@ import org.apache.cassandra.cql3.CQLStatement; import org.apache.cassandra.db.ColumnFamilyStore; import org.apache.cassandra.db.Keyspace; +import org.apache.cassandra.db.guardrails.Guardrails; import org.apache.cassandra.exceptions.ConfigurationException; import org.apache.cassandra.gms.Gossiper; import org.apache.cassandra.locator.AbstractReplicationStrategy; import org.apache.cassandra.locator.InetAddressAndPort; import org.apache.cassandra.locator.LocalStrategy; import org.apache.cassandra.locator.ReplicationFactor; +import org.apache.cassandra.locator.SimpleStrategy; import org.apache.cassandra.schema.KeyspaceMetadata; import org.apache.cassandra.schema.KeyspaceMetadata.KeyspaceDiff; import org.apache.cassandra.schema.Keyspaces; @@ -76,6 +78,9 @@ public Keyspaces apply(Keyspaces schema) KeyspaceMetadata newKeyspace = keyspace.withSwapped(attrs.asAlteredKeyspaceParams(keyspace.params)); + if (attrs.getReplicationStrategyClass() != null && attrs.getReplicationStrategyClass().equals(SimpleStrategy.class.getSimpleName())) + Guardrails.simpleStrategyEnabled.ensureEnabled(state); + if (newKeyspace.params.replication.klass.equals(LocalStrategy.class)) throw ire("Unable to use given strategy class: LocalStrategy is reserved for internal use."); diff --git a/src/java/org/apache/cassandra/cql3/statements/schema/CreateKeyspaceStatement.java b/src/java/org/apache/cassandra/cql3/statements/schema/CreateKeyspaceStatement.java index dc82f93a1095..ad6bcc472d79 100644 --- 
a/src/java/org/apache/cassandra/cql3/statements/schema/CreateKeyspaceStatement.java +++ b/src/java/org/apache/cassandra/cql3/statements/schema/CreateKeyspaceStatement.java @@ -36,6 +36,7 @@ import org.apache.cassandra.db.guardrails.Guardrails; import org.apache.cassandra.exceptions.AlreadyExistsException; import org.apache.cassandra.locator.LocalStrategy; +import org.apache.cassandra.locator.SimpleStrategy; import org.apache.cassandra.schema.KeyspaceMetadata; import org.apache.cassandra.schema.KeyspaceParams.Option; import org.apache.cassandra.schema.Keyspaces; @@ -67,6 +68,9 @@ public Keyspaces apply(Keyspaces schema) if (!attrs.hasOption(Option.REPLICATION)) throw ire("Missing mandatory option '%s'", Option.REPLICATION); + if (attrs.getReplicationStrategyClass() != null && attrs.getReplicationStrategyClass().equals(SimpleStrategy.class.getSimpleName())) + Guardrails.simpleStrategyEnabled.ensureEnabled("SimpleStrategy", state); + if (schema.containsKeyspace(keyspaceName)) { if (ifNotExists) diff --git a/src/java/org/apache/cassandra/cql3/statements/schema/KeyspaceAttributes.java b/src/java/org/apache/cassandra/cql3/statements/schema/KeyspaceAttributes.java index 42fcaf4e69e8..d4d5b984b3c3 100644 --- a/src/java/org/apache/cassandra/cql3/statements/schema/KeyspaceAttributes.java +++ b/src/java/org/apache/cassandra/cql3/statements/schema/KeyspaceAttributes.java @@ -50,7 +50,7 @@ public void validate() throw new ConfigurationException("Missing replication strategy class"); } - private String getReplicationStrategyClass() + public String getReplicationStrategyClass() { return getAllReplicationOptions().get(ReplicationParams.CLASS); } diff --git a/src/java/org/apache/cassandra/db/guardrails/Guardrails.java b/src/java/org/apache/cassandra/db/guardrails/Guardrails.java index 57dd2af5ceb2..16146fec87a4 100644 --- a/src/java/org/apache/cassandra/db/guardrails/Guardrails.java +++ b/src/java/org/apache/cassandra/db/guardrails/Guardrails.java @@ -218,6 +218,14 @@ public final class Guardrails implements GuardrailsMBean state -> CONFIG_PROVIDER.getOrCreate(state).getAllowFilteringEnabled(), "Querying with ALLOW FILTERING"); + /** + * Guardrail disabling setting SimpleStrategy via keyspace creation or alteration + */ + public static final EnableFlag simpleStrategyEnabled = + new EnableFlag("simplestrategy", + state -> CONFIG_PROVIDER.getOrCreate(state).getSimpleStrategyEnabled(), + "SimpleStrategy"); + /** * Guardrail on the number of restrictions created by a cartesian product of a CQL's {@code IN} query. 
*/ @@ -571,6 +579,18 @@ public void setAllowFilteringEnabled(boolean enabled) DEFAULT_CONFIG.setAllowFilteringEnabled(enabled); } + @Override + public boolean getSimpleStrategyEnabled() + { + return DEFAULT_CONFIG.getSimpleStrategyEnabled(); + } + + @Override + public void setSimpleStrategyEnabled(boolean enabled) + { + DEFAULT_CONFIG.setSimpleStrategyEnabled(enabled); + } + @Override public boolean getUncompressedTablesEnabled() { diff --git a/src/java/org/apache/cassandra/db/guardrails/GuardrailsConfig.java b/src/java/org/apache/cassandra/db/guardrails/GuardrailsConfig.java index 7f38e80ec360..72eaaa5b487a 100644 --- a/src/java/org/apache/cassandra/db/guardrails/GuardrailsConfig.java +++ b/src/java/org/apache/cassandra/db/guardrails/GuardrailsConfig.java @@ -191,6 +191,13 @@ public interface GuardrailsConfig */ boolean getAllowFilteringEnabled(); + /** + * Returns whether setting SimpleStrategy via keyspace creation or alteration is enabled + * + * @return {@code true} if SimpleStrategy is allowed, {@code false} otherwise. + */ + boolean getSimpleStrategyEnabled(); + /** * @return The threshold to warn when an IN query creates a cartesian product with a size exceeding threshold. * -1 means disabled. diff --git a/src/java/org/apache/cassandra/db/guardrails/GuardrailsMBean.java b/src/java/org/apache/cassandra/db/guardrails/GuardrailsMBean.java index 30be464d023c..47db91a6fa85 100644 --- a/src/java/org/apache/cassandra/db/guardrails/GuardrailsMBean.java +++ b/src/java/org/apache/cassandra/db/guardrails/GuardrailsMBean.java @@ -222,6 +222,20 @@ public interface GuardrailsMBean */ void setAllowFilteringEnabled(boolean enabled); + /** + * Returns whether SimpleStrategy is allowed on keyspace creation or alteration + * + * @return {@code true} if SimpleStrategy is allowed; {@code false} otherwise + */ + boolean getSimpleStrategyEnabled(); + + /** + * Sets whether SimpleStrategy is allowed on keyspace creation or alteration + * + * @param enabled {@code true} if SimpleStrategy is allowed, {@code false} otherwise. + */ + void setSimpleStrategyEnabled(boolean enabled); + /** * Returns whether users can disable compression on tables * diff --git a/test/unit/org/apache/cassandra/db/guardrails/GuardrailSimpleStrategyTest.java b/test/unit/org/apache/cassandra/db/guardrails/GuardrailSimpleStrategyTest.java new file mode 100644 index 000000000000..3cc6bc747d5b --- /dev/null +++ b/test/unit/org/apache/cassandra/db/guardrails/GuardrailSimpleStrategyTest.java @@ -0,0 +1,89 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.cassandra.db.guardrails; + +import org.junit.After; +import org.junit.Test; + +public class GuardrailSimpleStrategyTest extends GuardrailTester +{ + public static String ERROR_MSG = "SimpleStrategy is not allowed"; + + public GuardrailSimpleStrategyTest() + { + super(Guardrails.simpleStrategyEnabled); + } + + private void setGuardrail(boolean enabled) + { + guardrails().setSimpleStrategyEnabled(enabled); + } + + @After + public void afterTest() throws Throwable + { + setGuardrail(true); + execute("DROP KEYSPACE IF EXISTS test_ss;"); + } + + @Test + public void testCanCreateWithGuardrailEnabled() throws Throwable + { + assertValid("CREATE KEYSPACE test_ss WITH replication = {'class': 'SimpleStrategy'};"); + } + + @Test + public void testCanAlterWithGuardrailEnabled() throws Throwable + { + execute("CREATE KEYSPACE test_ss WITH replication = {'class': 'NetworkTopologyStrategy', 'datacenter1':2, 'datacenter2':0};"); + assertValid("ALTER KEYSPACE test_ss WITH replication = {'class': 'SimpleStrategy'};"); + } + + @Test + public void testGuardrailBlocksCreate() throws Throwable + { + setGuardrail(false); + assertFails("CREATE KEYSPACE test_ss WITH replication = {'class': 'SimpleStrategy'};", ERROR_MSG); + } + + @Test + public void testGuardrailBlocksAlter() throws Throwable + { + setGuardrail(false); + execute("CREATE KEYSPACE test_ss WITH replication = {'class': 'NetworkTopologyStrategy', 'datacenter1':2, 'datacenter2':0};"); + assertFails("ALTER KEYSPACE test_ss WITH replication = {'class': 'SimpleStrategy'};", ERROR_MSG); + } + + @Test + public void testToggle() throws Throwable + { + setGuardrail(false); + assertFails("CREATE KEYSPACE test_ss WITH replication = {'class': 'SimpleStrategy'};", ERROR_MSG); + + setGuardrail(true); + assertValid("CREATE KEYSPACE test_ss WITH replication = {'class': 'SimpleStrategy'};"); + execute("ALTER KEYSPACE test_ss WITH replication = {'class': 'NetworkTopologyStrategy', 'datacenter1':2, 'datacenter2':0};"); + + setGuardrail(false); + assertFails("ALTER KEYSPACE test_ss WITH replication = {'class': 'SimpleStrategy'};", ERROR_MSG); + + setGuardrail(true); + assertValid("ALTER KEYSPACE test_ss WITH replication = {'class': 'SimpleStrategy'};"); + } +} From 72d5b4d1b117a4a86cd578197ecdba667aa56343 Mon Sep 17 00:00:00 2001 From: Brad Schoening <5796692+bschoening@users.noreply.github.com> Date: Thu, 12 May 2022 20:03:25 -0400 Subject: [PATCH 011/159] add CQLSH command SHOW REPLICAS patch by Brad Schoening; reviewed by Stefan Miklosovic and Brandon Williams for CASSANDRA-17577 --- CHANGES.txt | 1 + bin/cqlsh.py | 18 +++++++++++++++++- doc/modules/cassandra/pages/tools/cqlsh.adoc | 15 +++++++++++++++ pylib/cqlshlib/cqlshhandling.py | 2 +- 4 files changed, 34 insertions(+), 2 deletions(-) diff --git a/CHANGES.txt b/CHANGES.txt index 6d9feb52ffbf..a7c8c576d6d5 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -1,4 +1,5 @@ 4.2 + * Add CQLSH command SHOW REPLICAS (CASSANDRA-17577) * Add guardrail to allow disabling of SimpleStrategy (CASSANDRA-17647) * Change default directory permission to 750 in packaging (CASSANDRA-17470) * Adding support for TLS client authentication for internode communication (CASSANDRA-17513) diff --git a/bin/cqlsh.py b/bin/cqlsh.py index 637c95e70aa2..35eb429abaeb 100755 --- a/bin/cqlsh.py +++ b/bin/cqlsh.py @@ -600,6 +600,13 @@ def show_version(self): def show_session(self, sessionid, partial_session=False): print_trace_session(self, self.session, sessionid, partial_session) + def show_replicas(self, token_value, 
keyspace=None): + ks = self.current_keyspace if keyspace is None else keyspace + token_map = self.conn.metadata.token_map + nodes = token_map.get_replicas(ks, token_map.token_class(token_value)) + addresses = [x.address for x in nodes] + print(f"{addresses}") + def get_connection_versions(self): result, = self.session.execute("select * from system.local where key = 'local'") vers = { @@ -979,7 +986,7 @@ def handle_parse_error(self, cmdword, tokens, parsed, srcstr): if parsed: self.printerr('Improper %s command (problem at %r).' % (cmdword, parsed.remainder[0])) else: - self.printerr('Improper %s command.' % cmdword) + self.printerr(f'Improper {cmdword} command.') def do_use(self, parsed): ksname = parsed.get_binding('ksname') @@ -1578,6 +1585,11 @@ def do_show(self, parsed): SHOW SESSION <sessionid> Pretty-prints the requested tracing session. + + SHOW REPLICAS <token> (<keyspace>) + + Lists the replica nodes by IP address for the given token. The current + keyspace is used if one is not specified. """ showwhat = parsed.get_binding('what').lower() if showwhat == 'version': @@ -1588,6 +1600,10 @@ def do_show(self, parsed): elif showwhat.startswith('session'): session_id = parsed.get_binding('sessionid').lower() self.show_session(UUID(session_id)) + elif showwhat.startswith('replicas'): + token_id = parsed.get_binding('token') + keyspace = parsed.get_binding('keyspace') + self.show_replicas(token_id, keyspace) else: self.printerr('Wait, how do I show %r?' % (showwhat,)) diff --git a/doc/modules/cassandra/pages/tools/cqlsh.adoc b/doc/modules/cassandra/pages/tools/cqlsh.adoc index 8050ee5df04d..0d40608c2c5d 100644 --- a/doc/modules/cassandra/pages/tools/cqlsh.adoc +++ b/doc/modules/cassandra/pages/tools/cqlsh.adoc @@ -181,6 +181,21 @@ cqlsh> SHOW HOST Connected to Prod_Cluster at 192.0.0.1:9042. ---- +=== `SHOW REPLICAS` + +Prints the IP addresses of the Cassandra nodes which are replicas for the +given token and keyspace. This command is available from Cassandra 4.2. + +`Usage`: `SHOW REPLICAS <token> (<keyspace>)` + +Example usage: + +[source,none] +---- +cqlsh> SHOW REPLICAS 95 +['192.0.0.1', '192.0.0.2'] +---- + === `SHOW SESSION` Pretty prints a specific tracing session. diff --git a/pylib/cqlshlib/cqlshhandling.py b/pylib/cqlshlib/cqlshhandling.py index aa1fbc01839e..cc8590a44f44 100644 --- a/pylib/cqlshlib/cqlshhandling.py +++ b/pylib/cqlshlib/cqlshhandling.py @@ -131,7 +131,7 @@ def registrator(f): ''' cqlsh_show_cmd_syntax_rules = r''' - <showCommand> ::= "SHOW" what=( "VERSION" | "HOST" | "SESSION" sessionid=<uuid> ) + <showCommand> ::= "SHOW" what=( "VERSION" | "HOST" | "SESSION" sessionid=<uuid> | "REPLICAS" token= (keyspace=)? 
) ; ''' From 740cec41d2d67783a463bd18f70221de331928df Mon Sep 17 00:00:00 2001 From: David Capwell Date: Wed, 1 Jun 2022 08:49:44 -0700 Subject: [PATCH 012/159] When a node is bootstrapping it gets the whole gossip state but applies in random order causing some cases where StorageService will fail causing an instance to not show up in TokenMetadata patch by David Capwell; reviewed by Blake Eggleston for CASSANDRA-17676 --- CHANGES.txt | 1 + .../org/apache/cassandra/gms/Gossiper.java | 46 ++++++++++++++++++- .../apache/cassandra/gms/VersionedValue.java | 3 ++ 3 files changed, 49 insertions(+), 1 deletion(-) diff --git a/CHANGES.txt b/CHANGES.txt index fb75bc510dbb..30dfdf763c47 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -1,4 +1,5 @@ 4.2 + * When a node is bootstrapping it gets the whole gossip state but applies in random order causing some cases where StorageService will fail causing an instance to not show up in TokenMetadata (CASSANDRA-17676) * Add CQLSH command SHOW REPLICAS (CASSANDRA-17577) * Add guardrail to allow disabling of SimpleStrategy (CASSANDRA-17647) * Change default directory permission to 750 in packaging (CASSANDRA-17470) diff --git a/src/java/org/apache/cassandra/gms/Gossiper.java b/src/java/org/apache/cassandra/gms/Gossiper.java index 4c72166a9d15..8041eed034b6 100644 --- a/src/java/org/apache/cassandra/gms/Gossiper.java +++ b/src/java/org/apache/cassandra/gms/Gossiper.java @@ -85,6 +85,7 @@ import static org.apache.cassandra.utils.FBUtilities.getBroadcastAddressAndPort; import static org.apache.cassandra.utils.Clock.Global.currentTimeMillis; import static org.apache.cassandra.utils.Clock.Global.nanoTime; +import static org.apache.cassandra.gms.VersionedValue.BOOTSTRAPPING_STATUS; /** * This module is responsible for Gossiping information for the local endpoint. This abstraction @@ -1500,11 +1501,54 @@ private static String getGossipStatus(EndpointState epState) return pieces[0]; } + /** + * Gossip offers no happens-before relationship, but downstream subscribers assume a happens-before relationship + * before being notified! To attempt to be nicer to subscribers, this {@link Comparator} attempts to order EndpointState + * within a map based off a few heuristics: + *

+ * 1. STATUS - some STATUS depends on other instance STATUS, so make sure they are last; eg. BOOT, and BOOT_REPLACE + * 2. generation - normally defined as system clock millis, this can be skewed and is a best effort + * 3. address - tie breaker to make sure order is consistent + * + * Problems: + * Generation is normally defined as system clock millis, which can be skewed and inconsistent across nodes + * (generations do not have a happens-before relationship, so ordering is sketchy at best). + *
+ * Motivations: + * {@link Map#entrySet()} returns data in effectively random order, so can get into a situation such as the following example. + * {@code + * 3 node cluster: n1, n2, and n3 + * n2 goes down and n4 does host replacement and fails before completion + * n5 tries to do a host replacement against n4 (ignore the fact this doesn't make sense) + * } + * In the case above, the {@link Map#entrySet()} ordering can be random, causing n4 to apply before n2, which will + * be rejected by subscribers (only after updating gossip, causing zero retries). + */ + private static Comparator<Entry<InetAddressAndPort, EndpointState>> STATE_MAP_ORDERING = + ((Comparator<Entry<InetAddressAndPort, EndpointState>>) (e1, e2) -> { + // check status first, make sure bootstrap status happens-after all others + if (BOOTSTRAPPING_STATUS.contains(getGossipStatus(e1.getValue()))) + return 1; + if (BOOTSTRAPPING_STATUS.contains(getGossipStatus(e2.getValue()))) + return -1; + return 0; + }) + .thenComparingInt((Entry<InetAddressAndPort, EndpointState> e) -> e.getValue().getHeartBeatState().getGeneration()) + .thenComparing(Entry::getKey); + + private static Iterable<Entry<InetAddressAndPort, EndpointState>> order(Map<InetAddressAndPort, EndpointState> epStateMap) + { + List<Entry<InetAddressAndPort, EndpointState>> list = new ArrayList<>(epStateMap.entrySet()); + Collections.sort(list, STATE_MAP_ORDERING); + return list; + } + @VisibleForTesting public void applyStateLocally(Map<InetAddressAndPort, EndpointState> epStateMap) { checkProperThreadForStateMutation(); - for (Entry<InetAddressAndPort, EndpointState> entry : epStateMap.entrySet()) + for (Entry<InetAddressAndPort, EndpointState> entry : order(epStateMap)) { InetAddressAndPort ep = entry.getKey(); if (ep.equals(getBroadcastAddressAndPort()) && !isInShadowRound()) diff --git a/src/java/org/apache/cassandra/gms/VersionedValue.java b/src/java/org/apache/cassandra/gms/VersionedValue.java index 26644e17cc91..519fffa9b817 100644 --- a/src/java/org/apache/cassandra/gms/VersionedValue.java +++ b/src/java/org/apache/cassandra/gms/VersionedValue.java @@ -27,6 +27,7 @@ import static java.nio.charset.StandardCharsets.ISO_8859_1; import com.google.common.annotations.VisibleForTesting; +import com.google.common.collect.ImmutableSet; import com.google.common.collect.Iterables; import org.apache.cassandra.db.TypeSizes; @@ -84,6 +85,8 @@ public class VersionedValue implements Comparable<VersionedValue> // values for ApplicationState.REMOVAL_COORDINATOR public final static String REMOVAL_COORDINATOR = "REMOVER"; + public static Set<String> BOOTSTRAPPING_STATUS = ImmutableSet.of(STATUS_BOOTSTRAPPING, STATUS_BOOTSTRAPPING_REPLACE); + public final int version; public final String value; From 6247c9d966d7e3886fef0c7486013578407b37e4 Mon Sep 17 00:00:00 2001 From: David Capwell Date: Mon, 6 Jun 2022 13:42:00 -0700 Subject: [PATCH 013/159] jvm-dtest upgrade tests run all supported pairs of upgrades between from/to but does not actually test all patches from/to patch by David Capwell; reviewed by Jon Meredith, Michael Semb Wever for CASSANDRA-17656 --- .../distributed/upgrade/BatchUpgradeTest.java | 2 +- .../CompactStorageColumnDeleteTest.java | 2 +- .../CompactStorageHiddenColumnTest.java | 2 +- ...ctStorageImplicitNullInClusteringTest.java | 2 +- .../upgrade/CompactStoragePagingTest.java | 2 +- ...ompactStorageNullClusteringValuesTest.java | 2 +- .../upgrade/DropCompactStorageTest.java | 2 +- .../distributed/upgrade/GroupByTest.java | 3 +- .../MixedModeAvailabilityTestBase.java | 36 ++-- .../upgrade/MixedModeAvailabilityV30Test.java | 9 +- .../upgrade/MixedModeAvailabilityV3XTest.java | 9 +- .../upgrade/MixedModeConsistencyTestBase.java | 7 +- .../MixedModeFrom3ReplicationTest.java | 62 +++++- .../upgrade/MixedModeGossipTest.java | 9 +- .../upgrade/MixedModeMessageForwardTest.java | 2 +- .../MixedModeReadRepairDeleteTest.java | 6 
+- .../upgrade/MixedModeReadRepairWriteTest.java | 4 +- .../upgrade/MixedModeReadTest.java | 5 +- .../upgrade/MixedModeRepairTest.java | 2 +- .../upgrade/MixedModeReplicationTestBase.java | 89 -------- .../upgrade/Pre40MessageFilterTest.java | 4 +- .../distributed/upgrade/UpgradeTest.java | 4 +- .../distributed/upgrade/UpgradeTestBase.java | 195 ++++++++++++++---- .../apache/cassandra/utils/SimpleGraph.java | 126 +++++++++++ .../cassandra/utils/SimpleGraphTest.java | 108 ++++++++++ 25 files changed, 498 insertions(+), 196 deletions(-) delete mode 100644 test/distributed/org/apache/cassandra/distributed/upgrade/MixedModeReplicationTestBase.java create mode 100644 test/unit/org/apache/cassandra/utils/SimpleGraph.java create mode 100644 test/unit/org/apache/cassandra/utils/SimpleGraphTest.java diff --git a/test/distributed/org/apache/cassandra/distributed/upgrade/BatchUpgradeTest.java b/test/distributed/org/apache/cassandra/distributed/upgrade/BatchUpgradeTest.java index fb442331bfbf..3eb72f3add57 100644 --- a/test/distributed/org/apache/cassandra/distributed/upgrade/BatchUpgradeTest.java +++ b/test/distributed/org/apache/cassandra/distributed/upgrade/BatchUpgradeTest.java @@ -33,7 +33,7 @@ public void batchTest() throws Throwable .nodes(2) .nodesToUpgrade(2) - .upgradesFrom(v40).setup((cluster) -> { + .upgradesToCurrentFrom(v40).setup((cluster) -> { cluster.schemaChange("CREATE TABLE "+KEYSPACE+".users (" + "userid uuid PRIMARY KEY," + "firstname ascii," + diff --git a/test/distributed/org/apache/cassandra/distributed/upgrade/CompactStorageColumnDeleteTest.java b/test/distributed/org/apache/cassandra/distributed/upgrade/CompactStorageColumnDeleteTest.java index 720a1b5db07c..920c8508a9b6 100644 --- a/test/distributed/org/apache/cassandra/distributed/upgrade/CompactStorageColumnDeleteTest.java +++ b/test/distributed/org/apache/cassandra/distributed/upgrade/CompactStorageColumnDeleteTest.java @@ -33,7 +33,7 @@ public void testColumnDeleteWithCompactStorage() throws Throwable new TestCase() .nodes(2) .nodesToUpgrade(2) - .upgradesFrom(v30) + .upgradesToCurrentFrom(v30) .setup((cluster) -> { cluster.schemaChange("CREATE TABLE " + KEYSPACE + ".tbl (pk int, ck int, v int, PRIMARY KEY (pk, ck)) WITH COMPACT STORAGE"); }) diff --git a/test/distributed/org/apache/cassandra/distributed/upgrade/CompactStorageHiddenColumnTest.java b/test/distributed/org/apache/cassandra/distributed/upgrade/CompactStorageHiddenColumnTest.java index 4e5236c4a844..178d32800ba8 100644 --- a/test/distributed/org/apache/cassandra/distributed/upgrade/CompactStorageHiddenColumnTest.java +++ b/test/distributed/org/apache/cassandra/distributed/upgrade/CompactStorageHiddenColumnTest.java @@ -33,7 +33,7 @@ public void testHiddenColumnWithCompactStorage() throws Throwable new TestCase() .nodes(2) .nodesToUpgrade(2) - .upgradesFrom(v30) + .upgradesToCurrentFrom(v30) .setup((cluster) -> { cluster.schemaChange("CREATE TABLE " + KEYSPACE + ".tbl (pk int, ck int, PRIMARY KEY (pk, ck)) WITH COMPACT STORAGE"); }) diff --git a/test/distributed/org/apache/cassandra/distributed/upgrade/CompactStorageImplicitNullInClusteringTest.java b/test/distributed/org/apache/cassandra/distributed/upgrade/CompactStorageImplicitNullInClusteringTest.java index 9d4824ae5b61..b59fda315149 100644 --- a/test/distributed/org/apache/cassandra/distributed/upgrade/CompactStorageImplicitNullInClusteringTest.java +++ b/test/distributed/org/apache/cassandra/distributed/upgrade/CompactStorageImplicitNullInClusteringTest.java @@ -33,7 +33,7 @@ public void 
testImplicitNullInClusteringWithCompactStorage() throws Throwable new TestCase() .nodes(2) .nodesToUpgrade(2) - .upgradesFrom(v30) + .upgradesToCurrentFrom(v30) .setup((cluster) -> { cluster.schemaChange("CREATE TABLE " + KEYSPACE + ".tbl (pk int, ck1 int, ck2 int, v int, PRIMARY KEY (pk, ck1, ck2)) WITH COMPACT STORAGE"); }) diff --git a/test/distributed/org/apache/cassandra/distributed/upgrade/CompactStoragePagingTest.java b/test/distributed/org/apache/cassandra/distributed/upgrade/CompactStoragePagingTest.java index 307d6dd7fe7d..62d6ea04e5e1 100644 --- a/test/distributed/org/apache/cassandra/distributed/upgrade/CompactStoragePagingTest.java +++ b/test/distributed/org/apache/cassandra/distributed/upgrade/CompactStoragePagingTest.java @@ -33,7 +33,7 @@ public void testPagingWithCompactStorage() throws Throwable new TestCase() .nodes(2) .nodesToUpgrade(2) - .upgradesFrom(v30) + .upgradesToCurrentFrom(v30) .setup((cluster) -> { cluster.schemaChange("CREATE TABLE " + KEYSPACE + ".tbl (pk int, ck int, v int, PRIMARY KEY (pk, ck)) WITH COMPACT STORAGE"); for (int i = 1; i < 10; i++) diff --git a/test/distributed/org/apache/cassandra/distributed/upgrade/DropCompactStorageNullClusteringValuesTest.java b/test/distributed/org/apache/cassandra/distributed/upgrade/DropCompactStorageNullClusteringValuesTest.java index 1657765b06be..2e5578d2c24f 100644 --- a/test/distributed/org/apache/cassandra/distributed/upgrade/DropCompactStorageNullClusteringValuesTest.java +++ b/test/distributed/org/apache/cassandra/distributed/upgrade/DropCompactStorageNullClusteringValuesTest.java @@ -33,7 +33,7 @@ public class DropCompactStorageNullClusteringValuesTest extends UpgradeTestBase public void testNullClusteringValues() throws Throwable { new TestCase().nodes(1) - .upgradesFrom(v30) + .upgradesToCurrentFrom(v30) .withConfig(config -> config.with(GOSSIP, NETWORK, NATIVE_PROTOCOL).set("enable_drop_compact_storage", true)) .setup(cluster -> { String create = "CREATE TABLE %s.%s(k int, c1 int, c2 int, v int, PRIMARY KEY (k, c1, c2)) " + diff --git a/test/distributed/org/apache/cassandra/distributed/upgrade/DropCompactStorageTest.java b/test/distributed/org/apache/cassandra/distributed/upgrade/DropCompactStorageTest.java index c645085bd0dd..9846264e56e6 100644 --- a/test/distributed/org/apache/cassandra/distributed/upgrade/DropCompactStorageTest.java +++ b/test/distributed/org/apache/cassandra/distributed/upgrade/DropCompactStorageTest.java @@ -35,7 +35,7 @@ public void testDropCompactStorage() throws Throwable new TestCase() .nodes(2) .nodesToUpgrade(1, 2) - .upgradesFrom(v30) + .upgradesToCurrentFrom(v30) .withConfig(config -> config.with(GOSSIP, NETWORK).set("enable_drop_compact_storage", true)) .setup((cluster) -> { cluster.schemaChange("CREATE TABLE " + KEYSPACE + ".tbl (pk int, ck int, PRIMARY KEY (pk, ck)) WITH COMPACT STORAGE"); diff --git a/test/distributed/org/apache/cassandra/distributed/upgrade/GroupByTest.java b/test/distributed/org/apache/cassandra/distributed/upgrade/GroupByTest.java index 634c886f1356..b2971cd15f5a 100644 --- a/test/distributed/org/apache/cassandra/distributed/upgrade/GroupByTest.java +++ b/test/distributed/org/apache/cassandra/distributed/upgrade/GroupByTest.java @@ -21,7 +21,6 @@ import org.junit.Test; import org.apache.cassandra.distributed.api.ConsistencyLevel; -import org.apache.cassandra.distributed.shared.Versions; import static org.apache.cassandra.distributed.api.Feature.GOSSIP; import static org.apache.cassandra.distributed.api.Feature.NETWORK; @@ -36,7 +35,7 @@ public void 
testReads() throws Throwable // CASSANDRA-16582: group-by across mixed version cluster would fail with ArrayIndexOutOfBoundException new UpgradeTestBase.TestCase() .nodes(2) - .upgradesFrom(v3X) + .upgradesToCurrentFrom(v3X) .nodesToUpgrade(1) .withConfig(config -> config.with(GOSSIP, NETWORK)) .setup(cluster -> { diff --git a/test/distributed/org/apache/cassandra/distributed/upgrade/MixedModeAvailabilityTestBase.java b/test/distributed/org/apache/cassandra/distributed/upgrade/MixedModeAvailabilityTestBase.java index c1ae15352557..3c15032e1949 100644 --- a/test/distributed/org/apache/cassandra/distributed/upgrade/MixedModeAvailabilityTestBase.java +++ b/test/distributed/org/apache/cassandra/distributed/upgrade/MixedModeAvailabilityTestBase.java @@ -22,14 +22,16 @@ import java.util.List; import java.util.UUID; -import com.vdurmont.semver4j.Semver; +import org.junit.Test; +import com.vdurmont.semver4j.Semver; import org.apache.cassandra.distributed.api.ConsistencyLevel; import org.apache.cassandra.distributed.api.ICoordinator; import org.apache.cassandra.exceptions.ReadTimeoutException; import org.apache.cassandra.exceptions.WriteTimeoutException; import org.apache.cassandra.net.Verb; +import static java.lang.String.format; import static java.util.concurrent.TimeUnit.SECONDS; import static org.apache.cassandra.distributed.api.ConsistencyLevel.ALL; import static org.apache.cassandra.distributed.api.ConsistencyLevel.ONE; @@ -39,10 +41,9 @@ import static org.apache.cassandra.net.Verb.READ_REQ; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; -import static java.lang.String.format; -public class MixedModeAvailabilityTestBase extends UpgradeTestBase +public abstract class MixedModeAvailabilityTestBase extends UpgradeTestBase { private static final int NUM_NODES = 3; private static final int COORDINATOR = 1; @@ -50,29 +51,40 @@ public class MixedModeAvailabilityTestBase extends UpgradeTestBase new Tester(QUORUM, QUORUM), new Tester(ALL, ONE)); + private final Semver initial; + + protected MixedModeAvailabilityTestBase(Semver initial) + { + this.initial = initial; + } - protected static void testAvailability(Semver initial) throws Throwable + @Test + public void testAvailabilityCoordinatorNotUpgraded() throws Throwable { - testAvailability(initial, UpgradeTestBase.CURRENT); + testAvailability(false, initial); } - protected static void testAvailability(Semver initial, Semver upgrade) throws Throwable + @Test + public void testAvailabilityCoordinatorUpgraded() throws Throwable { - testAvailability(true, initial, upgrade); - testAvailability(false, initial, upgrade); + testAvailability(true, initial); } private static void testAvailability(boolean upgradedCoordinator, - Semver initial, - Semver upgrade) throws Throwable + Semver initial) throws Throwable { new TestCase() .nodes(NUM_NODES) .nodesToUpgrade(upgradedCoordinator ? 1 : 2) - .upgrades(initial, upgrade) + .upgradesToCurrentFrom(initial) .withConfig(config -> config.set("read_request_timeout_in_ms", SECONDS.toMillis(2)) .set("write_request_timeout_in_ms", SECONDS.toMillis(2))) - .setup(c -> c.schemaChange(withKeyspace("CREATE TABLE %s.t (k uuid, c int, v int, PRIMARY KEY (k, c))"))) + // use retry of 10ms so that each check is consistent + // At the start of the world cfs.sampleLatencyNanos == 0, which means speculation acts as if ALWAYS is done, + // but after the first refresh this gets set high enough that we don't trigger speculation for the rest of the test! 
+ // To be consistent set retry to 10ms so cfs.sampleLatencyNanos stays consistent for the duration of the test. + .setup(c -> c.schemaChange(withKeyspace("CREATE TABLE %s.t (k uuid, c int, v int, PRIMARY KEY (k, c)) WITH speculative_retry = '10ms'"))) + .runBeforeClusterUpgrade(cluster -> cluster.filters().reset()) .runAfterNodeUpgrade((cluster, n) -> { // using 0 to 2 down nodes... diff --git a/test/distributed/org/apache/cassandra/distributed/upgrade/MixedModeAvailabilityV30Test.java b/test/distributed/org/apache/cassandra/distributed/upgrade/MixedModeAvailabilityV30Test.java index 984df3ba9f16..d656958fcd62 100644 --- a/test/distributed/org/apache/cassandra/distributed/upgrade/MixedModeAvailabilityV30Test.java +++ b/test/distributed/org/apache/cassandra/distributed/upgrade/MixedModeAvailabilityV30Test.java @@ -18,18 +18,13 @@ package org.apache.cassandra.distributed.upgrade; -import org.junit.Test; - -import org.apache.cassandra.distributed.shared.Versions; - /** * {@link MixedModeAvailabilityTestBase} for upgrades from v30. */ public class MixedModeAvailabilityV30Test extends MixedModeAvailabilityTestBase { - @Test - public void testAvailability() throws Throwable + public MixedModeAvailabilityV30Test() { - testAvailability(v30); + super(v30); } } diff --git a/test/distributed/org/apache/cassandra/distributed/upgrade/MixedModeAvailabilityV3XTest.java b/test/distributed/org/apache/cassandra/distributed/upgrade/MixedModeAvailabilityV3XTest.java index 70230f5f0ab8..16d17655c3fa 100644 --- a/test/distributed/org/apache/cassandra/distributed/upgrade/MixedModeAvailabilityV3XTest.java +++ b/test/distributed/org/apache/cassandra/distributed/upgrade/MixedModeAvailabilityV3XTest.java @@ -18,18 +18,13 @@ package org.apache.cassandra.distributed.upgrade; -import org.junit.Test; - -import org.apache.cassandra.distributed.shared.Versions; - /** * {@link MixedModeAvailabilityTestBase} for upgrades from v3X. 
*/ public class MixedModeAvailabilityV3XTest extends MixedModeAvailabilityTestBase { - @Test - public void testAvailability() throws Throwable + public MixedModeAvailabilityV3XTest() { - testAvailability(v3X); + super(v3X); } } diff --git a/test/distributed/org/apache/cassandra/distributed/upgrade/MixedModeConsistencyTestBase.java b/test/distributed/org/apache/cassandra/distributed/upgrade/MixedModeConsistencyTestBase.java index f98fc8a71b41..519625e76194 100644 --- a/test/distributed/org/apache/cassandra/distributed/upgrade/MixedModeConsistencyTestBase.java +++ b/test/distributed/org/apache/cassandra/distributed/upgrade/MixedModeConsistencyTestBase.java @@ -41,11 +41,6 @@ public class MixedModeConsistencyTestBase extends UpgradeTestBase { protected static void testConsistency(Semver initial) throws Throwable - { - testConsistency(initial, UpgradeTestBase.CURRENT); - } - - protected static void testConsistency(Semver initial, Semver upgrade) throws Throwable { List testers = new ArrayList<>(); testers.addAll(Tester.create(1, ALL)); @@ -55,7 +50,7 @@ protected static void testConsistency(Semver initial, Semver upgrade) throws Thr new TestCase() .nodes(3) .nodesToUpgrade(1) - .upgrades(initial, upgrade) + .upgradesToCurrentFrom(initial) .withConfig(config -> config.set("read_request_timeout_in_ms", SECONDS.toMillis(30)) .set("write_request_timeout_in_ms", SECONDS.toMillis(30))) .setup(cluster -> { diff --git a/test/distributed/org/apache/cassandra/distributed/upgrade/MixedModeFrom3ReplicationTest.java b/test/distributed/org/apache/cassandra/distributed/upgrade/MixedModeFrom3ReplicationTest.java index a38e25d4b55e..69d3dbec710e 100644 --- a/test/distributed/org/apache/cassandra/distributed/upgrade/MixedModeFrom3ReplicationTest.java +++ b/test/distributed/org/apache/cassandra/distributed/upgrade/MixedModeFrom3ReplicationTest.java @@ -18,21 +18,65 @@ package org.apache.cassandra.distributed.upgrade; +import java.util.ArrayList; +import java.util.List; + import org.junit.Test; -import org.apache.cassandra.distributed.shared.Versions; +import org.apache.cassandra.distributed.api.ConsistencyLevel; -public class MixedModeFrom3ReplicationTest extends MixedModeReplicationTestBase -{ - @Test - public void testSimpleStrategy30to3X() throws Throwable - { - testSimpleStrategy(v30, v3X); - } +import static org.apache.cassandra.distributed.shared.AssertUtils.assertRows; +import static org.apache.cassandra.distributed.shared.AssertUtils.row; +public class MixedModeFrom3ReplicationTest extends UpgradeTestBase +{ @Test public void testSimpleStrategy() throws Throwable { - testSimpleStrategy(v30); + String insert = "INSERT INTO test_simple.names (key, name) VALUES (?, ?)"; + String select = "SELECT * FROM test_simple.names WHERE key = ?"; + + new TestCase() + .nodes(3) + .nodesToUpgrade(1, 2) + .upgradesToCurrentFrom(v30) + .setup(cluster -> { + cluster.schemaChange("CREATE KEYSPACE test_simple WITH replication = {'class': 'SimpleStrategy', 'replication_factor': 2};"); + cluster.schemaChange("CREATE TABLE test_simple.names (key int PRIMARY KEY, name text)"); + }) + .runAfterNodeUpgrade((cluster, upgraded) -> { + List initialTokens = new ArrayList<>(cluster.size() + 1); + initialTokens.add(null); // The first valid token is at 1 to avoid offset math below. + + for (int i = 1; i <= cluster.size(); i++) + initialTokens.add(Long.valueOf(cluster.get(i).config().get("initial_token").toString())); + + List validTokens = initialTokens.subList(1, cluster.size() + 1); + + // Exercise all the coordinators... 
+ for (int i = 1; i <= cluster.size(); i++) + { + // ...and sample enough keys that we cover the ring. + for (int j = 0; j < 10; j++) + { + int key = j + (i * 10); + Object[] row = row(key, "Nero"); + Long token = tokenFrom(key); + + cluster.coordinator(i).execute(insert, ConsistencyLevel.ALL, row); + + int node = primaryReplica(validTokens, token); + assertRows(cluster.get(node).executeInternal(select, key), row); + + node = nextNode(node, cluster.size()); + assertRows(cluster.get(node).executeInternal(select, key), row); + + // At RF=2, this node should not have received the write. + node = nextNode(node, cluster.size()); + assertRows(cluster.get(node).executeInternal(select, key)); + } + } + }) + .run(); } } diff --git a/test/distributed/org/apache/cassandra/distributed/upgrade/MixedModeGossipTest.java b/test/distributed/org/apache/cassandra/distributed/upgrade/MixedModeGossipTest.java index 35c4fb381597..e1a96ac88c57 100644 --- a/test/distributed/org/apache/cassandra/distributed/upgrade/MixedModeGossipTest.java +++ b/test/distributed/org/apache/cassandra/distributed/upgrade/MixedModeGossipTest.java @@ -33,7 +33,6 @@ import org.apache.cassandra.distributed.UpgradeableCluster; import org.apache.cassandra.distributed.api.Feature; import org.apache.cassandra.distributed.api.IMessageFilters; -import org.apache.cassandra.distributed.shared.Versions; import org.apache.cassandra.net.Verb; import org.assertj.core.api.Assertions; @@ -51,8 +50,8 @@ public void testStatusFieldShouldExistInOldVersionNodes() throws Throwable .nodes(3) .nodesToUpgradeOrdered(1, 2, 3) // all upgrades from v30 up, excluding v30->v3X and from v40 - .singleUpgrade(v30) - .singleUpgrade(v3X) + .singleUpgradeToCurrentFrom(v30) + .singleUpgradeToCurrentFrom(v3X) .setup(c -> {}) .runAfterNodeUpgrade((cluster, node) -> { if (node == 1) { @@ -87,8 +86,8 @@ public void testStatusFieldShouldExistInOldVersionNodesEdgeCase() throws Throwab .nodes(3) .nodesToUpgradeOrdered(1, 2, 3) // all upgrades from v30 up, excluding v30->v3X and from v40 - .singleUpgrade(v30) - .singleUpgrade(v3X) + .singleUpgradeToCurrentFrom(v30) + .singleUpgradeToCurrentFrom(v3X) .setup(cluster -> { // node2 and node3 gossiper cannot talk with each other cluster.filters().verbs(Verb.GOSSIP_DIGEST_SYN.id).from(2).to(3).drop(); diff --git a/test/distributed/org/apache/cassandra/distributed/upgrade/MixedModeMessageForwardTest.java b/test/distributed/org/apache/cassandra/distributed/upgrade/MixedModeMessageForwardTest.java index 935cc8e0b66a..c2c4b88ba0a2 100644 --- a/test/distributed/org/apache/cassandra/distributed/upgrade/MixedModeMessageForwardTest.java +++ b/test/distributed/org/apache/cassandra/distributed/upgrade/MixedModeMessageForwardTest.java @@ -82,7 +82,7 @@ public void checkWritesForwardedToOtherDcTest() throws Throwable .withConfig(c -> c.with(Feature.GOSSIP, Feature.NETWORK).set("request_timeout_in_ms", 30000)) .withBuilder(b -> b.withRacks(numDCs, 1, nodesPerDc)) .nodes(numDCs * nodesPerDc) - .singleUpgrade(v30) + .singleUpgradeToCurrentFrom(v30) .setup(cluster -> { cluster.schemaChange("ALTER KEYSPACE " + KEYSPACE + " WITH replication = {'class': 'NetworkTopologyStrategy', " + ntsArgs + " };"); diff --git a/test/distributed/org/apache/cassandra/distributed/upgrade/MixedModeReadRepairDeleteTest.java b/test/distributed/org/apache/cassandra/distributed/upgrade/MixedModeReadRepairDeleteTest.java index 01955c5672b4..e60377853c34 100644 --- a/test/distributed/org/apache/cassandra/distributed/upgrade/MixedModeReadRepairDeleteTest.java +++ 
b/test/distributed/org/apache/cassandra/distributed/upgrade/MixedModeReadRepairDeleteTest.java @@ -46,7 +46,8 @@ public void mixedModeReadRepairDeleteRow() throws Throwable allUpgrades(2, 1) .setup(cluster -> { cluster.schemaChange(withKeyspace("CREATE TABLE %s.t (k int, c int, v int, s int static, PRIMARY KEY (k, c))")); - + }) + .runBeforeClusterUpgrade(cluster -> { // insert the rows in all the nodes String insert = withKeyspace("INSERT INTO %s.t (k, c, v, s) VALUES (?, ?, ?, ?)"); cluster.coordinator(1).execute(insert, ConsistencyLevel.ALL, row1); @@ -85,7 +86,8 @@ public void mixedModeReadRepairDeletePartition() throws Throwable allUpgrades(2, 1) .setup(cluster -> { cluster.schemaChange(withKeyspace("CREATE TABLE %s.t (k int, c int, v int, s int static, PRIMARY KEY (k, c))")); - + }) + .runBeforeClusterUpgrade(cluster -> { // insert half partition in each node String insert = withKeyspace("INSERT INTO %s.t (k, c, v, s) VALUES (?, ?, ?, ?)"); cluster.coordinator(1).execute(insert, ConsistencyLevel.ALL, partition1[0]); diff --git a/test/distributed/org/apache/cassandra/distributed/upgrade/MixedModeReadRepairWriteTest.java b/test/distributed/org/apache/cassandra/distributed/upgrade/MixedModeReadRepairWriteTest.java index fcb04824c2fd..4966e5c1a4d6 100644 --- a/test/distributed/org/apache/cassandra/distributed/upgrade/MixedModeReadRepairWriteTest.java +++ b/test/distributed/org/apache/cassandra/distributed/upgrade/MixedModeReadRepairWriteTest.java @@ -45,6 +45,7 @@ public void mixedModeReadRepairInsert() throws Throwable allUpgrades(2, 1) .setup(c -> c.schemaChange(withKeyspace("CREATE TABLE %s.t (k int, c int, v int, PRIMARY KEY (k, c))"))) + .runBeforeClusterUpgrade(cluster -> cluster.coordinator(1).execute(withKeyspace("TRUNCATE %s.t"), ConsistencyLevel.ALL)) .runAfterClusterUpgrade(cluster -> { // insert rows internally in each node @@ -77,7 +78,8 @@ public void mixedModeReadRepairUpdate() throws Throwable allUpgrades(2, 1) .setup(cluster -> { cluster.schemaChange(withKeyspace("CREATE TABLE %s.t (k int, c int, v int, PRIMARY KEY (k, c))")); - + }) + .runBeforeClusterUpgrade(cluster -> { // insert the initial version of the rows in all the nodes String insert = withKeyspace("INSERT INTO %s.t (k, c, v) VALUES (?, ?, ?)"); cluster.coordinator(1).execute(insert, ConsistencyLevel.ALL, row1); diff --git a/test/distributed/org/apache/cassandra/distributed/upgrade/MixedModeReadTest.java b/test/distributed/org/apache/cassandra/distributed/upgrade/MixedModeReadTest.java index b11678dc55cf..a03967829320 100644 --- a/test/distributed/org/apache/cassandra/distributed/upgrade/MixedModeReadTest.java +++ b/test/distributed/org/apache/cassandra/distributed/upgrade/MixedModeReadTest.java @@ -22,7 +22,6 @@ import org.apache.cassandra.distributed.api.Feature; import org.apache.cassandra.distributed.api.IInvokableInstance; -import org.apache.cassandra.distributed.shared.Versions; import org.apache.cassandra.gms.Gossiper; import org.apache.cassandra.utils.CassandraVersion; @@ -40,8 +39,8 @@ public void mixedModeReadColumnSubsetDigestCheck() throws Throwable .nodes(2) .nodesToUpgrade(1) // all upgrades from v30 up, excluding v30->v3X and from v40 - .singleUpgrade(v30) - .singleUpgrade(v3X) + .singleUpgradeToCurrentFrom(v30) + .singleUpgradeToCurrentFrom(v3X) .setup(cluster -> { cluster.schemaChange(CREATE_TABLE); insertData(cluster.coordinator(1)); diff --git a/test/distributed/org/apache/cassandra/distributed/upgrade/MixedModeRepairTest.java 
b/test/distributed/org/apache/cassandra/distributed/upgrade/MixedModeRepairTest.java index 813d9f20b546..6606bcde7e5e 100644 --- a/test/distributed/org/apache/cassandra/distributed/upgrade/MixedModeRepairTest.java +++ b/test/distributed/org/apache/cassandra/distributed/upgrade/MixedModeRepairTest.java @@ -54,7 +54,7 @@ public void testRepairDuringMajorUpgrade() throws Throwable new UpgradeTestBase.TestCase() .nodes(2) .nodesToUpgrade(UPGRADED_NODE) - .singleUpgrade(v3X) + .singleUpgradeToCurrentFrom(v3X) .withConfig(config -> config.with(NETWORK, GOSSIP)) .setup(cluster -> { cluster.schemaChange(CREATE_TABLE); diff --git a/test/distributed/org/apache/cassandra/distributed/upgrade/MixedModeReplicationTestBase.java b/test/distributed/org/apache/cassandra/distributed/upgrade/MixedModeReplicationTestBase.java deleted file mode 100644 index 3f2da7aab5c0..000000000000 --- a/test/distributed/org/apache/cassandra/distributed/upgrade/MixedModeReplicationTestBase.java +++ /dev/null @@ -1,89 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -package org.apache.cassandra.distributed.upgrade; - -import java.util.ArrayList; -import java.util.List; - -import com.vdurmont.semver4j.Semver; - -import org.apache.cassandra.distributed.api.ConsistencyLevel; - -import static org.apache.cassandra.distributed.shared.AssertUtils.assertRows; -import static org.apache.cassandra.distributed.shared.AssertUtils.row; - -/** - * A base class for testing basic replication on mixed-version clusters. - */ -public class MixedModeReplicationTestBase extends UpgradeTestBase -{ - protected void testSimpleStrategy(Semver from) throws Throwable - { - testSimpleStrategy(from, UpgradeTestBase.CURRENT); - } - - protected void testSimpleStrategy(Semver from, Semver to) throws Throwable - { - String insert = "INSERT INTO test_simple.names (key, name) VALUES (?, ?)"; - String select = "SELECT * FROM test_simple.names WHERE key = ?"; - - new TestCase() - .nodes(3) - .nodesToUpgrade(1, 2) - .upgrades(from, to) - .setup(cluster -> { - cluster.schemaChange("CREATE KEYSPACE test_simple WITH replication = {'class': 'SimpleStrategy', 'replication_factor': 2};"); - cluster.schemaChange("CREATE TABLE test_simple.names (key int PRIMARY KEY, name text)"); - }) - .runAfterNodeUpgrade((cluster, upgraded) -> { - List initialTokens = new ArrayList<>(cluster.size() + 1); - initialTokens.add(null); // The first valid token is at 1 to avoid offset math below. - - for (int i = 1; i <= cluster.size(); i++) - initialTokens.add(Long.valueOf(cluster.get(i).config().get("initial_token").toString())); - - List validTokens = initialTokens.subList(1, cluster.size() + 1); - - // Exercise all the coordinators... 
- for (int i = 1; i <= cluster.size(); i++) - { - // ...and sample enough keys that we cover the ring. - for (int j = 0; j < 10; j++) - { - int key = j + (i * 10); - Object[] row = row(key, "Nero"); - Long token = tokenFrom(key); - - cluster.coordinator(i).execute(insert, ConsistencyLevel.ALL, row); - - int node = primaryReplica(validTokens, token); - assertRows(cluster.get(node).executeInternal(select, key), row); - - node = nextNode(node, cluster.size()); - assertRows(cluster.get(node).executeInternal(select, key), row); - - // At RF=2, this node should not have received the write. - node = nextNode(node, cluster.size()); - assertRows(cluster.get(node).executeInternal(select, key)); - } - } - }) - .run(); - } -} diff --git a/test/distributed/org/apache/cassandra/distributed/upgrade/Pre40MessageFilterTest.java b/test/distributed/org/apache/cassandra/distributed/upgrade/Pre40MessageFilterTest.java index 4cca7b9aba3f..59e624ddb382 100644 --- a/test/distributed/org/apache/cassandra/distributed/upgrade/Pre40MessageFilterTest.java +++ b/test/distributed/org/apache/cassandra/distributed/upgrade/Pre40MessageFilterTest.java @@ -35,8 +35,8 @@ public void reserializePre40RequestPaxosTest(Consumer configCon .withConfig(configConsumer) .nodesToUpgrade(1) // all upgrades from v30 up, excluding v30->v3X - .singleUpgrade(v30) - .upgradesFrom(v3X) + .singleUpgradeToCurrentFrom(v30) + .upgradesToCurrentFrom(v3X) .setup((cluster) -> { cluster.filters().outbound().allVerbs().messagesMatching((f,t,m) -> false).drop(); cluster.schemaChange("CREATE TABLE " + KEYSPACE + ".tbl (pk int, ck int, v int, PRIMARY KEY (pk, ck))"); diff --git a/test/distributed/org/apache/cassandra/distributed/upgrade/UpgradeTest.java b/test/distributed/org/apache/cassandra/distributed/upgrade/UpgradeTest.java index 691d8af94f7e..55e1f1ea9441 100644 --- a/test/distributed/org/apache/cassandra/distributed/upgrade/UpgradeTest.java +++ b/test/distributed/org/apache/cassandra/distributed/upgrade/UpgradeTest.java @@ -22,7 +22,7 @@ import org.apache.cassandra.distributed.api.ConsistencyLevel; import org.apache.cassandra.distributed.api.Feature; -import org.apache.cassandra.distributed.shared.Versions; + import static org.apache.cassandra.distributed.shared.AssertUtils.*; public class UpgradeTest extends UpgradeTestBase @@ -34,7 +34,7 @@ public void simpleUpgradeWithNetworkAndGossipTest() throws Throwable .nodes(2) .nodesToUpgrade(1) .withConfig((cfg) -> cfg.with(Feature.NETWORK, Feature.GOSSIP)) - .upgradesFrom(v3X) + .upgradesToCurrentFrom(v3X) .setup((cluster) -> { cluster.schemaChange("CREATE TABLE " + KEYSPACE + ".tbl (pk int, ck int, v int, PRIMARY KEY (pk, ck))"); cluster.coordinator(1).execute("INSERT INTO " + KEYSPACE + ".tbl (pk, ck, v) VALUES (1, 1, 1)", ConsistencyLevel.ALL); diff --git a/test/distributed/org/apache/cassandra/distributed/upgrade/UpgradeTestBase.java b/test/distributed/org/apache/cassandra/distributed/upgrade/UpgradeTestBase.java index e530dbdc4505..e41444fe529b 100644 --- a/test/distributed/org/apache/cassandra/distributed/upgrade/UpgradeTestBase.java +++ b/test/distributed/org/apache/cassandra/distributed/upgrade/UpgradeTestBase.java @@ -19,17 +19,22 @@ package org.apache.cassandra.distributed.upgrade; import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collections; import java.util.HashSet; import java.util.LinkedHashSet; import java.util.List; +import java.util.NavigableSet; +import java.util.Objects; import java.util.Set; import java.util.function.Consumer; +import 
java.util.stream.Collectors; -import com.google.common.collect.ImmutableList; import com.vdurmont.semver4j.Semver; import com.vdurmont.semver4j.Semver.SemverType; import org.junit.After; +import org.junit.Assume; import org.junit.BeforeClass; import org.slf4j.Logger; @@ -44,10 +49,11 @@ import org.apache.cassandra.distributed.shared.ThrowingRunnable; import org.apache.cassandra.distributed.shared.Versions; import org.apache.cassandra.utils.ByteBufferUtil; -import org.apache.cassandra.utils.Pair; +import org.apache.cassandra.utils.SimpleGraph; import static org.apache.cassandra.distributed.shared.Versions.Version; import static org.apache.cassandra.distributed.shared.Versions.find; +import static org.apache.cassandra.utils.SimpleGraph.sortedVertices; public class UpgradeTestBase extends DistributedTestBase { @@ -56,8 +62,7 @@ public class UpgradeTestBase extends DistributedTestBase @After public void afterEach() { - System.runFinalization(); - System.gc(); + triggerGC(); } @BeforeClass @@ -88,30 +93,58 @@ public static interface RunOnClusterAndNode public static final Semver v41 = new Semver("4.1-alpha1", SemverType.LOOSE); public static final Semver v42 = new Semver("4.2-alpha1", SemverType.LOOSE); - protected static final List> SUPPORTED_UPGRADE_PATHS = ImmutableList.of( - Pair.create(v30, v3X), - Pair.create(v30, v40), - Pair.create(v30, v41), - Pair.create(v30, v42), - Pair.create(v3X, v40), - Pair.create(v3X, v41), - Pair.create(v3X, v42), - Pair.create(v40, v41), - Pair.create(v40, v42), - Pair.create(v41, v42)); + protected static final SimpleGraph SUPPORTED_UPGRADE_PATHS = new SimpleGraph.Builder() + .addEdge(v30, v3X) + .addEdge(v30, v40) + .addEdge(v30, v41) + .addEdge(v30, v42) + .addEdge(v3X, v40) + .addEdge(v3X, v41) + .addEdge(v3X, v42) + .addEdge(v40, v41) + .addEdge(v40, v42) + .addEdge(v41, v42) + .build(); // the last is always the current - public static final Semver CURRENT = SUPPORTED_UPGRADE_PATHS.get(SUPPORTED_UPGRADE_PATHS.size() - 1).right; + public static final Semver CURRENT = SimpleGraph.max(SUPPORTED_UPGRADE_PATHS); + public static final Semver OLDEST = SimpleGraph.min(SUPPORTED_UPGRADE_PATHS); public static class TestVersions { final Version initial; - final Version upgrade; + final List upgrade; + final List upgradeVersions; - public TestVersions(Version initial, Version upgrade) + public TestVersions(Version initial, List upgrade) { this.initial = initial; this.upgrade = upgrade; + this.upgradeVersions = upgrade.stream().map(v -> v.version).collect(Collectors.toList()); + } + + @Override + public boolean equals(Object o) + { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + TestVersions that = (TestVersions) o; + return Objects.equals(initial.version, that.initial.version) && Objects.equals(upgradeVersions, that.upgradeVersions); + } + + @Override + public int hashCode() + { + return Objects.hash(initial.version, upgradeVersions); + } + + @Override + public String toString() + { + StringBuilder sb = new StringBuilder(); + sb.append(initial.version).append(" -> "); + sb.append(upgradeVersions); + return sb.toString(); } } @@ -123,6 +156,7 @@ public static class TestCase implements ThrowingRunnable private RunOnCluster setup; private RunOnClusterAndNode runBeforeNodeRestart; private RunOnClusterAndNode runAfterNodeUpgrade; + private RunOnCluster runBeforeClusterUpgrade; private RunOnCluster runAfterClusterUpgrade; private final Set nodesToUpgrade = new LinkedHashSet<>(); private Consumer configConsumer; @@ -145,28 
+179,77 @@ public TestCase nodes(int nodeCount) } /** performs all supported upgrade paths that exist in between from and CURRENT (inclusive) **/ - public TestCase upgradesFrom(Semver from) + public TestCase upgradesToCurrentFrom(Semver from) + { + return upgradesTo(from, CURRENT); + } + + /** + * performs all supported upgrade paths to the "to" target; example + * {@code upgradesTo(3.0, 4.0); // produces: 3.0 -> 4.0, 3.11 -> 4.0} + */ + public TestCase upgradesTo(Semver from, Semver to) + { + List upgrade = new ArrayList<>(); + NavigableSet vertices = sortedVertices(SUPPORTED_UPGRADE_PATHS); + for (Semver start : vertices.subSet(from, true, to, false)) + { + // only include pairs that are allowed + if (SUPPORTED_UPGRADE_PATHS.hasEdge(start, to)) + upgrade.add(new TestVersions(versions.getLatest(start), Collections.singletonList(versions.getLatest(to)))); + } + logger.info("Adding upgrades of\n{}", upgrade.stream().map(TestVersions::toString).collect(Collectors.joining("\n"))); + this.upgrade.addAll(upgrade); + return this; + } + + /** + * performs all supported upgrade paths from the "from" target; example + * {@code upgradesFrom(4.0, 4.2); // produces: 4.0 -> 4.1, 4.0 -> 4.2} + */ + public TestCase upgradesFrom(Semver from, Semver to) { - return upgrades(from, CURRENT); + List upgrade = new ArrayList<>(); + NavigableSet vertices = sortedVertices(SUPPORTED_UPGRADE_PATHS); + for (Semver end : vertices.subSet(from, false, to, true)) + { + // only include pairs that are allowed + if (SUPPORTED_UPGRADE_PATHS.hasEdge(from, end)) + upgrade.add(new TestVersions(versions.getLatest(from), Collections.singletonList(versions.getLatest(end)))); + } + logger.info("Adding upgrades of\n{}", upgrade.stream().map(TestVersions::toString).collect(Collectors.joining("\n"))); + this.upgrade.addAll(upgrade); + return this; } - /** performs all supported upgrade paths that exist in between from and to (inclusive) **/ + /** + * performs all supported upgrade paths that exist between from and to and that include the current version. + * This call is equivalent to calling {@code upgradesTo(from, CURRENT).upgradesFrom(CURRENT, to)}.
+ **/ public TestCase upgrades(Semver from, Semver to) { - SUPPORTED_UPGRADE_PATHS.stream() - .filter(upgradePath -> (upgradePath.left.compareTo(from) >= 0 && upgradePath.right.compareTo(to) <= 0)) - .forEachOrdered(upgradePath -> - { - this.upgrade.add( - new TestVersions(versions.getLatest(upgradePath.left), versions.getLatest(upgradePath.right))); - }); + Assume.assumeTrue("Unable to do upgrades(" + from + ", " + to + "); does not contain CURRENT=" + CURRENT, contains(from, to, CURRENT)); + if (from.compareTo(CURRENT) < 0) + upgradesTo(from, CURRENT); + if (CURRENT.compareTo(to) < 0) + upgradesFrom(CURRENT, to); return this; } + private static boolean contains(Semver from, Semver to, Semver target) + { + // target >= from && target <= to + return target.compareTo(from) >= 0 && target.compareTo(to) <= 0; + } + /** Will test this specific upgrade path **/ - public TestCase singleUpgrade(Semver from) + public TestCase singleUpgradeToCurrentFrom(Semver from) { - this.upgrade.add(new TestVersions(versions.getLatest(from), versions.getLatest(CURRENT))); + if (!SUPPORTED_UPGRADE_PATHS.hasEdge(from, CURRENT)) + throw new AssertionError("Upgrading from " + from + " to " + CURRENT + " isn't directly supported and must go through other versions first; supported paths: " + SUPPORTED_UPGRADE_PATHS.findPaths(from, CURRENT)); + TestVersions tests = new TestVersions(this.versions.getLatest(from), Arrays.asList(this.versions.getLatest(CURRENT))); + logger.info("Adding upgrade of {}", tests); + this.upgrade.add(tests); return this; } @@ -188,6 +271,12 @@ public TestCase runAfterNodeUpgrade(RunOnClusterAndNode runAfterNodeUpgrade) return this; } + public TestCase runBeforeClusterUpgrade(RunOnCluster runBeforeClusterUpgrade) + { + this.runBeforeClusterUpgrade = runBeforeClusterUpgrade; + return this; + } + public TestCase runAfterClusterUpgrade(RunOnCluster runAfterClusterUpgrade) { this.runAfterClusterUpgrade = runAfterClusterUpgrade; @@ -216,6 +305,8 @@ public void run() throws Throwable throw new AssertionError(); if (runBeforeNodeRestart == null) runBeforeNodeRestart = (c, n) -> {}; + if (runBeforeClusterUpgrade == null) + runBeforeClusterUpgrade = (c) -> {}; if (runAfterClusterUpgrade == null) runAfterClusterUpgrade = (c) -> {}; if (runAfterNodeUpgrade == null) @@ -224,26 +315,44 @@ public void run() throws Throwable for (int n = 1; n <= nodeCount; n++) nodesToUpgrade.add(n); + int offset = 0; for (TestVersions upgrade : this.upgrade) { - logger.info("testing upgrade from {} to {}", upgrade.initial.version, upgrade.upgrade.version); + logger.info("testing upgrade from {} to {}", upgrade.initial.version, upgrade.upgradeVersions); try (UpgradeableCluster cluster = init(UpgradeableCluster.create(nodeCount, upgrade.initial, configConsumer, builderConsumer))) { setup.run(cluster); - for (int n : nodesToUpgrade) + for (Version nextVersion : upgrade.upgrade) { - cluster.get(n).shutdown().get(); - cluster.get(n).setVersion(upgrade.upgrade); - runBeforeNodeRestart.run(cluster, n); - cluster.get(n).startup(); - runAfterNodeUpgrade.run(cluster, n); + try + { + runBeforeClusterUpgrade.run(cluster); + + for (int n : nodesToUpgrade) + { + cluster.get(n).shutdown().get(); + triggerGC(); + cluster.get(n).setVersion(nextVersion); + runBeforeNodeRestart.run(cluster, n); + cluster.get(n).startup(); + runAfterNodeUpgrade.run(cluster, n); + } + + runAfterClusterUpgrade.run(cluster); + + cluster.checkAndResetUncaughtExceptions(); + } + catch (Throwable t) + { + throw new AssertionError(String.format("Error in test '%s' 
while upgrading to '%s'; successful upgrades %s", upgrade, nextVersion.version, this.upgrade.stream().limit(offset).collect(Collectors.toList())), t); + } } - - runAfterClusterUpgrade.run(cluster); } + offset++; } } + public TestCase nodesToUpgrade(int ... nodes) { Set set = new HashSet<>(nodes.length); @@ -265,10 +374,16 @@ public TestCase nodesToUpgradeOrdered(int ... nodes) } } + private static void triggerGC() + { + System.runFinalization(); + System.gc(); + } + protected TestCase allUpgrades(int nodes, int... toUpgrade) { return new TestCase().nodes(nodes) - .upgradesFrom(v30) + .upgradesToCurrentFrom(v30) .nodesToUpgrade(toUpgrade); } diff --git a/test/unit/org/apache/cassandra/utils/SimpleGraph.java b/test/unit/org/apache/cassandra/utils/SimpleGraph.java new file mode 100644 index 000000000000..71b1fb251504 --- /dev/null +++ b/test/unit/org/apache/cassandra/utils/SimpleGraph.java @@ -0,0 +1,126 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.cassandra.utils; + +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.NavigableSet; +import java.util.Set; +import java.util.TreeSet; +import java.util.function.Consumer; + +import com.google.common.collect.ImmutableMap; +import com.google.common.collect.ImmutableSet; +import com.google.common.collect.Ordering; + +/** + * A directed graph. Main usage is the {@link #findPaths(Object, Object)} method which is used to find all paths between + * 2 vertices. 
+ */ +public class SimpleGraph +{ + private final ImmutableMap> edges; + + private SimpleGraph(ImmutableMap> edges) + { + if (edges == null || edges.isEmpty()) + throw new AssertionError("Edges empty"); + this.edges = edges; + } + + public static > NavigableSet sortedVertices(SimpleGraph graph) + { + return new TreeSet<>(graph.vertices()); + } + + public static > T min(SimpleGraph graph) + { + return Ordering.natural().min(graph.vertices()); + } + + public static > T max(SimpleGraph graph) + { + return Ordering.natural().max(graph.vertices()); + } + + public boolean hasEdge(V a, V b) + { + ImmutableSet matches = edges.get(a); + return matches != null && matches.contains(b); + } + + public ImmutableSet vertices() + { + ImmutableSet.Builder b = ImmutableSet.builder(); + b.addAll(edges.keySet()); + edges.values().forEach(b::addAll); + return b.build(); + } + + public List> findPaths(V from, V to) + { + List> matches = new ArrayList<>(); + findPaths0(Collections.singletonList(from), from, to, matches::add); + return matches; + } + + private void findPaths0(List accum, V from, V to, Consumer> onMatch) + { + ImmutableSet check = edges.get(from); + if (check == null) + return; // no matches + for (V next : check) + { + if (accum.contains(next)) + return; // ignore walking recursive + List nextAccum = new ArrayList<>(accum); + nextAccum.add(next); + if (next.equals(to)) + { + onMatch.accept(nextAccum); + } + else + { + findPaths0(nextAccum, next, to, onMatch); + } + } + } + + public static class Builder + { + private final Map> edges = new HashMap<>(); + + public Builder addEdge(V from, V to) + { + edges.computeIfAbsent(from, ignore -> new HashSet<>()).add(to); + return this; + } + + public SimpleGraph build() + { + ImmutableMap.Builder> builder = ImmutableMap.builder(); + for (Map.Entry> e : edges.entrySet()) + builder.put(e.getKey(), ImmutableSet.copyOf(e.getValue())); + return new SimpleGraph(builder.build()); + } + } +} diff --git a/test/unit/org/apache/cassandra/utils/SimpleGraphTest.java b/test/unit/org/apache/cassandra/utils/SimpleGraphTest.java new file mode 100644 index 000000000000..6adee36889ff --- /dev/null +++ b/test/unit/org/apache/cassandra/utils/SimpleGraphTest.java @@ -0,0 +1,108 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.cassandra.utils; + +import java.util.Arrays; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +import org.junit.Test; + +import org.assertj.core.api.Assertions; + +import static org.assertj.core.api.Assertions.assertThat; + +public class SimpleGraphTest +{ + @Test + public void empty() + { + Assertions.assertThatThrownBy(() -> new SimpleGraph.Builder().build()) + .isInstanceOf(AssertionError.class) + .hasMessage("Edges empty"); + } + + /** + * If vertices have edges that form a circle this should not cause {@link SimpleGraph#findPaths(Object, Object)} to + * hang. + */ + @Test + public void recursive() + { + SimpleGraph graph = of("A", "B", + "B", "C", + "C", "A"); + // no paths to identity + assertThat(graph.findPaths("A", "A")).isEmpty(); + assertThat(graph.findPaths("B", "B")).isEmpty(); + assertThat(graph.findPaths("C", "C")).isEmpty(); + + assertThat(graph.findPaths("C", "B")).isEqualTo(Collections.singletonList(Arrays.asList("C", "A", "B"))); + + // all options return and don't have duplicate keys + for (String i : graph.vertices()) + { + for (String j : graph.vertices()) + { + List> paths = graph.findPaths(i, j); + for (List path : paths) + { + Map distinct = countDistinct(path); + for (Map.Entry e : distinct.entrySet()) + assertThat(e.getValue()).describedAs("Duplicate vertex %s found; %s", e.getKey(), path).isEqualTo(1); + } + } + } + } + + @Test + public void simple() + { + SimpleGraph graph = of("A", "B", + "B", "C", + "C", "D"); + + assertThat(graph.findPaths("A", "B")).isEqualTo(Collections.singletonList(Arrays.asList("A", "B"))); + assertThat(graph.findPaths("A", "C")).isEqualTo(Collections.singletonList(Arrays.asList("A", "B", "C"))); + assertThat(graph.findPaths("B", "D")).isEqualTo(Collections.singletonList(Arrays.asList("B", "C", "D"))); + + assertThat(graph.hasEdge("A", "B")).isTrue(); + assertThat(graph.hasEdge("C", "D")).isTrue(); + assertThat(graph.hasEdge("B", "A")).isFalse(); + assertThat(graph.hasEdge("C", "B")).isFalse(); + } + + private static Map countDistinct(List list) + { + Map map = new HashMap<>(); + for (T t : list) + map.compute(t, (ignore, accum) -> accum == null ? 1 : accum + 1); + return map; + } + + static SimpleGraph of(T... 
values) + { + assert values.length % 2 == 0: "graph requires even number of values, but given " + values.length; + SimpleGraph.Builder builder = new SimpleGraph.Builder<>(); + for (int i = 0; i < values.length; i = i + 2) + builder.addEdge(values[i], values[i + 1]); + return builder.build(); + } +} \ No newline at end of file From 99d034a2245c44becb6a730c77ad51ab9340f3a7 Mon Sep 17 00:00:00 2001 From: Yifan Cai Date: Mon, 6 Jun 2022 13:15:33 -0700 Subject: [PATCH 014/159] Option to disable CDC on SSTable repair patch by Yifan Cai; reviewed by Josh McKenzie for CASSANDRA-17666 --- CHANGES.txt | 1 + NEWS.txt | 6 ++ conf/cassandra.yaml | 12 +++ .../org/apache/cassandra/config/Config.java | 3 + .../cassandra/config/DatabaseDescriptor.java | 10 ++ .../cassandra/db/commitlog/CommitLog.java | 29 +++++- .../db/commitlog/CommitLogMBean.java | 6 ++ .../db/streaming/CassandraStreamReceiver.java | 25 +++-- .../cdc/ToggleCDCOnRepairEnabledTest.java | 97 +++++++++++++++++++ 9 files changed, 175 insertions(+), 14 deletions(-) create mode 100644 test/distributed/org/apache/cassandra/distributed/test/cdc/ToggleCDCOnRepairEnabledTest.java diff --git a/CHANGES.txt b/CHANGES.txt index d6b4ff5ab97e..9e31bd96e634 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -1,4 +1,5 @@ 4.2 + * Option to disable CDC writes of repaired data (CASSANDRA-17666) * When a node is bootstrapping it gets the whole gossip state but applies in random order causing some cases where StorageService will fail causing an instance to not show up in TokenMetadata (CASSANDRA-17676) * Add CQLSH command SHOW REPLICAS (CASSANDRA-17577) * Add guardrail to allow disabling of SimpleStrategy (CASSANDRA-17647) diff --git a/NEWS.txt b/NEWS.txt index 5a52c6e3bae2..996113d7c7eb 100644 --- a/NEWS.txt +++ b/NEWS.txt @@ -57,6 +57,12 @@ using the provided 'sstableupgrade' tool. New features ------------ + - Added a new configuration cdc_on_repair_enabled to toggle whether CDC mutations are replayed through the + write path on streaming, e.g. repair. When enabled, CDC data streamed to the destination node will be written into + the commit log first. When disabled, the streamed CDC data is written into SSTables just the same as normal streaming. + If this is set to false, streaming will be considerably faster; however, it's possible that, in extreme situations + (losing > quorum # nodes in a replica set), you may have data in your SSTables that never makes it to the CDC log. + The default is true/enabled. The configuration can be altered via JMX. - Added a new CQL function, maxwritetime. It shows the largest unix timestamp that the data was written, similar to its sibling CQL function, writetime. Unlike writetime, maxwritetime can be applied to multi-cell data types, e.g. non-frozen collections and UDT, and returns the largest timestamp. One should not use it when upgrading to 4.2. diff --git a/conf/cassandra.yaml b/conf/cassandra.yaml index 491740f012c6..3bab6712c820 100644 --- a/conf/cassandra.yaml +++ b/conf/cassandra.yaml @@ -298,6 +298,18 @@ partitioner: org.apache.cassandra.dht.Murmur3Partitioner # containing a CDC-enabled table if at space limit in cdc_raw_directory). cdc_enabled: false +# Specify whether writes to the CDC-enabled tables should be blocked when CDC data on disk has reached the limit. +# When set to false, the writes will not be blocked and the oldest CDC data on disk will be deleted to +# ensure the size constraint. The default is true.
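As the NEWS.txt entry above notes, cdc_on_repair_enabled can be altered via JMX; the CommitLogMBean additions later in this patch expose it, together with the cdc_block_writes switch documented in the adjacent yaml comments. A minimal client sketch follows. It is illustrative only: the CdcJmxToggle class name is made up, and the endpoint and the org.apache.cassandra.db:type=Commitlog object name are assumed conventional defaults, so adjust both for a real deployment.

import javax.management.JMX;
import javax.management.MBeanServerConnection;
import javax.management.ObjectName;
import javax.management.remote.JMXConnector;
import javax.management.remote.JMXConnectorFactory;
import javax.management.remote.JMXServiceURL;

import org.apache.cassandra.db.commitlog.CommitLogMBean;

public class CdcJmxToggle
{
    public static void main(String[] args) throws Exception
    {
        // Assumed default local JMX endpoint; adjust host and port for your node.
        JMXServiceURL url = new JMXServiceURL("service:jmx:rmi:///jndi/rmi://127.0.0.1:7199/jmxrmi");
        try (JMXConnector connector = JMXConnectorFactory.connect(url))
        {
            MBeanServerConnection mbs = connector.getMBeanServerConnection();
            // Object name assumed from the conventional commit log MBean registration.
            CommitLogMBean commitLog = JMX.newMBeanProxy(mbs,
                                                         new ObjectName("org.apache.cassandra.db:type=Commitlog"),
                                                         CommitLogMBean.class);

            System.out.println("cdc_block_writes = " + commitLog.getCDCBlockWrites());
            System.out.println("cdc_on_repair_enabled = " + commitLog.isCDCOnRepairEnabled());

            // Trades CDC completeness of streamed data for faster repair streaming;
            // throws IllegalStateException if CDC is not enabled on the node.
            commitLog.setCDCOnRepairEnabled(false);
        }
    }
}

Both switches can also be set statically in cassandra.yaml, as the surrounding stanzas show.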
+# cdc_block_writes: true + +# Specify whether CDC mutations are replayed through the write path on streaming, e.g. repair. +# When enabled, CDC data streamed to the destination node will be written into the commit log first. When set to false, +# the streamed CDC data is written into SSTables just the same as normal streaming. The default is true. +# If this is set to false, streaming will be considerably faster; however, it's possible that, in extreme situations +# (losing > quorum # nodes in a replica set), you may have data in your SSTables that never makes it to the CDC log. +# cdc_on_repair_enabled: true + # CommitLogSegments are moved to this directory on flush if cdc_enabled: true and the # segment contains mutations for a CDC-enabled table. This should be placed on a # separate spindle than the data directories. If not set, the default directory is diff --git a/src/java/org/apache/cassandra/config/Config.java b/src/java/org/apache/cassandra/config/Config.java index c3c5b3582cb4..3d2dbb7b40dc 100644 --- a/src/java/org/apache/cassandra/config/Config.java +++ b/src/java/org/apache/cassandra/config/Config.java @@ -380,6 +380,9 @@ public MemtableOptions() // When true, new CDC mutations are rejected/blocked when reaching max CDC storage. // When false, new CDC mutations can always be added. But it will remove the oldest CDC commit log segment on full. public volatile boolean cdc_block_writes = true; + // When true, CDC data in SSTables goes through the commit log during internode streaming, e.g. repair + // When false, it behaves the same as normal streaming. + public volatile boolean cdc_on_repair_enabled = true; public String cdc_raw_directory; @Replaces(oldName = "cdc_total_space_in_mb", converter = Converters.MEBIBYTES_DATA_STORAGE_INT, deprecated = true) public DataStorageSpec.IntMebibytesBound cdc_total_space = new DataStorageSpec.IntMebibytesBound("0MiB"); diff --git a/src/java/org/apache/cassandra/config/DatabaseDescriptor.java b/src/java/org/apache/cassandra/config/DatabaseDescriptor.java index 16b5c4b78df8..8151c968719e 100644 --- a/src/java/org/apache/cassandra/config/DatabaseDescriptor.java +++ b/src/java/org/apache/cassandra/config/DatabaseDescriptor.java @@ -3486,6 +3486,16 @@ public static void setCDCBlockWrites(boolean val) conf.cdc_block_writes = val; } + public static boolean isCDCOnRepairEnabled() + { + return conf.cdc_on_repair_enabled; + } + + public static void setCDCOnRepairEnabled(boolean val) + { + conf.cdc_on_repair_enabled = val; + } + public static String getCDCLogLocation() { return conf.cdc_raw_directory; diff --git a/src/java/org/apache/cassandra/db/commitlog/CommitLog.java b/src/java/org/apache/cassandra/db/commitlog/CommitLog.java index 37df1f9451d5..87426122fe76 100644 --- a/src/java/org/apache/cassandra/db/commitlog/CommitLog.java +++ b/src/java/org/apache/cassandra/db/commitlog/CommitLog.java @@ -429,11 +429,7 @@ public boolean getCDCBlockWrites() @Override public void setCDCBlockWrites(boolean val) { - Preconditions.checkState(DatabaseDescriptor.isCDCEnabled(), - "Unable to set block_writes (%s): CDC is not enabled.", val); - Preconditions.checkState(segmentManager instanceof CommitLogSegmentManagerCDC, - "CDC is enabled but we have the wrong CommitLogSegmentManager type: %s.
" + - "Please report this as bug.", segmentManager.getClass().getName()); + ensureCDCEnabled("Unable to set block_writes."); boolean oldVal = DatabaseDescriptor.getCDCBlockWrites(); CommitLogSegment currentSegment = segmentManager.allocatingFrom(); // Update the current segment CDC state to PERMITTED if block_writes is disabled now, and it was in FORBIDDEN state @@ -443,6 +439,29 @@ public void setCDCBlockWrites(boolean val) logger.info("Updated CDC block_writes from {} to {}", oldVal, val); } + + @Override + public boolean isCDCOnRepairEnabled() + { + return DatabaseDescriptor.isCDCOnRepairEnabled(); + } + + @Override + public void setCDCOnRepairEnabled(boolean value) + { + ensureCDCEnabled("Unable to set cdc_on_repair_enabled."); + DatabaseDescriptor.setCDCOnRepairEnabled(value); + logger.info("Set cdc_on_repair_enabled to {}", value); + } + + private void ensureCDCEnabled(String hint) + { + Preconditions.checkState(DatabaseDescriptor.isCDCEnabled(), "CDC is not enabled. %s", hint); + Preconditions.checkState(segmentManager instanceof CommitLogSegmentManagerCDC, + "CDC is enabled but we have the wrong CommitLogSegmentManager type: %s. " + + "Please report this as a bug.", segmentManager.getClass().getName()); + } + /** * Shuts down the threads used by the commit log, blocking until completion. * TODO this should accept a timeout, and throw TimeoutException diff --git a/src/java/org/apache/cassandra/db/commitlog/CommitLogMBean.java b/src/java/org/apache/cassandra/db/commitlog/CommitLogMBean.java index 7e8deca9b086..189916c66e52 100644 --- a/src/java/org/apache/cassandra/db/commitlog/CommitLogMBean.java +++ b/src/java/org/apache/cassandra/db/commitlog/CommitLogMBean.java @@ -88,4 +88,10 @@ public interface CommitLogMBean public boolean getCDCBlockWrites(); public void setCDCBlockWrites(boolean val); + + /** Returns true if internode streaming of CDC data should go through the write path */ boolean isCDCOnRepairEnabled(); + + /** Set whether to enable the write path for CDC data during internode streaming, e.g.
repair */ + void setCDCOnRepairEnabled(boolean value); } diff --git a/src/java/org/apache/cassandra/db/streaming/CassandraStreamReceiver.java b/src/java/org/apache/cassandra/db/streaming/CassandraStreamReceiver.java index 48de8b54fce8..b5963978c0aa 100644 --- a/src/java/org/apache/cassandra/db/streaming/CassandraStreamReceiver.java +++ b/src/java/org/apache/cassandra/db/streaming/CassandraStreamReceiver.java @@ -25,18 +25,16 @@ import com.google.common.base.Preconditions; import com.google.common.collect.Iterables; - -import org.apache.cassandra.db.lifecycle.LifecycleNewTracker; -import org.apache.cassandra.io.sstable.SSTable; - import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import org.apache.cassandra.config.DatabaseDescriptor; import org.apache.cassandra.db.ColumnFamilyStore; import org.apache.cassandra.db.Keyspace; import org.apache.cassandra.db.Mutation; import org.apache.cassandra.db.compaction.OperationType; import org.apache.cassandra.db.filter.ColumnFilter; +import org.apache.cassandra.db.lifecycle.LifecycleNewTracker; import org.apache.cassandra.db.lifecycle.LifecycleTransaction; import org.apache.cassandra.db.partitions.PartitionUpdate; import org.apache.cassandra.db.rows.ThrottledUnfilteredIterator; @@ -45,6 +43,7 @@ import org.apache.cassandra.dht.Bounds; import org.apache.cassandra.dht.Token; import org.apache.cassandra.io.sstable.ISSTableScanner; +import org.apache.cassandra.io.sstable.SSTable; import org.apache.cassandra.io.sstable.SSTableMultiWriter; import org.apache.cassandra.io.sstable.format.SSTableReader; import org.apache.cassandra.streaming.IncomingStream; @@ -172,23 +171,31 @@ private boolean hasCDC(ColumnFamilyStore cfs) return cfs.metadata().params.cdc; } + // returns true iff it is a CDC table and CDC on repair is enabled. + private boolean cdcRequiresWriteCommitLog(ColumnFamilyStore cfs) + { + return DatabaseDescriptor.isCDCOnRepairEnabled() && hasCDC(cfs); + } + /* * We have a special path for views and for CDC. * * For views, since the view requires cleaning up any pre-existing state, we must put all partitions * through the same write path as normal mutations. This also ensures any 2is are also updated. * - * For CDC-enabled tables, we want to ensure that the mutations are run through the CommitLog so they - * can be archived by the CDC process on discard. + * For CDC-enabled tables, when the CDC write path is enabled, we want to ensure that the mutations are + * run through the CommitLog, so they can be archived by the CDC process on discard.
*/ private boolean requiresWritePath(ColumnFamilyStore cfs) { - return hasCDC(cfs) || cfs.streamToMemtable() || (session.streamOperation().requiresViewBuild() && hasViews(cfs)); + return cdcRequiresWriteCommitLog(cfs) + || cfs.streamToMemtable() + || (session.streamOperation().requiresViewBuild() && hasViews(cfs)); } private void sendThroughWritePath(ColumnFamilyStore cfs, Collection readers) { - boolean hasCdc = hasCDC(cfs); + boolean writeCDCCommitLog = cdcRequiresWriteCommitLog(cfs); ColumnFilter filter = ColumnFilter.all(cfs.metadata()); for (SSTableReader reader : readers) { @@ -206,7 +213,7 @@ private void sendThroughWritePath(ColumnFamilyStore cfs, Collection { + cluster.get(2).runOnInstance(() -> { + boolean containCDCInLog = CommitLog.instance.segmentManager + .getActiveSegments() + .stream() + .anyMatch(s -> s.getCDCState() == CommitLogSegment.CDCState.CONTAINS); + assertTrue("Mutation should be added to commit log when cdc_on_repair_enabled is true", + containCDCInLog); + }); + }); + } + + @Test + public void testCDCOnRepairIsDisabled() throws Exception + { + testCDCOnRepairEnabled(false, cluster -> { + cluster.get(2).runOnInstance(() -> { + boolean containCDCInLog = CommitLog.instance.segmentManager + .getActiveSegments() + .stream() + .allMatch(s -> s.getCDCState() != CommitLogSegment.CDCState.CONTAINS); + assertTrue("No mutation should be added to commit log when cdc_on_repair_enabled is false", + containCDCInLog); + }); + }); + } + + // test helper to repair data between nodes when cdc_on_repair_enabled is on or off. + private void testCDCOnRepairEnabled(boolean enabled, Consumer assertion) throws Exception + { + try (Cluster cluster = init(Cluster.build(2) + .withConfig(c -> c.set("cdc_enabled", true) + .set("cdc_on_repair_enabled", enabled) + .with(Feature.NETWORK) + .with(Feature.GOSSIP)) + .start())) + { + cluster.schemaChange(withKeyspace("CREATE TABLE %s.tbl (k INT PRIMARY KEY, v INT) WITH cdc=true")); + + // Data only in node1 + cluster.get(1).executeInternal(withKeyspace("INSERT INTO %s.tbl (k, v) VALUES (1, 1)")); + Object[][] result = cluster.get(1).executeInternal(withKeyspace("SELECT * FROM %s.tbl WHERE k = 1")); + assertRows(result, row(1, 1)); + result = cluster.get(2).executeInternal(withKeyspace("SELECT * FROM %s.tbl WHERE k = 1")); + assertRows(result); + + // repair + cluster.get(1).flush(KEYSPACE); + cluster.get(2).nodetool("repair", KEYSPACE, "tbl"); + + // verify node2 now has data + result = cluster.get(2).executeInternal(withKeyspace("SELECT * FROM %s.tbl WHERE k = 1")); + assertRows(result, row(1, 1)); + + assertion.accept(cluster); + } + } +} From 557b8e9982ad0964191abde810ef5c77a536f70a Mon Sep 17 00:00:00 2001 From: Jyothsna Konisa Date: Mon, 13 Jun 2022 11:05:22 -0700 Subject: [PATCH 015/159] Adding support to perform certificate based internode authentication patch by Jyothsna Konisa; reviewed by Jon Meredith, Yifan Cai for CASSANDRA-17661 --- CHANGES.txt | 1 + .../auth/AllowAllInternodeAuthenticator.java | 4 +- .../auth/IInternodeAuthenticator.java | 50 ++- .../net/InboundConnectionInitiator.java | 100 ++++-- .../net/InternodeConnectionUtils.java | 83 +++++ .../cassandra/net/MessagingService.java | 3 + .../net/OutboundConnectionInitiator.java | 60 +++- .../cassandra/service/StorageService.java | 2 +- .../InternodeEncryptionEnforcementTest.java | 286 ++++++++++++++++++ .../org/apache/cassandra/SchemaLoader.java | 1 + .../cassandra/net/MessagingServiceTest.java | 91 ++++-- 11 files changed, 616 insertions(+), 65 deletions(-) create mode 100644
src/java/org/apache/cassandra/net/InternodeConnectionUtils.java diff --git a/CHANGES.txt b/CHANGES.txt index 57733a543884..8e8305aed042 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -1,4 +1,5 @@ 4.2 + * Adding support to perform certificate based internode authentication (CASSANDRA-17661) * Option to disable CDC writes of repaired data (CASSANDRA-17666) * When a node is bootstrapping it gets the whole gossip state but applies in random order causing some cases where StorageService will fail causing an instance to not show up in TokenMetadata (CASSANDRA-17676) * Add CQLSH command SHOW REPLICAS (CASSANDRA-17577) diff --git a/src/java/org/apache/cassandra/auth/AllowAllInternodeAuthenticator.java b/src/java/org/apache/cassandra/auth/AllowAllInternodeAuthenticator.java index d0d2d745d778..ac62bfae004d 100644 --- a/src/java/org/apache/cassandra/auth/AllowAllInternodeAuthenticator.java +++ b/src/java/org/apache/cassandra/auth/AllowAllInternodeAuthenticator.java @@ -20,12 +20,14 @@ package org.apache.cassandra.auth; import java.net.InetAddress; +import java.security.cert.Certificate; import org.apache.cassandra.exceptions.ConfigurationException; public class AllowAllInternodeAuthenticator implements IInternodeAuthenticator { - public boolean authenticate(InetAddress remoteAddress, int remotePort) + public boolean authenticate(InetAddress remoteAddress, int remotePort, + Certificate[] certificates, InternodeConnectionDirection connectionType) { return true; } diff --git a/src/java/org/apache/cassandra/auth/IInternodeAuthenticator.java b/src/java/org/apache/cassandra/auth/IInternodeAuthenticator.java index 8e09b9035f01..02745fe925b2 100644 --- a/src/java/org/apache/cassandra/auth/IInternodeAuthenticator.java +++ b/src/java/org/apache/cassandra/auth/IInternodeAuthenticator.java @@ -20,6 +20,7 @@ package org.apache.cassandra.auth; import java.net.InetAddress; +import java.security.cert.Certificate; import org.apache.cassandra.exceptions.ConfigurationException; @@ -33,7 +34,35 @@ public interface IInternodeAuthenticator * @param remotePort port of the connecting node. * @return true if the connection should be accepted, false otherwise. */ - boolean authenticate(InetAddress remoteAddress, int remotePort); + @Deprecated + default boolean authenticate(InetAddress remoteAddress, int remotePort) + { + return false; + } + + /** + * Decides whether a peer is allowed to connect to this node. + * If this method returns false, the socket will be immediately closed. + *

+ * The default implementation delegates to the deprecated authenticate method that takes only an IP address and port.
+ *

+ * 1. If it is IP based authentication ignore the certificates & connectionType parameters in the implementation + * of this method. + * 2. For certificate based authentication like mTLS, server's identity for outbound connections is verified by the + * trusted root certificates in the outbound_keystore. In such cases this method may be overridden to return true + * when certificateType is OUTBOUND, as the authentication of the server happens during SSL Handshake. + * + * @param remoteAddress ip address of the connecting node. + * @param remotePort port of the connecting node. + * @param certificates peer certificates + * @param connectionType If the connection is inbound/outbound connection. + * @return true if the connection should be accepted, false otherwise. + */ + default boolean authenticate(InetAddress remoteAddress, int remotePort, + Certificate[] certificates, InternodeConnectionDirection connectionType) + { + return authenticate(remoteAddress, remotePort); + } /** * Validates configuration of IInternodeAuthenticator implementation (if configurable). @@ -41,4 +70,23 @@ public interface IInternodeAuthenticator * @throws ConfigurationException when there is a configuration error. */ void validateConfiguration() throws ConfigurationException; + + /** + * Setup is called once upon system startup to initialize the IAuthenticator. + * + * For example, use this method to create any required keyspaces/column families. + */ + default void setupInternode() + { + + } + + /** + * Enum that represents connection type of an internode connection. + */ + enum InternodeConnectionDirection + { + INBOUND, + OUTBOUND + } } diff --git a/src/java/org/apache/cassandra/net/InboundConnectionInitiator.java b/src/java/org/apache/cassandra/net/InboundConnectionInitiator.java index c5ed0642594b..f3dc28a307fe 100644 --- a/src/java/org/apache/cassandra/net/InboundConnectionInitiator.java +++ b/src/java/org/apache/cassandra/net/InboundConnectionInitiator.java @@ -20,6 +20,7 @@ import java.io.IOException; import java.net.InetSocketAddress; import java.net.SocketAddress; +import java.security.cert.Certificate; import java.util.List; import java.util.NoSuchElementException; import java.util.concurrent.Future; @@ -46,6 +47,7 @@ import io.netty.handler.logging.LoggingHandler; import io.netty.handler.ssl.SslContext; import io.netty.handler.ssl.SslHandler; +import org.apache.cassandra.auth.IInternodeAuthenticator; import org.apache.cassandra.config.DatabaseDescriptor; import org.apache.cassandra.config.EncryptionOptions; import org.apache.cassandra.exceptions.ConfigurationException; @@ -60,7 +62,11 @@ import static java.lang.Math.*; import static java.util.concurrent.TimeUnit.MILLISECONDS; +import static org.apache.cassandra.auth.IInternodeAuthenticator.InternodeConnectionDirection.INBOUND; import static org.apache.cassandra.concurrent.ExecutorFactory.Global.executorFactory; +import static org.apache.cassandra.net.InternodeConnectionUtils.DISCARD_HANDLER_NAME; +import static org.apache.cassandra.net.InternodeConnectionUtils.SSL_HANDLER_NAME; +import static org.apache.cassandra.net.InternodeConnectionUtils.certificates; import static org.apache.cassandra.net.MessagingService.*; import static org.apache.cassandra.net.SocketFactory.WIRETRACE; import static org.apache.cassandra.net.SocketFactory.newSslHandler; @@ -102,7 +108,7 @@ public void initChannel(SocketChannel channel) throws Exception pipelineInjector.accept(pipeline); - // order of handlers: ssl -> logger -> handshakeHandler + // order of handlers: ssl -> 
client-authentication -> logger -> handshakeHandler // For either unencrypted or transitional modes, allow Ssl optionally. switch(settings.encryption.tlsEncryptionPolicy()) { @@ -111,14 +117,17 @@ public void initChannel(SocketChannel channel) throws Exception pipeline.addAfter(PIPELINE_INTERNODE_ERROR_EXCLUSIONS, "rejectssl", new RejectSslHandler()); break; case OPTIONAL: - pipeline.addAfter(PIPELINE_INTERNODE_ERROR_EXCLUSIONS, "ssl", new OptionalSslHandler(settings.encryption)); + pipeline.addAfter(PIPELINE_INTERNODE_ERROR_EXCLUSIONS, SSL_HANDLER_NAME, new OptionalSslHandler(settings.encryption)); break; case ENCRYPTED: SslHandler sslHandler = getSslHandler("creating", channel, settings.encryption); - pipeline.addAfter(PIPELINE_INTERNODE_ERROR_EXCLUSIONS, "ssl", sslHandler); + pipeline.addAfter(PIPELINE_INTERNODE_ERROR_EXCLUSIONS, SSL_HANDLER_NAME, sslHandler); break; } + // Pipeline for performing client authentication + pipeline.addLast("client-authentication", new ClientAuthenticationHandler(settings.authenticator)); + if (WIRETRACE) pipeline.addLast("logger", new LoggingHandler(LogLevel.INFO)); @@ -198,6 +207,61 @@ public static ChannelFuture bind(InboundConnectionSettings settings, ChannelGrou return bind(new Initializer(settings, channelGroup, pipelineInjector)); } + /** + * Handler to perform authentication for internode inbound connections. + * This handler is called even before messaging handshake starts. + */ + private static class ClientAuthenticationHandler extends ByteToMessageDecoder + { + private final IInternodeAuthenticator authenticator; + + public ClientAuthenticationHandler(IInternodeAuthenticator authenticator) + { + this.authenticator = authenticator; + } + + @Override + protected void decode(ChannelHandlerContext channelHandlerContext, ByteBuf byteBuf, List list) throws Exception + { + // Extract certificates from SSL handler(handler with name "ssl"). + final Certificate[] certificates = certificates(channelHandlerContext.channel()); + if (!authenticate(channelHandlerContext.channel().remoteAddress(), certificates)) + { + logger.error("Unable to authenticate peer {} for internode authentication", channelHandlerContext.channel()); + + // To release all the pending buffered data, replace authentication handler with discard handler. + // This avoids pending inbound data to be fired through the pipeline + channelHandlerContext.pipeline().replace(this, DISCARD_HANDLER_NAME, new InternodeConnectionUtils.ByteBufDiscardHandler()); + channelHandlerContext.pipeline().close(); + } + else + { + channelHandlerContext.pipeline().remove(this); + } + } + + private boolean authenticate(SocketAddress socketAddress, final Certificate[] certificates) throws IOException + { + if (socketAddress.getClass().getSimpleName().equals("EmbeddedSocketAddress")) + return true; + + if (!(socketAddress instanceof InetSocketAddress)) + throw new IOException(String.format("Unexpected SocketAddress type: %s, %s", socketAddress.getClass(), socketAddress)); + + InetSocketAddress addr = (InetSocketAddress) socketAddress; + if (!authenticator.authenticate(addr.getAddress(), addr.getPort(), certificates, INBOUND)) + { + // Log at info level as anything that can reach the inbound port could hit this + // and trigger a log of noise. Failed outbound connections to known cluster endpoints + // still fail with an ERROR message and exception to alert operators that aren't watching logs closely. 
+ logger.info("Authenticate rejected inbound internode connection from {}", addr); + return false; + } + return true; + } + + } + /** * 'Server-side' component that negotiates the internode handshake when establishing a new connection. * This handler will be the first in the netty channel for each incoming connection (secure socket (TLS) notwithstanding), @@ -223,8 +287,7 @@ static class Handler extends ByteToMessageDecoder } /** - * On registration, immediately schedule a timeout to kill this connection if it does not handshake promptly, - * and authenticate the remote address. + * On registration, immediately schedule a timeout to kill this connection if it does not handshake promptly. */ public void handlerAdded(ChannelHandlerContext ctx) throws Exception { @@ -232,31 +295,6 @@ public void handlerAdded(ChannelHandlerContext ctx) throws Exception logger.error("Timeout handshaking with {} (on {})", SocketFactory.addressId(initiate.from, (InetSocketAddress) ctx.channel().remoteAddress()), settings.bindAddress); failHandshake(ctx); }, HandshakeProtocol.TIMEOUT_MILLIS, MILLISECONDS); - - if (!authenticate(ctx.channel().remoteAddress())) - { - failHandshake(ctx); - } - } - - private boolean authenticate(SocketAddress socketAddress) throws IOException - { - if (socketAddress.getClass().getSimpleName().equals("EmbeddedSocketAddress")) - return true; - - if (!(socketAddress instanceof InetSocketAddress)) - throw new IOException(String.format("Unexpected SocketAddress type: %s, %s", socketAddress.getClass(), socketAddress)); - - InetSocketAddress addr = (InetSocketAddress)socketAddress; - if (!settings.authenticate(addr.getAddress(), addr.getPort())) - { - // Log at info level as anything that can reach the inbound port could hit this - // and trigger a log of noise. Failed outbound connections to known cluster endpoints - // still fail with an ERROR message and exception to alert operators that aren't watching logs closely. - logger.info("Authenticate rejected inbound internode connection from {}", addr); - return false; - } - return true; } @Override @@ -562,7 +600,7 @@ protected void decode(ChannelHandlerContext ctx, ByteBuf in, List out) t { // Connection uses SSL/TLS, replace the detection handler with a SslHandler and so use encryption. SslHandler sslHandler = getSslHandler("replacing optional", ctx.channel(), encryptionOptions); - ctx.pipeline().replace(this, "ssl", sslHandler); + ctx.pipeline().replace(this, SSL_HANDLER_NAME, sslHandler); } else { diff --git a/src/java/org/apache/cassandra/net/InternodeConnectionUtils.java b/src/java/org/apache/cassandra/net/InternodeConnectionUtils.java new file mode 100644 index 000000000000..39a087960b31 --- /dev/null +++ b/src/java/org/apache/cassandra/net/InternodeConnectionUtils.java @@ -0,0 +1,83 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.net; + +import java.security.cert.Certificate; +import javax.net.ssl.SSLPeerUnverifiedException; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import io.netty.buffer.ByteBuf; +import io.netty.channel.Channel; +import io.netty.channel.ChannelHandlerContext; +import io.netty.channel.ChannelInboundHandlerAdapter; +import io.netty.handler.ssl.SslHandler; + +/** + * Class that contains certificate utility methods. + */ +class InternodeConnectionUtils +{ + public static String SSL_HANDLER_NAME = "ssl"; + public static String DISCARD_HANDLER_NAME = "discard"; + private static final Logger logger = LoggerFactory.getLogger(InternodeConnectionUtils.class); + + public static Certificate[] certificates(Channel channel) + { + final SslHandler sslHandler = (SslHandler) channel.pipeline().get(SSL_HANDLER_NAME); + Certificate[] certificates = null; + if (sslHandler != null) + { + try + { + certificates = sslHandler.engine() + .getSession() + .getPeerCertificates(); + } + catch (SSLPeerUnverifiedException e) + { + logger.debug("Failed to get peer certificates for peer {}", channel.remoteAddress(), e); + } + } + return certificates; + } + + /** + * Discard handler releases the received data silently. when internode authentication fails, the channel is closed, + * but the pending buffered data may still be fired through the pipeline. To avoid that, authentication handler is + * replaced with this DiscardHandler to release all the buffered data, to avoid handling unauthenticated data in the + * following handlers. + */ + public static class ByteBufDiscardHandler extends ChannelInboundHandlerAdapter + { + @Override + public void channelRead(ChannelHandlerContext ctx, Object msg) + { + if (msg instanceof ByteBuf) + { + ((ByteBuf) msg).release(); + } + else + { + ctx.fireChannelRead(msg); + } + } + } +} diff --git a/src/java/org/apache/cassandra/net/MessagingService.java b/src/java/org/apache/cassandra/net/MessagingService.java index ea019fd8fb64..d968a0ce2b3b 100644 --- a/src/java/org/apache/cassandra/net/MessagingService.java +++ b/src/java/org/apache/cassandra/net/MessagingService.java @@ -475,7 +475,10 @@ public void interruptOutbound(InetAddressAndPort to) { OutboundConnections pool = channelManagers.get(to); if (pool != null) + { pool.interrupt(); + logger.info("Interrupted outbound connections to {}", to); + } } /** diff --git a/src/java/org/apache/cassandra/net/OutboundConnectionInitiator.java b/src/java/org/apache/cassandra/net/OutboundConnectionInitiator.java index a187068ce752..9565f54846c7 100644 --- a/src/java/org/apache/cassandra/net/OutboundConnectionInitiator.java +++ b/src/java/org/apache/cassandra/net/OutboundConnectionInitiator.java @@ -21,13 +21,16 @@ import java.io.IOException; import java.net.InetSocketAddress; import java.nio.channels.ClosedChannelException; +import java.security.cert.Certificate; import java.util.List; import java.util.concurrent.atomic.AtomicBoolean; +import com.google.common.annotations.VisibleForTesting; + import io.netty.util.concurrent.Future; //checkstyle: permit this import import io.netty.util.concurrent.Promise; //checkstyle: permit this import import org.apache.cassandra.utils.concurrent.AsyncPromise; -import org.apache.cassandra.utils.concurrent.ImmediateFuture; + import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -59,6 +62,10 @@ import org.apache.cassandra.utils.memory.BufferPools; import 
static java.util.concurrent.TimeUnit.*; +import static org.apache.cassandra.auth.IInternodeAuthenticator.InternodeConnectionDirection.OUTBOUND; +import static org.apache.cassandra.net.InternodeConnectionUtils.DISCARD_HANDLER_NAME; +import static org.apache.cassandra.net.InternodeConnectionUtils.SSL_HANDLER_NAME; +import static org.apache.cassandra.net.InternodeConnectionUtils.certificates; import static org.apache.cassandra.net.MessagingService.VERSION_40; import static org.apache.cassandra.net.HandshakeProtocol.*; import static org.apache.cassandra.net.ConnectionType.STREAMING; @@ -130,13 +137,6 @@ private Future> initiate(EventLoop eventLoop) if (logger.isTraceEnabled()) logger.trace("creating outbound bootstrap to {}, requestVersion: {}", settings, requestMessagingVersion); - if (!settings.authenticate()) - { - // interrupt other connections, so they must attempt to re-authenticate - MessagingService.instance().interruptOutbound(settings.to); - return ImmediateFuture.failure(new IOException("authentication failed to " + settings.connectToId())); - } - // this is a bit ugly, but is the easiest way to ensure that if we timeout we can propagate a suitable error message // and still guarantee that, if on timing out we raced with success, the successfully created channel is handled AtomicBoolean timedout = new AtomicBoolean(); @@ -198,7 +198,7 @@ public void initChannel(SocketChannel channel) throws Exception { ChannelPipeline pipeline = channel.pipeline(); - // order of handlers: ssl -> logger -> handshakeHandler + // order of handlers: ssl -> server-authentication -> logger -> handshakeHandler if (settings.withEncryption()) { // check if we should actually encrypt this connection @@ -209,8 +209,9 @@ public void initChannel(SocketChannel channel) throws Exception InetSocketAddress peer = settings.encryption.require_endpoint_verification ? new InetSocketAddress(address.getAddress(), address.getPort()) : null; SslHandler sslHandler = newSslHandler(channel, sslContext, peer); logger.trace("creating outbound netty SslContext: context={}, engine={}", sslContext.getClass().getName(), sslHandler.engine().getClass().getName()); - pipeline.addFirst("ssl", sslHandler); + pipeline.addFirst(SSL_HANDLER_NAME, sslHandler); } + pipeline.addLast("server-authentication", new ServerAuthenticationHandler(settings)); if (WIRETRACE) pipeline.addLast("logger", new LoggingHandler(LogLevel.INFO)); @@ -220,6 +221,45 @@ public void initChannel(SocketChannel channel) throws Exception } + /** + * Authenticates the server before an outbound connection is established. If a connection is SSL based connection + * Server's identity is verified during ssl handshake using root certificate in truststore. One may choose to ignore + * outbound authentication or perform required authentication for outbound connections in the implementation + * of IInternodeAuthenticator interface. + */ + @VisibleForTesting + static class ServerAuthenticationHandler extends ByteToMessageDecoder + { + final OutboundConnectionSettings settings; + + ServerAuthenticationHandler(OutboundConnectionSettings settings) + { + this.settings = settings; + } + + @Override + protected void decode(ChannelHandlerContext channelHandlerContext, ByteBuf byteBuf, List list) throws Exception + { + // Extract certificates from SSL handler(handler with name "ssl"). 
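+ // For plain (non-SSL) connections, or when the peer could not be verified,
+ // the extracted certificate array is null; the IInternodeAuthenticator
+ // implementation decides whether a null certificate chain is acceptable.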
+ final Certificate[] certificates = certificates(channelHandlerContext.channel()); + if (!settings.authenticator.authenticate(settings.to.getAddress(), settings.to.getPort(), certificates, OUTBOUND)) + { + // interrupt other connections, so they must attempt to re-authenticate + MessagingService.instance().interruptOutbound(settings.to); + logger.error("authentication failed to " + settings.connectToId()); + + // To release all the pending buffered data, replace authentication handler with discard handler. + // This avoids pending inbound data to be fired through the pipeline + channelHandlerContext.pipeline().replace(this, DISCARD_HANDLER_NAME, new InternodeConnectionUtils.ByteBufDiscardHandler()); + channelHandlerContext.pipeline().close(); + } + else + { + channelHandlerContext.pipeline().remove(this); + } + } + } + private class Handler extends ByteToMessageDecoder { /** diff --git a/src/java/org/apache/cassandra/service/StorageService.java b/src/java/org/apache/cassandra/service/StorageService.java index 34fb6acb1743..0ba43a737386 100644 --- a/src/java/org/apache/cassandra/service/StorageService.java +++ b/src/java/org/apache/cassandra/service/StorageService.java @@ -32,7 +32,6 @@ import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.CopyOnWriteArrayList; import java.util.concurrent.ExecutionException; -import java.util.concurrent.ExecutorService; import java.util.concurrent.ThreadLocalRandom; import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeoutException; @@ -1238,6 +1237,7 @@ public void doAuthSetup(boolean setUpSchema) DatabaseDescriptor.getRoleManager().setup(); DatabaseDescriptor.getAuthenticator().setup(); + DatabaseDescriptor.getInternodeAuthenticator().setupInternode(); DatabaseDescriptor.getAuthorizer().setup(); DatabaseDescriptor.getNetworkAuthorizer().setup(); AuthCacheService.initializeAndRegisterCaches(); diff --git a/test/distributed/org/apache/cassandra/distributed/test/InternodeEncryptionEnforcementTest.java b/test/distributed/org/apache/cassandra/distributed/test/InternodeEncryptionEnforcementTest.java index 969f37245614..157aede9b7a4 100644 --- a/test/distributed/org/apache/cassandra/distributed/test/InternodeEncryptionEnforcementTest.java +++ b/test/distributed/org/apache/cassandra/distributed/test/InternodeEncryptionEnforcementTest.java @@ -17,15 +17,29 @@ */ package org.apache.cassandra.distributed.test; +import java.io.FileInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.net.InetAddress; +import java.security.KeyStore; +import java.security.cert.Certificate; import java.util.HashMap; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.TimeoutException; +import java.util.concurrent.atomic.AtomicInteger; import com.google.common.collect.ImmutableMap; import org.junit.Test; +import org.apache.cassandra.auth.AllowAllInternodeAuthenticator; +import org.apache.cassandra.auth.IInternodeAuthenticator; +import org.apache.cassandra.config.DatabaseDescriptor; import org.apache.cassandra.distributed.Cluster; import org.apache.cassandra.distributed.api.Feature; import org.apache.cassandra.distributed.api.IIsolatedExecutor.SerializableRunnable; import org.apache.cassandra.distributed.shared.NetworkTopology; +import org.apache.cassandra.exceptions.ConfigurationException; import org.apache.cassandra.net.InboundMessageHandlers; import org.apache.cassandra.net.MessagingService; import org.apache.cassandra.net.OutboundConnections; @@ -40,6 
+54,131 @@ public final class InternodeEncryptionEnforcementTest extends TestBaseImpl { + + @Test + public void testInboundConnectionsAreRejectedWhenAuthFails() throws IOException, TimeoutException + { + Cluster.Builder builder = createCluster(RejectInboundConnections.class); + + final ExecutorService executorService = Executors.newSingleThreadExecutor(); + try (Cluster cluster = builder.start()) + { + executorService.submit(() -> openConnections(cluster)); + + /* + * instance (1) should not connect to instance (2) as authentication fails; + * instance (2) should not connect to instance (1) as authentication fails. + */ + SerializableRunnable runnable = () -> + { + // There should be no inbound handlers as authentication fails and we remove handlers. + assertEquals(0, MessagingService.instance().messageHandlers.values().size()); + + // There should be no outbound connections as authentication fails. + OutboundConnections outbound = getOnlyElement(MessagingService.instance().channelManagers.values()); + assertTrue(!outbound.small.isConnected() && !outbound.large.isConnected() && !outbound.urgent.isConnected()); + + // Verify that the failure is due to authentication failure + final RejectInboundConnections authenticator = (RejectInboundConnections) DatabaseDescriptor.getInternodeAuthenticator(); + assertTrue(authenticator.authenticationFailed); + }; + + // Wait for authentication to fail + cluster.get(1).logs().watchFor("Unable to authenticate peer"); + cluster.get(1).runOnInstance(runnable); + cluster.get(2).logs().watchFor("Unable to authenticate peer"); + cluster.get(2).runOnInstance(runnable); + } + executorService.shutdown(); + } + + @Test + public void testOutboundConnectionsAreRejectedWhenAuthFails() throws IOException, TimeoutException + { + Cluster.Builder builder = createCluster(RejectOutboundAuthenticator.class); + + final ExecutorService executorService = Executors.newSingleThreadExecutor(); + try (Cluster cluster = builder.start()) + { + executorService.submit(() -> openConnections(cluster)); + + /* + * instance (1) should not connect to instance (2) as authentication fails; + * instance (2) should not connect to instance (1) as authentication fails. + */ + SerializableRunnable runnable = () -> + { + // There should be no inbound connections as authentication fails. + InboundMessageHandlers inbound = getOnlyElement(MessagingService.instance().messageHandlers.values()); + assertEquals(0, inbound.count()); + + // There should be no outbound connections as authentication fails. 
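+ // All three channel types (small, large and urgent) are expected to be disconnected.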
+ OutboundConnections outbound = getOnlyElement(MessagingService.instance().channelManagers.values()); + assertTrue(!outbound.small.isConnected() && !outbound.large.isConnected() && !outbound.urgent.isConnected()); + + // Verify that the failure is due to authentication failure + final RejectOutboundAuthenticator authenticator = (RejectOutboundAuthenticator) DatabaseDescriptor.getInternodeAuthenticator(); + assertTrue(authenticator.authenticationFailed); + }; + + // Wait for authentication to fail + cluster.get(1).logs().watchFor("authentication failed"); + cluster.get(1).runOnInstance(runnable); + cluster.get(2).logs().watchFor("authentication failed"); + cluster.get(2).runOnInstance(runnable); + } + executorService.shutdown(); + } + + @Test + public void testOutboundConnectionsAreInterruptedWhenAuthFails() throws IOException, TimeoutException + { + Cluster.Builder builder = createCluster(AllowFirstAndRejectOtherOutboundAuthenticator.class); + try (Cluster cluster = builder.start()) + { + try + { + openConnections(cluster); + } + catch (RuntimeException ise) + { + assertThat(ise.getMessage(), containsString("agreement not reached")); + } + + // Verify that authentication is failed and Interrupt is called on outbound connections. + cluster.get(1).logs().watchFor("authentication failed to"); + cluster.get(1).logs().watchFor("Interrupted outbound connections to"); + + /* + * Check if outbound connections are zero + */ + SerializableRunnable runnable = () -> + { + // Verify that there is only one successful outbound connection + final AllowFirstAndRejectOtherOutboundAuthenticator authenticator = (AllowFirstAndRejectOtherOutboundAuthenticator) DatabaseDescriptor.getInternodeAuthenticator(); + assertEquals(1, authenticator.successfulOutbound.get()); + assertTrue(authenticator.failedOutbound.get() > 0); + + // There should be no outbound connections as authentication fails. + OutboundConnections outbound = getOnlyElement(MessagingService.instance().channelManagers.values()); + assertTrue(!outbound.small.isConnected() && !outbound.large.isConnected() && !outbound.urgent.isConnected()); + }; + cluster.get(1).runOnInstance(runnable); + } + } + + @Test + public void testConnectionsAreAcceptedWhenAuthSucceds() throws IOException + { + verifyAuthenticationSucceeds(AllowAllInternodeAuthenticator.class); + } + + @Test + public void testAuthenticationWithCertificateAuthenticator() throws IOException + { + verifyAuthenticationSucceeds(CertificateVerifyAuthenticator.class); + } + @Test public void testConnectionsAreRejectedWithInvalidConfig() throws Throwable { @@ -155,4 +294,151 @@ private void openConnections(Cluster cluster) cluster.schemaChange("CREATE KEYSPACE test_connections_from_2 " + "WITH replication = {'class': 'SimpleStrategy', 'replication_factor': 2};", false, cluster.get(2)); } + + private void verifyAuthenticationSucceeds(final Class authenticatorClass) throws IOException + { + Cluster.Builder builder = createCluster(authenticatorClass); + try (Cluster cluster = builder.start()) + { + openConnections(cluster); + + /* + * instance (1) should connect to instance (2) without any issues; + * instance (2) should connect to instance (1) without any issues. + */ + + SerializableRunnable runnable = () -> + { + // There should be inbound connections as authentication succeeds. + InboundMessageHandlers inbound = getOnlyElement(MessagingService.instance().messageHandlers.values()); + assertTrue(inbound.count() > 0); + + // There should be outbound connections as authentication succeeds. 
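+ // At least one of the small, large or urgent channels should have come up.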
+ OutboundConnections outbound = getOnlyElement(MessagingService.instance().channelManagers.values()); + assertTrue(outbound.small.isConnected() || outbound.large.isConnected() || outbound.urgent.isConnected()); + }; + + cluster.get(1).runOnInstance(runnable); + cluster.get(2).runOnInstance(runnable); + } + } + + private Cluster.Builder createCluster(final Class authenticatorClass) + { + return builder() + .withNodes(2) + .withConfig(c -> + { + c.with(Feature.NETWORK); + c.with(Feature.NATIVE_PROTOCOL); + + HashMap encryption = new HashMap<>(); + encryption.put("keystore", "test/conf/cassandra_ssl_test.keystore"); + encryption.put("keystore_password", "cassandra"); + encryption.put("truststore", "test/conf/cassandra_ssl_test.truststore"); + encryption.put("truststore_password", "cassandra"); + encryption.put("internode_encryption", "all"); + encryption.put("require_client_auth", "true"); + c.set("server_encryption_options", encryption); + c.set("internode_authenticator", authenticatorClass.getName()); + }) + .withNodeIdTopology(ImmutableMap.of(1, NetworkTopology.dcAndRack("dc1", "r1a"), + 2, NetworkTopology.dcAndRack("dc2", "r2a"))); + } + + // Authenticator that validates certificate authentication + public static class CertificateVerifyAuthenticator implements IInternodeAuthenticator + { + @Override + public boolean authenticate(InetAddress remoteAddress, int remotePort, Certificate[] certificates, InternodeConnectionDirection connectionType) + { + try + { + // Check if the presented certificates during internode authentication are the ones in the keystores + // configured in the cassandra.yaml configuration. + KeyStore keyStore = KeyStore.getInstance("JKS"); + char[] keyStorePassword = "cassandra".toCharArray(); + InputStream keyStoreData = new FileInputStream("test/conf/cassandra_ssl_test.keystore"); + keyStore.load(keyStoreData, keyStorePassword); + return certificates != null && certificates.length != 0 && keyStore.getCertificate("cassandra_ssl_test").equals(certificates[0]); + } + catch (Exception e) + { + return false; + } + } + + @Override + public void validateConfiguration() throws ConfigurationException + { + + } + } + + public static class RejectConnectionsAuthenticator implements IInternodeAuthenticator + { + boolean authenticationFailed = false; + + @Override + public boolean authenticate(InetAddress remoteAddress, int remotePort, Certificate[] certificates, InternodeConnectionDirection connectionType) + { + authenticationFailed = true; + return false; + } + + @Override + public void validateConfiguration() throws ConfigurationException + { + + } + } + + public static class RejectInboundConnections extends RejectConnectionsAuthenticator + { + @Override + public boolean authenticate(InetAddress remoteAddress, int remotePort, Certificate[] certificates, InternodeConnectionDirection connectionType) + { + if (connectionType == InternodeConnectionDirection.INBOUND) + { + return super.authenticate(remoteAddress, remotePort, certificates, connectionType); + } + return true; + } + } + + public static class RejectOutboundAuthenticator extends RejectConnectionsAuthenticator + { + @Override + public boolean authenticate(InetAddress remoteAddress, int remotePort, Certificate[] certificates, InternodeConnectionDirection connectionType) + { + if (connectionType == InternodeConnectionDirection.OUTBOUND) + { + return super.authenticate(remoteAddress, remotePort, certificates, connectionType); + } + return true; + } + } + + public static class AllowFirstAndRejectOtherOutboundAuthenticator 
extends RejectOutboundAuthenticator + { + AtomicInteger successfulOutbound = new AtomicInteger(); + AtomicInteger failedOutbound = new AtomicInteger(); + + @Override + public boolean authenticate(InetAddress remoteAddress, int remotePort, Certificate[] certificates, InternodeConnectionDirection connectionType) + { + if (connectionType == InternodeConnectionDirection.OUTBOUND) + { + if(successfulOutbound.get() == 0) { + successfulOutbound.incrementAndGet(); + return true; + } else { + failedOutbound.incrementAndGet(); + return false; + } + + } + return true; + } + } } diff --git a/test/unit/org/apache/cassandra/SchemaLoader.java b/test/unit/org/apache/cassandra/SchemaLoader.java index 327949074b58..e2ef487ddf5b 100644 --- a/test/unit/org/apache/cassandra/SchemaLoader.java +++ b/test/unit/org/apache/cassandra/SchemaLoader.java @@ -286,6 +286,7 @@ public static void setupAuth(IRoleManager roleManager, IAuthenticator authentica SchemaTestUtil.announceNewKeyspace(AuthKeyspace.metadata()); DatabaseDescriptor.getRoleManager().setup(); DatabaseDescriptor.getAuthenticator().setup(); + DatabaseDescriptor.getInternodeAuthenticator().setupInternode(); DatabaseDescriptor.getAuthorizer().setup(); DatabaseDescriptor.getNetworkAuthorizer().setup(); Schema.instance.registerListener(new AuthSchemaChangeListener()); diff --git a/test/unit/org/apache/cassandra/net/MessagingServiceTest.java b/test/unit/org/apache/cassandra/net/MessagingServiceTest.java index 349d8652cfc2..32d505038868 100644 --- a/test/unit/org/apache/cassandra/net/MessagingServiceTest.java +++ b/test/unit/org/apache/cassandra/net/MessagingServiceTest.java @@ -25,6 +25,7 @@ import java.net.InetSocketAddress; import java.net.UnknownHostException; import java.nio.channels.AsynchronousSocketChannel; +import java.security.cert.Certificate; import java.util.ArrayList; import java.util.Arrays; import java.util.HashSet; @@ -35,31 +36,35 @@ import java.util.concurrent.Future; import java.util.concurrent.TimeUnit; import java.util.concurrent.atomic.AtomicInteger; -import java.util.regex.*; import java.util.regex.Matcher; +import java.util.regex.Pattern; import com.google.common.net.InetAddresses; +import org.junit.After; +import org.junit.Assert; +import org.junit.Before; +import org.junit.BeforeClass; +import org.junit.Test; import com.codahale.metrics.Timer; - import org.apache.cassandra.auth.IInternodeAuthenticator; import org.apache.cassandra.config.DatabaseDescriptor; import org.apache.cassandra.config.EncryptionOptions.ServerEncryptionOptions; import org.apache.cassandra.db.commitlog.CommitLog; -import org.apache.cassandra.metrics.MessagingMetrics; import org.apache.cassandra.exceptions.ConfigurationException; import org.apache.cassandra.locator.InetAddressAndPort; +import org.apache.cassandra.metrics.MessagingMetrics; +import org.apache.cassandra.utils.ByteBufferUtil; import org.apache.cassandra.utils.FBUtilities; import org.awaitility.Awaitility; import org.caffinitas.ohc.histo.EstimatedHistogram; -import org.junit.After; -import org.junit.Assert; -import org.junit.Before; -import org.junit.BeforeClass; -import org.junit.Test; import static java.util.concurrent.TimeUnit.MILLISECONDS; -import static org.junit.Assert.*; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotEquals; +import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.assertNull; +import static org.junit.Assert.assertTrue; public class MessagingServiceTest { @@ -67,7 +72,8 @@ public class MessagingServiceTest public static 
AtomicInteger rejectedConnections = new AtomicInteger(); public static final IInternodeAuthenticator ALLOW_NOTHING_AUTHENTICATOR = new IInternodeAuthenticator() { - public boolean authenticate(InetAddress remoteAddress, int remotePort) + public boolean authenticate(InetAddress remoteAddress, int remotePort, + Certificate[] certificates, InternodeConnectionDirection connectionType) { rejectedConnections.incrementAndGet(); return false; @@ -78,6 +84,25 @@ public void validateConfiguration() throws ConfigurationException } }; + + public static final IInternodeAuthenticator REJECT_OUTBOUND_AUTHENTICATOR = new IInternodeAuthenticator() + { + public boolean authenticate(InetAddress remoteAddress, int remotePort, + Certificate[] certificates, InternodeConnectionDirection connectionType) + { + if (connectionType == InternodeConnectionDirection.OUTBOUND) + { + rejectedConnections.incrementAndGet(); + return false; + } + return true; + } + + public void validateConfiguration() throws ConfigurationException + { + + } + }; private static IInternodeAuthenticator originalAuthenticator; private static ServerEncryptionOptions originalServerEncryptionOptions; private static InetAddressAndPort originalListenAddress; @@ -228,19 +253,38 @@ private static void addDCLatency(long sentAt, long nowTime) @Test public void testFailedOutboundInternodeAuth() throws Exception { - MessagingService ms = MessagingService.instance(); - DatabaseDescriptor.setInternodeAuthenticator(ALLOW_NOTHING_AUTHENTICATOR); - InetAddressAndPort address = InetAddressAndPort.getByName("127.0.0.250"); + // Listen on serverside for connections + ServerEncryptionOptions serverEncryptionOptions = new ServerEncryptionOptions() + .withInternodeEncryption(ServerEncryptionOptions.InternodeEncryption.none); + + DatabaseDescriptor.setInternodeAuthenticator(REJECT_OUTBOUND_AUTHENTICATOR); + InetAddress listenAddress = FBUtilities.getJustLocalAddress(); - //Should return null - int rejectedBefore = rejectedConnections.get(); - Message messageOut = Message.out(Verb.ECHO_REQ, NoPayload.noPayload); - ms.send(messageOut, address); - Awaitility.await().atMost(10, TimeUnit.SECONDS).until(() -> rejectedConnections.get() > rejectedBefore); + InboundConnectionSettings settings = new InboundConnectionSettings().withEncryption(serverEncryptionOptions); + InboundSockets connections = new InboundSockets(settings); - //Should tolerate null - ms.closeOutbound(address); - ms.send(messageOut, address); + try + { + connections.open().await(); + Assert.assertTrue(connections.isListening()); + + MessagingService ms = MessagingService.instance(); + //Should return null + int rejectedBefore = rejectedConnections.get(); + Message messageOut = Message.out(Verb.ECHO_REQ, NoPayload.noPayload); + InetAddressAndPort address = InetAddressAndPort.getByAddress(listenAddress); + ms.send(messageOut, address); + Awaitility.await().atMost(10, TimeUnit.SECONDS).until(() -> rejectedConnections.get() > rejectedBefore); + + //Should tolerate null + ms.closeOutbound(address); + ms.send(messageOut, address); + } + finally + { + connections.close().await(); + Assert.assertFalse(connections.isListening()); + } } @Test @@ -262,6 +306,11 @@ public void testFailedInboundInternodeAuth() throws IOException, InterruptedExce int rejectedBefore = rejectedConnections.get(); Future connectFuture = testChannel.connect(new InetSocketAddress(listenAddress, DatabaseDescriptor.getStoragePort())); + Awaitility.await().atMost(10, TimeUnit.SECONDS).until(connectFuture::isDone); + + // Since authentication 
doesn't happen during connect, try writing a dummy string which triggers + // authentication handler. + testChannel.write(ByteBufferUtil.bytes("dummy string")); Awaitility.await().atMost(10, TimeUnit.SECONDS).until(() -> rejectedConnections.get() > rejectedBefore); connectFuture.cancel(true); From 465547fdeda9c89199724ec2198f864cef413e9e Mon Sep 17 00:00:00 2001 From: Attila Homoki Date: Fri, 20 May 2022 09:58:11 -0400 Subject: [PATCH 016/159] Increment CQLSH to version 6.2.0 for release 4.2 Patch by Attila Homoki; reviewed by Brandon Williams and Ekaterina Dimitrova for CASSANDRA-17646 Co-authored-by: Brad Schoening <5796692+bschoening@users.noreply.github.com> --- CHANGES.txt | 1 + README.asc | 2 +- bin/cqlsh.py | 2 +- 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/CHANGES.txt b/CHANGES.txt index 108a911e9ef9..bac9b15dbb76 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -1,4 +1,5 @@ 4.2 + * Increment CQLSH to version 6.2.0 for release 4.2 (CASSANDRA-17646) * Adding support to perform certificate based internode authentication (CASSANDRA-17661) * Option to disable CDC writes of repaired data (CASSANDRA-17666) * When a node is bootstrapping it gets the whole gossip state but applies in random order causing some cases where StorageService will fail causing an instance to not show up in TokenMetadata (CASSANDRA-17676) diff --git a/README.asc b/README.asc index 3c40bf497002..942bb203b04b 100644 --- a/README.asc +++ b/README.asc @@ -39,7 +39,7 @@ be sitting in front of a prompt: ---- Connected to Test Cluster at localhost:9160. -[cqlsh 6.0.0 | Cassandra 4.0.2 | CQL spec 3.4.5 | Native protocol v5] +[cqlsh 6.2.0 | Cassandra 4.2-SNAPSHOT | CQL spec 3.4.5 | Native protocol v5] Use HELP for help. cqlsh> ---- diff --git a/bin/cqlsh.py b/bin/cqlsh.py index 35eb429abaeb..e47bc5951852 100755 --- a/bin/cqlsh.py +++ b/bin/cqlsh.py @@ -47,7 +47,7 @@ UTF8 = 'utf-8' description = "CQL Shell for Apache Cassandra" -version = "6.1.0" +version = "6.2.0" readline = None try: From 92069ec0932774357f5a7babaf3ec28ca1255286 Mon Sep 17 00:00:00 2001 From: Bereng Date: Fri, 3 Jun 2022 07:29:09 +0200 Subject: [PATCH 017/159] testsome target doesn't work with wildcards Patch by Bernardo Botella Corbi; reviewed by Brandon Williams and Berenguer Blasi for CASSANDRA-17083 --- build.xml | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/build.xml b/build.xml index d0466578ac06..15da1e7df1f6 100644 --- a/build.xml +++ b/build.xml @@ -1656,9 +1656,27 @@ ant testsome -Dtest.name=org.apache.cassandra.service.StorageServiceServerTest -Dtest.methods=testRegularMode,testGetAllRangesEmpty --> + + + + + + + + + + + + + + + + + + - - + + From 39e89fd636ee4343eb2201820da87881cbc749e2 Mon Sep 17 00:00:00 2001 From: Savni Nagarkar Date: Mon, 18 Apr 2022 16:50:19 -0400 Subject: [PATCH 018/159] Add guardrail for maximum replication factor MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch by Savni Nagarkar; reviewed by Andrés de la Peña, David Capwell and Josh McKenzie for CASSANDRA-17500 --- CHANGES.txt | 1 + NEWS.txt | 1 + conf/cassandra.yaml | 5 + .../org/apache/cassandra/config/Config.java | 2 + .../cassandra/config/DatabaseDescriptor.java | 5 + .../cassandra/config/GuardrailsOptions.java | 59 +++- .../cassandra/db/guardrails/Guardrails.java | 35 ++- .../db/guardrails/GuardrailsConfig.java | 9 + .../db/guardrails/GuardrailsMBean.java | 29 +- .../locator/NetworkTopologyStrategy.java | 1 + .../cassandra/locator/SimpleStrategy.java | 1 + 
...GuardrailMaximumReplicationFactorTest.java | 254 ++++++++++++++++++ ...GuardrailMinimumReplicationFactorTest.java | 105 +++++--- 13 files changed, 439 insertions(+), 68 deletions(-) create mode 100644 test/unit/org/apache/cassandra/db/guardrails/GuardrailMaximumReplicationFactorTest.java diff --git a/CHANGES.txt b/CHANGES.txt index bac9b15dbb76..8dc91eee3f53 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -1,4 +1,5 @@ 4.2 + * Add guardrail for maximum replication factor (CASSANDRA-17500) * Increment CQLSH to version 6.2.0 for release 4.2 (CASSANDRA-17646) * Adding support to perform certificate based internode authentication (CASSANDRA-17661) * Option to disable CDC writes of repaired data (CASSANDRA-17666) diff --git a/NEWS.txt b/NEWS.txt index 996113d7c7eb..c9edaa2a7add 100644 --- a/NEWS.txt +++ b/NEWS.txt @@ -69,6 +69,7 @@ New features - New Guardrails added: - Whether ALTER TABLE commands are allowed to mutate columns - Whether SimpleStrategy is allowed on keyspace creation or alteration + - Maximum replication factor Upgrading --------- diff --git a/conf/cassandra.yaml b/conf/cassandra.yaml index 3bab6712c820..eb1d1a97c021 100644 --- a/conf/cassandra.yaml +++ b/conf/cassandra.yaml @@ -1812,6 +1812,11 @@ drop_compact_storage_enabled: false # Suggested value for use in production: 2 or higher # minimum_replication_factor_warn_threshold: -1 # minimum_replication_factor_fail_threshold: -1 +# +# Guardrail to warn or fail when the maximum replication factor is greater than threshold. +# This would also apply to system keyspaces. +# maximum_replication_factor_warn_threshold: -1 +# maximum_replication_factor_fail_threshold: -1 # Startup Checks are executed as part of Cassandra startup process, not all of them # are configurable (so you can disable them) but these which are enumerated bellow. 
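For operators, a minimal sketch (not part of this patch) of adjusting the new thresholds at runtime through the GuardrailsMBean setter added by this change; the JMX object name "org.apache.cassandra.db:type=Guardrails" and port 7199 are assumed defaults, not values defined here:

import javax.management.JMX;
import javax.management.MBeanServerConnection;
import javax.management.ObjectName;
import javax.management.remote.JMXConnector;
import javax.management.remote.JMXConnectorFactory;
import javax.management.remote.JMXServiceURL;

import org.apache.cassandra.db.guardrails.GuardrailsMBean;

public class MaxRfGuardrailExample
{
    public static void main(String[] args) throws Exception
    {
        // Assumed local JMX endpoint; adjust host and port for the target node.
        JMXServiceURL url = new JMXServiceURL("service:jmx:rmi:///jndi/rmi://127.0.0.1:7199/jmxrmi");
        try (JMXConnector connector = JMXConnectorFactory.connect(url))
        {
            MBeanServerConnection connection = connector.getMBeanServerConnection();
            // Assumed MBean name; the proxy exposes the setters declared in GuardrailsMBean.
            GuardrailsMBean guardrails = JMX.newMBeanProxy(connection,
                                                           new ObjectName("org.apache.cassandra.db:type=Guardrails"),
                                                           GuardrailsMBean.class);
            // Warn when a keyspace is created or altered with RF above 3, reject above 5.
            guardrails.setMaximumReplicationFactorThreshold(3, 5);
        }
    }
}

The same limits can be set statically in cassandra.yaml through the maximum_replication_factor_warn_threshold and maximum_replication_factor_fail_threshold options shown above.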
diff --git a/src/java/org/apache/cassandra/config/Config.java b/src/java/org/apache/cassandra/config/Config.java index 3d2dbb7b40dc..3048a9a411dc 100644 --- a/src/java/org/apache/cassandra/config/Config.java +++ b/src/java/org/apache/cassandra/config/Config.java @@ -848,6 +848,8 @@ public static void setClientMode(boolean clientMode) public volatile DataStorageSpec.LongBytesBound data_disk_usage_max_disk_size = null; public volatile int minimum_replication_factor_warn_threshold = -1; public volatile int minimum_replication_factor_fail_threshold = -1; + public volatile int maximum_replication_factor_warn_threshold = -1; + public volatile int maximum_replication_factor_fail_threshold = -1; public volatile DurationSpec.LongNanosecondsBound streaming_state_expires = new DurationSpec.LongNanosecondsBound("3d"); public volatile DataStorageSpec.LongBytesBound streaming_state_size = new DataStorageSpec.LongBytesBound("40MiB"); diff --git a/src/java/org/apache/cassandra/config/DatabaseDescriptor.java b/src/java/org/apache/cassandra/config/DatabaseDescriptor.java index 8151c968719e..2bd1aa840055 100644 --- a/src/java/org/apache/cassandra/config/DatabaseDescriptor.java +++ b/src/java/org/apache/cassandra/config/DatabaseDescriptor.java @@ -4089,6 +4089,11 @@ public static void setDefaultKeyspaceRF(int value) throws IllegalArgumentExcepti throw new IllegalArgumentException(String.format("default_keyspace_rf to be set (%d) cannot be less than minimum_replication_factor_fail_threshold (%d)", value, guardrails.getMinimumReplicationFactorFailThreshold())); } + if (guardrails.getMaximumReplicationFactorFailThreshold() != -1 && value > guardrails.getMaximumReplicationFactorFailThreshold()) + { + throw new IllegalArgumentException(String.format("default_keyspace_rf to be set (%d) cannot be greater than maximum_replication_factor_fail_threshold (%d)", value, guardrails.getMaximumReplicationFactorFailThreshold())); + } + conf.default_keyspace_rf = value; } diff --git a/src/java/org/apache/cassandra/config/GuardrailsOptions.java b/src/java/org/apache/cassandra/config/GuardrailsOptions.java index e8d7bda77a16..98d14a1d3266 100644 --- a/src/java/org/apache/cassandra/config/GuardrailsOptions.java +++ b/src/java/org/apache/cassandra/config/GuardrailsOptions.java @@ -81,7 +81,8 @@ public GuardrailsOptions(Config config) validateMaxIntThreshold(config.fields_per_udt_warn_threshold, config.fields_per_udt_fail_threshold, "fields_per_udt"); validatePercentageThreshold(config.data_disk_usage_percentage_warn_threshold, config.data_disk_usage_percentage_fail_threshold, "data_disk_usage_percentage"); validateDataDiskUsageMaxDiskSize(config.data_disk_usage_max_disk_size); - validateMinRFThreshold(config.minimum_replication_factor_warn_threshold, config.minimum_replication_factor_fail_threshold, "minimum_replication_factor"); + validateMinRFThreshold(config.minimum_replication_factor_warn_threshold, config.minimum_replication_factor_fail_threshold); + validateMaxRFThreshold(config.maximum_replication_factor_warn_threshold, config.maximum_replication_factor_fail_threshold); } @Override @@ -651,7 +652,7 @@ public int getMinimumReplicationFactorFailThreshold() public void setMinimumReplicationFactorThreshold(int warn, int fail) { - validateMinRFThreshold(warn, fail, "minimum_replication_factor"); + validateMinRFThreshold(warn, fail); updatePropertyWithLogging("minimum_replication_factor_warn_threshold", warn, () -> config.minimum_replication_factor_warn_threshold, @@ -662,6 +663,31 @@ public void 
setMinimumReplicationFactorThreshold(int warn, int fail) x -> config.minimum_replication_factor_fail_threshold = x); } + @Override + public int getMaximumReplicationFactorWarnThreshold() + { + return config.maximum_replication_factor_warn_threshold; + } + + @Override + public int getMaximumReplicationFactorFailThreshold() + { + return config.maximum_replication_factor_fail_threshold; + } + + public void setMaximumReplicationFactorThreshold(int warn, int fail) + { + validateMaxRFThreshold(warn, fail); + updatePropertyWithLogging("maximum_replication_factor_warn_threshold", + warn, + () -> config.maximum_replication_factor_warn_threshold, + x -> config.maximum_replication_factor_warn_threshold = x); + updatePropertyWithLogging("maximum_replication_factor_fail_threshold", + fail, + () -> config.maximum_replication_factor_fail_threshold, + x -> config.maximum_replication_factor_fail_threshold = x); + } + private static void updatePropertyWithLogging(String propertyName, T newValue, Supplier getter, Consumer setter) { T oldValue = getter.get(); @@ -717,10 +743,24 @@ private static void validateMinIntThreshold(int warn, int fail, String name) validateWarnGreaterThanFail(warn, fail, name); } - private static void validateMinRFThreshold(int warn, int fail, String name) + private static void validateMinRFThreshold(int warn, int fail) { - validateMinIntThreshold(warn, fail, name); - validateMinRFVersusDefaultRF(fail, name); + validateMinIntThreshold(warn, fail, "minimum_replication_factor"); + + if (fail > DatabaseDescriptor.getDefaultKeyspaceRF()) + throw new IllegalArgumentException(format("minimum_replication_factor_fail_threshold to be set (%d) " + + "cannot be greater than default_keyspace_rf (%d)", + fail, DatabaseDescriptor.getDefaultKeyspaceRF())); + } + + private static void validateMaxRFThreshold(int warn, int fail) + { + validateMaxIntThreshold(warn, fail, "maximum_replication_factor"); + + if (fail != -1 && fail < DatabaseDescriptor.getDefaultKeyspaceRF()) + throw new IllegalArgumentException(format("maximum_replication_factor_fail_threshold to be set (%d) " + + "cannot be lesser than default_keyspace_rf (%d)", + fail, DatabaseDescriptor.getDefaultKeyspaceRF())); } private static void validateWarnLowerThanFail(long warn, long fail, String name) @@ -743,15 +783,6 @@ private static void validateWarnGreaterThanFail(long warn, long fail, String nam "than the fail threshold %d", warn, name, fail)); } - private static void validateMinRFVersusDefaultRF(int fail, String name) throws IllegalArgumentException - { - if (fail > DatabaseDescriptor.getDefaultKeyspaceRF()) - { - throw new IllegalArgumentException(String.format("%s_fail_threshold to be set (%d) cannot be greater than default_keyspace_rf (%d)", - name, fail, DatabaseDescriptor.getDefaultKeyspaceRF())); - } - } - private static void validateSize(DataStorageSpec.LongBytesBound size, boolean allowZero, String name) { if (size == null) diff --git a/src/java/org/apache/cassandra/db/guardrails/Guardrails.java b/src/java/org/apache/cassandra/db/guardrails/Guardrails.java index 16146fec87a4..36bb3d446f6b 100644 --- a/src/java/org/apache/cassandra/db/guardrails/Guardrails.java +++ b/src/java/org/apache/cassandra/db/guardrails/Guardrails.java @@ -344,10 +344,19 @@ public final class Guardrails implements GuardrailsMBean state -> CONFIG_PROVIDER.getOrCreate(state).getMinimumReplicationFactorWarnThreshold(), state -> CONFIG_PROVIDER.getOrCreate(state).getMinimumReplicationFactorFailThreshold(), (isWarning, what, value, threshold) -> - isWarning ? 
format("The keyspace %s has a replication factor of %s, below the warning threshold of %s.", - what, value, threshold) - : format("The keyspace %s has a replication factor of %s, below the failure threshold of %s.", - what, value, threshold)); + format("The keyspace %s has a replication factor of %s, below the %s threshold of %s.", + what, value, isWarning ? "warning" : "failure", threshold)); + + /** + * Guardrail on the maximum replication factor. + */ + public static final MaxThreshold maximumReplicationFactor = + new MaxThreshold("maximum_replication_factor", + state -> CONFIG_PROVIDER.getOrCreate(state).getMaximumReplicationFactorWarnThreshold(), + state -> CONFIG_PROVIDER.getOrCreate(state).getMaximumReplicationFactorFailThreshold(), + (isWarning, what, value, threshold) -> + format("The keyspace %s has a replication factor of %s, above the %s threshold of %s.", + what, value, isWarning ? "warning" : "failure", threshold)); private Guardrails() { @@ -857,6 +866,24 @@ public void setFieldsPerUDTThreshold(int warn, int fail) DEFAULT_CONFIG.setFieldsPerUDTThreshold(warn, fail); } + @Override + public int getMaximumReplicationFactorWarnThreshold() + { + return DEFAULT_CONFIG.getMaximumReplicationFactorWarnThreshold(); + } + + @Override + public int getMaximumReplicationFactorFailThreshold() + { + return DEFAULT_CONFIG.getMaximumReplicationFactorFailThreshold(); + } + + @Override + public void setMaximumReplicationFactorThreshold (int warn, int fail) + { + DEFAULT_CONFIG.setMaximumReplicationFactorThreshold(warn, fail); + } + @Override public int getDataDiskUsagePercentageWarnThreshold() { diff --git a/src/java/org/apache/cassandra/db/guardrails/GuardrailsConfig.java b/src/java/org/apache/cassandra/db/guardrails/GuardrailsConfig.java index 72eaaa5b487a..c7067b53e048 100644 --- a/src/java/org/apache/cassandra/db/guardrails/GuardrailsConfig.java +++ b/src/java/org/apache/cassandra/db/guardrails/GuardrailsConfig.java @@ -291,4 +291,13 @@ public interface GuardrailsConfig */ int getMinimumReplicationFactorFailThreshold(); + /** + * @return The threshold to warn when replication factor is greater than threshold. + */ + int getMaximumReplicationFactorWarnThreshold(); + + /** + * @return The threshold to fail when replication factor is greater than threshold. + */ + int getMaximumReplicationFactorFailThreshold(); } diff --git a/src/java/org/apache/cassandra/db/guardrails/GuardrailsMBean.java b/src/java/org/apache/cassandra/db/guardrails/GuardrailsMBean.java index 47db91a6fa85..dc3fb48e228f 100644 --- a/src/java/org/apache/cassandra/db/guardrails/GuardrailsMBean.java +++ b/src/java/org/apache/cassandra/db/guardrails/GuardrailsMBean.java @@ -550,21 +550,38 @@ public interface GuardrailsMBean void setDataDiskUsageMaxDiskSize(@Nullable String size); /** - * @return The threshold to warn when replication factor is lesser threshold. + * @return The threshold to warn when replication factor is lesser than threshold. */ int getMinimumReplicationFactorWarnThreshold(); /** - * @return The threshold to fail when replication factor is lesser threshold. + * @return The threshold to fail when replication factor is lesser than threshold. */ int getMinimumReplicationFactorFailThreshold(); /** - * @param warn the threshold to warn when the minimum replication factor is lesser than - * threshold -1 means disabled. - * @param fail the threshold to fail when the minimum replication factor is lesser than - * threshold -1 means disabled. 
+ * @param warn The threshold to warn when the minimum replication factor is lesser than threshold. + * -1 means disabled. + * @param fail The threshold to fail when the minimum replication factor is lesser than threshold. + * -1 means disabled. */ void setMinimumReplicationFactorThreshold (int warn, int fail); + /** + * @return The threshold to fail when replication factor is greater than threshold. + */ + int getMaximumReplicationFactorWarnThreshold(); + + /** + * @return The threshold to fail when replication factor is greater than threshold. + */ + int getMaximumReplicationFactorFailThreshold(); + + /** + * @param warn The threshold to warn when the maximum replication factor is greater than threshold. + * -1 means disabled. + * @param fail The threshold to fail when the maximum replication factor is greater than threshold. + * -1 means disabled. + */ + void setMaximumReplicationFactorThreshold (int warn, int fail); } diff --git a/src/java/org/apache/cassandra/locator/NetworkTopologyStrategy.java b/src/java/org/apache/cassandra/locator/NetworkTopologyStrategy.java index 9ae034121a72..490c46b2f24c 100644 --- a/src/java/org/apache/cassandra/locator/NetworkTopologyStrategy.java +++ b/src/java/org/apache/cassandra/locator/NetworkTopologyStrategy.java @@ -350,6 +350,7 @@ public void maybeWarnOnOptions(ClientState state) String dc = e.getKey(); ReplicationFactor rf = getReplicationFactor(dc); Guardrails.minimumReplicationFactor.guard(rf.fullReplicas, keyspaceName, false, state); + Guardrails.maximumReplicationFactor.guard(rf.fullReplicas, keyspaceName, false, state); int nodeCount = dcsNodes.get(dc).size(); // nodeCount==0 on many tests if (rf.fullReplicas > nodeCount && nodeCount != 0) diff --git a/src/java/org/apache/cassandra/locator/SimpleStrategy.java b/src/java/org/apache/cassandra/locator/SimpleStrategy.java index e5b92103b44d..488b601ce7a9 100644 --- a/src/java/org/apache/cassandra/locator/SimpleStrategy.java +++ b/src/java/org/apache/cassandra/locator/SimpleStrategy.java @@ -109,6 +109,7 @@ public void maybeWarnOnOptions(ClientState state) int nodeCount = StorageService.instance.getHostIdToEndpoint().size(); // nodeCount==0 on many tests Guardrails.minimumReplicationFactor.guard(rf.fullReplicas, keyspaceName, false, state); + Guardrails.maximumReplicationFactor.guard(rf.fullReplicas, keyspaceName, false, state); if (rf.fullReplicas > nodeCount && nodeCount != 0) { String msg = "Your replication factor " + rf.fullReplicas diff --git a/test/unit/org/apache/cassandra/db/guardrails/GuardrailMaximumReplicationFactorTest.java b/test/unit/org/apache/cassandra/db/guardrails/GuardrailMaximumReplicationFactorTest.java new file mode 100644 index 000000000000..865ac23c79ba --- /dev/null +++ b/test/unit/org/apache/cassandra/db/guardrails/GuardrailMaximumReplicationFactorTest.java @@ -0,0 +1,254 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.db.guardrails; + +import java.util.Arrays; +import java.util.Collections; +import java.util.List; +import java.util.UUID; +import java.util.stream.Collectors; + +import org.junit.After; +import org.junit.Test; + +import org.apache.cassandra.ServerTestUtils; +import org.apache.cassandra.config.DatabaseDescriptor; +import org.apache.cassandra.db.Keyspace; +import org.apache.cassandra.locator.AbstractEndpointSnitch; +import org.apache.cassandra.locator.IEndpointSnitch; +import org.apache.cassandra.locator.InetAddressAndPort; +import org.apache.cassandra.locator.Replica; +import org.apache.cassandra.service.ClientWarn; +import org.apache.cassandra.service.StorageService; +import org.assertj.core.api.Assertions; + +import static java.lang.String.format; + +public class GuardrailMaximumReplicationFactorTest extends ThresholdTester +{ + private static final int MAXIMUM_REPLICATION_FACTOR_WARN_THRESHOLD = 2; + private static final int MAXIMUM_REPLICATION_FACTOR_FAIL_THRESHOLD = 4; + private static final int DISABLED_GUARDRAIL = -1; + + public GuardrailMaximumReplicationFactorTest() + { + super(MAXIMUM_REPLICATION_FACTOR_WARN_THRESHOLD, + MAXIMUM_REPLICATION_FACTOR_FAIL_THRESHOLD, + Guardrails.maximumReplicationFactor, + Guardrails::setMaximumReplicationFactorThreshold, + Guardrails::getMaximumReplicationFactorWarnThreshold, + Guardrails::getMaximumReplicationFactorFailThreshold); + } + + @After + public void cleanupTest() throws Throwable + { + execute("DROP KEYSPACE IF EXISTS ks"); + DatabaseDescriptor.setDefaultKeyspaceRF(1); + } + + @Override + protected long currentValue() + { + return Long.parseLong((Keyspace.open("ks").getReplicationStrategy()).configOptions.get("datacenter1")); + } + + @Override + protected List getWarnings() + { + List warnings = ClientWarn.instance.getWarnings(); + + // filtering out non-guardrails produced warnings + return warnings == null + ? 
Collections.emptyList() + : warnings.stream() + .filter(w -> !w.contains("keyspace ks is higher than the number of nodes 1 for datacenter1") && + !w.contains("When increasing replication factor you need to run a full (-full) repair to distribute the data") && + !w.contains("keyspace ks is higher than the number of nodes") && + !w.contains("Your replication factor 3 for keyspace ks is higher than the number of nodes 2 for datacenter datacenter2")) + .collect(Collectors.toList()); + } + + @Test + public void testMaxKeyspaceRFDisabled() throws Throwable + { + guardrails().setMaximumReplicationFactorThreshold(DISABLED_GUARDRAIL, DISABLED_GUARDRAIL); + assertMaxThresholdValid("CREATE KEYSPACE ks WITH replication = { 'class': 'NetworkTopologyStrategy', 'datacenter1': 6}"); + assertMaxThresholdValid("ALTER KEYSPACE ks WITH replication = { 'class' : 'NetworkTopologyStrategy', 'datacenter1': 10}"); + } + + @Test + public void testSimpleStrategyCreate() throws Throwable + { + guardrails().setMaximumReplicationFactorThreshold(MAXIMUM_REPLICATION_FACTOR_WARN_THRESHOLD, MAXIMUM_REPLICATION_FACTOR_FAIL_THRESHOLD); + assertWarns("CREATE KEYSPACE ks WITH replication = { 'class': 'SimpleStrategy', 'replication_factor': 3}", 3); + execute("DROP KEYSPACE IF EXISTS ks"); + assertFails("CREATE KEYSPACE ks WITH replication = { 'class': 'SimpleStrategy', 'replication_factor': 5}", 5); + } + + @Test + public void testSimpleStrategyAlter() throws Throwable + { + guardrails().setMaximumReplicationFactorThreshold(MAXIMUM_REPLICATION_FACTOR_WARN_THRESHOLD, MAXIMUM_REPLICATION_FACTOR_FAIL_THRESHOLD); + execute("CREATE KEYSPACE ks WITH replication = { 'class': 'SimpleStrategy', 'replication_factor': 2}"); + assertWarns("ALTER KEYSPACE ks WITH replication = { 'class': 'SimpleStrategy', 'replication_factor': 3}", 3); + assertFails("ALTER KEYSPACE ks WITH replication = { 'class': 'SimpleStrategy', 'replication_factor': 5}", 5); + } + + @Test + public void testMultipleDatacenter() throws Throwable + { + IEndpointSnitch snitch = DatabaseDescriptor.getEndpointSnitch(); + DatabaseDescriptor.setEndpointSnitch(new AbstractEndpointSnitch() + { + public static final String RACK1 = ServerTestUtils.RACK1; + + @Override + public String getRack(InetAddressAndPort endpoint) { return RACK1; } + + @Override + public String getDatacenter(InetAddressAndPort endpoint) { return "datacenter2"; } + + @Override + public int compareEndpoints(InetAddressAndPort target, Replica a1, Replica a2) { return 0; } + }); + + List twoWarnings = Arrays.asList(format("The keyspace ks has a replication factor of 3, above the warning threshold of %s.", MAXIMUM_REPLICATION_FACTOR_WARN_THRESHOLD), + format("The keyspace ks has a replication factor of 3, above the warning threshold of %s.", MAXIMUM_REPLICATION_FACTOR_WARN_THRESHOLD)); + + StorageService.instance.getTokenMetadata().updateHostId(UUID.randomUUID(), InetAddressAndPort.getByName("127.0.0.255")); + guardrails().setMaximumReplicationFactorThreshold(MAXIMUM_REPLICATION_FACTOR_WARN_THRESHOLD, MAXIMUM_REPLICATION_FACTOR_FAIL_THRESHOLD); + assertValid("CREATE KEYSPACE ks WITH replication = { 'class' : 'NetworkTopologyStrategy', 'datacenter1': 2, 'datacenter2' : 2}"); + execute("DROP KEYSPACE IF EXISTS ks"); + assertWarns("CREATE KEYSPACE ks WITH replication = { 'class': 'NetworkTopologyStrategy', 'datacenter1': 2, 'datacenter2' : 3}", 3); + execute("DROP KEYSPACE IF EXISTS ks"); + assertWarns("CREATE KEYSPACE ks WITH replication = { 'class' : 'NetworkTopologyStrategy', 'datacenter1': 3, 'datacenter2' : 
3}", twoWarnings); + execute("DROP KEYSPACE IF EXISTS ks"); + assertFails("CREATE KEYSPACE ks WITH replication = { 'class' : 'NetworkTopologyStrategy', 'datacenter1': 2, 'datacenter2' : 5}", 5); + execute("DROP KEYSPACE IF EXISTS ks"); + assertFails("CREATE KEYSPACE ks WITH replication = { 'class' : 'NetworkTopologyStrategy', 'datacenter1': 5, 'datacenter2' : 5}", 5); + execute("DROP KEYSPACE IF EXISTS ks"); + + execute("CREATE KEYSPACE ks WITH replication = { 'class' : 'NetworkTopologyStrategy', 'datacenter1': 1, 'datacenter2' : 1}"); + assertValid("ALTER KEYSPACE ks WITH replication = { 'class' : 'NetworkTopologyStrategy', 'datacenter1': 2, 'datacenter2' : 2}"); + assertWarns("ALTER KEYSPACE ks WITH replication = { 'class' : 'NetworkTopologyStrategy', 'datacenter1': 2, 'datacenter2' : 3}", 3); + assertWarns("ALTER KEYSPACE ks WITH replication = { 'class' : 'NetworkTopologyStrategy', 'datacenter1': 3, 'datacenter2' : 3}", twoWarnings); + assertFails("ALTER KEYSPACE ks WITH replication = { 'class' : 'NetworkTopologyStrategy', 'datacenter1': 2, 'datacenter2' : 5}", 5); + assertFails("ALTER KEYSPACE ks WITH replication = { 'class' : 'NetworkTopologyStrategy', 'datacenter1': 5, 'datacenter2' : 5}", 5); + + DatabaseDescriptor.setEndpointSnitch(snitch); + } + + @Test + public void testMaxKeyspaceRFOnlyWarnBelow() throws Throwable + { + guardrails().setMaximumReplicationFactorThreshold(MAXIMUM_REPLICATION_FACTOR_WARN_THRESHOLD, DISABLED_GUARDRAIL); + assertMaxThresholdValid("CREATE KEYSPACE ks WITH replication = { 'class': 'NetworkTopologyStrategy', 'datacenter1': 2}"); + assertMaxThresholdValid("ALTER KEYSPACE ks WITH replication = { 'class': 'NetworkTopologyStrategy', 'datacenter1': 1}"); + } + + @Test + public void testMaxKeyspaceRFOnlyWarnAbove() throws Throwable + { + guardrails().setMaximumReplicationFactorThreshold(MAXIMUM_REPLICATION_FACTOR_WARN_THRESHOLD, DISABLED_GUARDRAIL); + assertWarns("CREATE KEYSPACE ks WITH replication = { 'class': 'NetworkTopologyStrategy', 'datacenter1': 3}", 3); + assertWarns("ALTER KEYSPACE ks WITH replication = { 'class': 'NetworkTopologyStrategy', 'datacenter1': 4}", 4); + } + + @Test + public void testMaxKeyspaceRFOnlyFailBelow() throws Throwable + { + guardrails().setMaximumReplicationFactorThreshold(DISABLED_GUARDRAIL, MAXIMUM_REPLICATION_FACTOR_FAIL_THRESHOLD); + assertMaxThresholdValid("CREATE KEYSPACE ks WITH replication = { 'class': 'NetworkTopologyStrategy', 'datacenter1': 2}"); + assertMaxThresholdValid("ALTER KEYSPACE ks WITH replication = { 'class': 'NetworkTopologyStrategy', 'datacenter1': 3}"); + } + + @Test + public void testMaxKeyspaceRFOnlyFailAbove() throws Throwable + { + guardrails().setMaximumReplicationFactorThreshold(DISABLED_GUARDRAIL, MAXIMUM_REPLICATION_FACTOR_FAIL_THRESHOLD); + assertFails("CREATE KEYSPACE ks WITH replication = { 'class': 'NetworkTopologyStrategy', 'datacenter1': 5}", 5); + } + + @Test + public void testMaxKeyspaceRFOnlyFailAboveAlter() throws Throwable + { + guardrails().setMaximumReplicationFactorThreshold(DISABLED_GUARDRAIL, MAXIMUM_REPLICATION_FACTOR_FAIL_THRESHOLD); + execute("CREATE KEYSPACE ks WITH replication = { 'class': 'NetworkTopologyStrategy', 'datacenter1': 3}"); + assertFails("ALTER KEYSPACE ks WITH replication = { 'class': 'NetworkTopologyStrategy', 'datacenter1': 6}", 6); + } + + @Test + public void testMaxKeyspaceRFWarnBelow() throws Throwable + { + guardrails().setMaximumReplicationFactorThreshold(MAXIMUM_REPLICATION_FACTOR_WARN_THRESHOLD, MAXIMUM_REPLICATION_FACTOR_FAIL_THRESHOLD); + 
assertMaxThresholdValid("CREATE KEYSPACE ks WITH replication = { 'class': 'NetworkTopologyStrategy', 'datacenter1': 1}"); + assertMaxThresholdValid("ALTER KEYSPACE ks WITH replication = { 'class': 'NetworkTopologyStrategy', 'datacenter1': 2}"); + } + + @Test + public void testMaxKeyspaceRFWarnFailBetween() throws Throwable + { + guardrails().setMaximumReplicationFactorThreshold(MAXIMUM_REPLICATION_FACTOR_WARN_THRESHOLD, MAXIMUM_REPLICATION_FACTOR_FAIL_THRESHOLD); + assertWarns("CREATE KEYSPACE ks WITH replication = { 'class': 'NetworkTopologyStrategy', 'datacenter1': 3}", 3); + assertWarns("ALTER KEYSPACE ks WITH replication = { 'class': 'NetworkTopologyStrategy', 'datacenter1': 4}", 4); + } + + @Test + public void testMaxKeyspaceRFFailAbove() throws Throwable + { + guardrails().setMaximumReplicationFactorThreshold(MAXIMUM_REPLICATION_FACTOR_WARN_THRESHOLD, MAXIMUM_REPLICATION_FACTOR_FAIL_THRESHOLD); + assertFails("CREATE KEYSPACE ks WITH replication = { 'class': 'NetworkTopologyStrategy', 'datacenter1': 5}", 5); + } + + @Test + public void testMaxKeyspaceRFFailAboveAlter() throws Throwable + { + guardrails().setMaximumReplicationFactorThreshold(MAXIMUM_REPLICATION_FACTOR_WARN_THRESHOLD, MAXIMUM_REPLICATION_FACTOR_FAIL_THRESHOLD); + execute("CREATE KEYSPACE ks WITH replication = { 'class': 'NetworkTopologyStrategy', 'datacenter1': 4}"); + assertFails("ALTER KEYSPACE ks WITH replication = { 'class': 'NetworkTopologyStrategy', 'datacenter1': 5}", 5); + } + + @Test + public void testMaxRFLesserThanDefaultRF() + { + DatabaseDescriptor.setDefaultKeyspaceRF(3); + Assertions.assertThatThrownBy(() -> guardrails().setMaximumReplicationFactorThreshold(1, 2)) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageContaining("maximum_replication_factor_fail_threshold to be set (2) cannot be lesser than default_keyspace_rf (3)"); + + DatabaseDescriptor.setDefaultKeyspaceRF(1); + guardrails().setMaximumReplicationFactorThreshold(1, 2); + Assertions.assertThatThrownBy(() -> DatabaseDescriptor.setDefaultKeyspaceRF(3)) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageContaining("default_keyspace_rf to be set (3) cannot be greater than maximum_replication_factor_fail_threshold (2)"); + } + + private void assertWarns(String query, int rf) throws Throwable + { + assertWarns(query, format("The keyspace ks has a replication factor of %d, above the warning threshold of %s.", + rf, MAXIMUM_REPLICATION_FACTOR_WARN_THRESHOLD)); + } + + private void assertFails(String query, int rf) throws Throwable + { + assertFails(query, format("The keyspace ks has a replication factor of %d, above the failure threshold of %s.", + rf, MAXIMUM_REPLICATION_FACTOR_FAIL_THRESHOLD)); + } +} diff --git a/test/unit/org/apache/cassandra/db/guardrails/GuardrailMinimumReplicationFactorTest.java b/test/unit/org/apache/cassandra/db/guardrails/GuardrailMinimumReplicationFactorTest.java index 9c984e99f5bc..8817f9a8c625 100644 --- a/test/unit/org/apache/cassandra/db/guardrails/GuardrailMinimumReplicationFactorTest.java +++ b/test/unit/org/apache/cassandra/db/guardrails/GuardrailMinimumReplicationFactorTest.java @@ -31,7 +31,6 @@ import org.apache.cassandra.ServerTestUtils; import org.apache.cassandra.config.DatabaseDescriptor; import org.apache.cassandra.db.Keyspace; -import org.apache.cassandra.exceptions.ConfigurationException; import org.apache.cassandra.locator.AbstractEndpointSnitch; import org.apache.cassandra.locator.IEndpointSnitch; import org.apache.cassandra.locator.InetAddressAndPort; @@ -90,6 +89,7 @@ protected 
List getWarnings() { List warnings = ClientWarn.instance.getWarnings(); + // filtering out non-guardrails produced warnings return warnings == null ? Collections.emptyList() : warnings.stream() @@ -133,13 +133,21 @@ public void testMinKeyspaceRFDisabled() throws Throwable } @Test - public void testSimpleStrategy() throws Throwable + public void testSimpleStrategyCreate() throws Throwable { guardrails().setMinimumReplicationFactorThreshold(MINIMUM_REPLICATION_FACTOR_WARN_THRESHOLD, MINIMUM_REPLICATION_FACTOR_FAIL_THRESHOLD); - assertWarns("CREATE KEYSPACE ks WITH replication = { 'class': 'SimpleStrategy', 'replication_factor': 3}", - format("The keyspace %s has a replication factor of 3, below the warning threshold of %s.", KS, MINIMUM_REPLICATION_FACTOR_WARN_THRESHOLD)); - assertFails("ALTER KEYSPACE ks WITH replication = { 'class': 'SimpleStrategy', 'replication_factor': 1}", - format("The keyspace %s has a replication factor of 1, below the failure threshold of %s.", KS, MINIMUM_REPLICATION_FACTOR_FAIL_THRESHOLD)); + assertWarns("CREATE KEYSPACE ks WITH replication = { 'class': 'SimpleStrategy', 'replication_factor': 3}", 3); + execute("DROP KEYSPACE IF EXISTS ks"); + assertFails("CREATE KEYSPACE ks WITH replication = { 'class': 'SimpleStrategy', 'replication_factor': 1}", 1); + } + + @Test + public void testSimpleStrategyAlter() throws Throwable + { + guardrails().setMinimumReplicationFactorThreshold(MINIMUM_REPLICATION_FACTOR_WARN_THRESHOLD, MINIMUM_REPLICATION_FACTOR_FAIL_THRESHOLD); + execute("CREATE KEYSPACE ks WITH replication = { 'class': 'SimpleStrategy', 'replication_factor': 4}"); + assertWarns("ALTER KEYSPACE ks WITH replication = { 'class': 'SimpleStrategy', 'replication_factor': 3}", 3); + assertFails("ALTER KEYSPACE ks WITH replication = { 'class': 'SimpleStrategy', 'replication_factor': 1}", 1); } @Test @@ -162,17 +170,26 @@ public void testMultipleDatacenter() throws Throwable List twoWarnings = Arrays.asList(format("The keyspace %s has a replication factor of 2, below the warning threshold of %d.", KS, MINIMUM_REPLICATION_FACTOR_WARN_THRESHOLD), format("The keyspace %s has a replication factor of 2, below the warning threshold of %d.", KS, MINIMUM_REPLICATION_FACTOR_WARN_THRESHOLD)); - + StorageService.instance.getTokenMetadata().updateHostId(UUID.randomUUID(), InetAddressAndPort.getByName("127.0.0.255")); guardrails().setMinimumReplicationFactorThreshold(MINIMUM_REPLICATION_FACTOR_WARN_THRESHOLD, MINIMUM_REPLICATION_FACTOR_FAIL_THRESHOLD); - assertValid("CREATE KEYSPACE ks WITH replication = { 'class' : 'NetworkTopologyStrategy', 'datacenter1': 4, 'datacenter2' : 4 };"); - assertWarns("ALTER KEYSPACE ks WITH replication = { 'class' : 'NetworkTopologyStrategy', 'datacenter1': 4, 'datacenter2' : 2 };", - format("The keyspace %s has a replication factor of 2, below the warning threshold of %d.", KS, MINIMUM_REPLICATION_FACTOR_WARN_THRESHOLD)); - assertWarns("ALTER KEYSPACE ks WITH replication = { 'class' : 'NetworkTopologyStrategy', 'datacenter1': 2, 'datacenter2' : 2 };", twoWarnings); - assertFails("ALTER KEYSPACE ks WITH replication = { 'class' : 'NetworkTopologyStrategy', 'datacenter1': 4, 'datacenter2' : 1 };", - format("The keyspace %s has a replication factor of 1, below the failure threshold of %d.", KS, MINIMUM_REPLICATION_FACTOR_FAIL_THRESHOLD)); - assertFails("CREATE KEYSPACE ks1 WITH replication = { 'class' : 'NetworkTopologyStrategy', 'datacenter1': 1, 'datacenter2' : 1 };", - format("The keyspace ks1 has a replication factor of 1, below the failure threshold 
of %d.", MINIMUM_REPLICATION_FACTOR_FAIL_THRESHOLD)); + assertValid("CREATE KEYSPACE ks WITH replication = { 'class' : 'NetworkTopologyStrategy', 'datacenter1': 4, 'datacenter2' : 4 }"); + execute("DROP KEYSPACE IF EXISTS ks"); + assertWarns("CREATE KEYSPACE ks WITH replication = { 'class' : 'NetworkTopologyStrategy', 'datacenter1': 4, 'datacenter2' : 2 }", 2); + execute("DROP KEYSPACE IF EXISTS ks"); + assertWarns("CREATE KEYSPACE ks WITH replication = { 'class' : 'NetworkTopologyStrategy', 'datacenter1': 2, 'datacenter2' : 2 }", twoWarnings); + execute("DROP KEYSPACE IF EXISTS ks"); + assertFails("CREATE KEYSPACE ks WITH replication = { 'class' : 'NetworkTopologyStrategy', 'datacenter1': 4, 'datacenter2' : 1 }", 1); + execute("DROP KEYSPACE IF EXISTS ks"); + assertFails("CREATE KEYSPACE ks WITH replication = { 'class' : 'NetworkTopologyStrategy', 'datacenter1': 1, 'datacenter2' : 1 }", 1); + execute("DROP KEYSPACE IF EXISTS ks"); + + execute("CREATE KEYSPACE ks WITH replication = { 'class' : 'NetworkTopologyStrategy', 'datacenter1': 5, 'datacenter2' : 5}"); + assertValid("ALTER KEYSPACE ks WITH replication = { 'class' : 'NetworkTopologyStrategy', 'datacenter1': 4, 'datacenter2' : 4 }"); + assertWarns("ALTER KEYSPACE ks WITH replication = { 'class' : 'NetworkTopologyStrategy', 'datacenter1': 4, 'datacenter2' : 2 }", 2); + assertWarns("ALTER KEYSPACE ks WITH replication = { 'class' : 'NetworkTopologyStrategy', 'datacenter1': 2, 'datacenter2' : 2 }", twoWarnings); + assertFails("ALTER KEYSPACE ks WITH replication = { 'class' : 'NetworkTopologyStrategy', 'datacenter1': 4, 'datacenter2' : 1 }", 1); + assertFails("ALTER KEYSPACE ks WITH replication = { 'class' : 'NetworkTopologyStrategy', 'datacenter1': 1, 'datacenter2' : 1 }", 1); DatabaseDescriptor.setEndpointSnitch(snitch); execute("DROP KEYSPACE IF EXISTS ks1"); @@ -190,10 +207,8 @@ public void testMinKeyspaceRFOnlyWarnAbove() throws Throwable public void testMinKeyspaceRFOnlyWarnBelow() throws Throwable { guardrails().setMinimumReplicationFactorThreshold(MINIMUM_REPLICATION_FACTOR_WARN_THRESHOLD, DISABLED_GUARDRAIL); - assertWarns("CREATE KEYSPACE ks WITH replication = { 'class': 'NetworkTopologyStrategy', 'datacenter1': 3}", - format("The keyspace %s has a replication factor of 3, below the warning threshold of %s.", KS, MINIMUM_REPLICATION_FACTOR_WARN_THRESHOLD)); - assertWarns("ALTER KEYSPACE ks WITH replication = { 'class': 'NetworkTopologyStrategy', 'datacenter1': 2}", - format("The keyspace %s has a replication factor of 2, below the warning threshold of %s.", KS, MINIMUM_REPLICATION_FACTOR_WARN_THRESHOLD)); + assertWarns("CREATE KEYSPACE ks WITH replication = { 'class': 'NetworkTopologyStrategy', 'datacenter1': 3}", 3); + assertWarns("ALTER KEYSPACE ks WITH replication = { 'class': 'NetworkTopologyStrategy', 'datacenter1': 2}", 2); } @Test @@ -208,8 +223,7 @@ public void testMinKeyspaceRFOnlyFailAbove() throws Throwable public void testMinKeyspaceRFOnlyFailBelow() throws Throwable { guardrails().setMinimumReplicationFactorThreshold(DISABLED_GUARDRAIL, MINIMUM_REPLICATION_FACTOR_FAIL_THRESHOLD); - assertFails("CREATE KEYSPACE ks WITH replication = { 'class': 'NetworkTopologyStrategy', 'datacenter1': 1}", - format("The keyspace %s has a replication factor of 1, below the failure threshold of %s.", KS, MINIMUM_REPLICATION_FACTOR_FAIL_THRESHOLD)); + assertFails("CREATE KEYSPACE ks WITH replication = { 'class': 'NetworkTopologyStrategy', 'datacenter1': 1}", 1); } @Test @@ -217,8 +231,7 @@ public void 
testMinKeyspaceRFOnlyFailBelowAlter() throws Throwable { guardrails().setMinimumReplicationFactorThreshold(DISABLED_GUARDRAIL, MINIMUM_REPLICATION_FACTOR_FAIL_THRESHOLD); execute("CREATE KEYSPACE ks WITH replication = { 'class': 'NetworkTopologyStrategy', 'datacenter1': 3}"); - assertFails("ALTER KEYSPACE ks WITH replication = { 'class': 'NetworkTopologyStrategy', 'datacenter1': 1}", - format("The keyspace %s has a replication factor of 1, below the failure threshold of %s.", KS, MINIMUM_REPLICATION_FACTOR_FAIL_THRESHOLD)); + assertFails("ALTER KEYSPACE ks WITH replication = { 'class': 'NetworkTopologyStrategy', 'datacenter1': 1}", 1); } @Test @@ -233,18 +246,15 @@ public void testMinKeyspaceRFWarnAbove() throws Throwable public void testMinKeyspaceRFWarnFailBetween() throws Throwable { guardrails().setMinimumReplicationFactorThreshold(MINIMUM_REPLICATION_FACTOR_WARN_THRESHOLD, MINIMUM_REPLICATION_FACTOR_FAIL_THRESHOLD); - assertWarns("CREATE KEYSPACE ks WITH replication = { 'class': 'NetworkTopologyStrategy', 'datacenter1': 3}", - format("The keyspace %s has a replication factor of 3, below the warning threshold of %s.", KS, MINIMUM_REPLICATION_FACTOR_WARN_THRESHOLD)); - assertWarns("ALTER KEYSPACE ks WITH replication = { 'class': 'NetworkTopologyStrategy', 'datacenter1': 2}", - format("The keyspace %s has a replication factor of 2, below the warning threshold of %s.", KS, MINIMUM_REPLICATION_FACTOR_WARN_THRESHOLD)); + assertWarns("CREATE KEYSPACE ks WITH replication = { 'class': 'NetworkTopologyStrategy', 'datacenter1': 3}", 3); + assertWarns("ALTER KEYSPACE ks WITH replication = { 'class': 'NetworkTopologyStrategy', 'datacenter1': 2}", 2); } @Test public void testMinKeyspaceRFFailBelow() throws Throwable { guardrails().setMinimumReplicationFactorThreshold(MINIMUM_REPLICATION_FACTOR_WARN_THRESHOLD, MINIMUM_REPLICATION_FACTOR_FAIL_THRESHOLD); - assertFails("CREATE KEYSPACE ks WITH replication = { 'class': 'NetworkTopologyStrategy', 'datacenter1': 1}", - format("The keyspace %s has a replication factor of 1, below the failure threshold of %s.", KS, MINIMUM_REPLICATION_FACTOR_FAIL_THRESHOLD)); + assertFails("CREATE KEYSPACE ks WITH replication = { 'class': 'NetworkTopologyStrategy', 'datacenter1': 1}", 1); } @Test @@ -252,26 +262,33 @@ public void testMinKeyspaceRFFailBelowAlter() throws Throwable { guardrails().setMinimumReplicationFactorThreshold(MINIMUM_REPLICATION_FACTOR_WARN_THRESHOLD, MINIMUM_REPLICATION_FACTOR_FAIL_THRESHOLD); execute("CREATE KEYSPACE ks WITH replication = { 'class': 'NetworkTopologyStrategy', 'datacenter1': 4}"); - assertFails("ALTER KEYSPACE ks WITH replication = { 'class': 'NetworkTopologyStrategy', 'datacenter1': 1}", - format("The keyspace %s has a replication factor of 1, below the failure threshold of %s.", KS, MINIMUM_REPLICATION_FACTOR_FAIL_THRESHOLD)); + assertFails("ALTER KEYSPACE ks WITH replication = { 'class': 'NetworkTopologyStrategy', 'datacenter1': 1}", 1); } @Test public void testMinRFGreaterThanDefaultRF() { - try - { - DatabaseDescriptor.setDefaultKeyspaceRF(1); - guardrails().setMinimumReplicationFactorThreshold(MINIMUM_REPLICATION_FACTOR_WARN_THRESHOLD, MINIMUM_REPLICATION_FACTOR_FAIL_THRESHOLD); - } - catch (IllegalArgumentException e) - { - String expectedMessage = ""; + DatabaseDescriptor.setDefaultKeyspaceRF(3); + Assertions.assertThatThrownBy(() -> guardrails().setMinimumReplicationFactorThreshold(5, 4)) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageContaining("minimum_replication_factor_fail_threshold to be set (4) cannot be 
greater than default_keyspace_rf (3)"); + + DatabaseDescriptor.setDefaultKeyspaceRF(6); + guardrails().setMinimumReplicationFactorThreshold(5, 4); + Assertions.assertThatThrownBy(() -> DatabaseDescriptor.setDefaultKeyspaceRF(3)) + .isInstanceOf(IllegalArgumentException.class) + .hasMessageContaining("default_keyspace_rf to be set (3) cannot be less than minimum_replication_factor_fail_threshold (4)"); + } - if(guardrails().getMinimumReplicationFactorFailThreshold() > DatabaseDescriptor.getDefaultKeyspaceRF()) - expectedMessage = format("%s_fail_threshold to be set (%d) cannot be greater than default_keyspace_rf (%d)", - WHAT, guardrails().getMinimumReplicationFactorFailThreshold(), DatabaseDescriptor.getDefaultKeyspaceRF()); - Assertions.assertThat(e.getMessage()).contains(expectedMessage); - } + private void assertWarns(String query, int rf) throws Throwable + { + assertWarns(query, format("The keyspace ks has a replication factor of %d, below the warning threshold of %s.", + rf, MINIMUM_REPLICATION_FACTOR_WARN_THRESHOLD)); + } + + private void assertFails(String query, int rf) throws Throwable + { + assertFails(query, format("The keyspace ks has a replication factor of %d, below the failure threshold of %s.", + rf, MINIMUM_REPLICATION_FACTOR_FAIL_THRESHOLD)); } } From e966c45afcf8bef47df245ccb851386e5ce60505 Mon Sep 17 00:00:00 2001 From: jacek-lewandowski Date: Fri, 6 Nov 2020 14:59:56 +0100 Subject: [PATCH 019/159] ByteComparable API Provides an API for converting all values of types that can be used in primary keys to byte sequences that can be compared lexicographically by unsigned byte value (i.e. byte-comparable sequences) and back. patch by Branimir Lambov, Dimitar Dimitrov and Jacek Lewandowski; reviewed by Caleb Rackliffe, Dimitar Dimitrov, Jacek Lewandowski and Aleksey Yeschenko for CASSANDRA-6936 --- CHANGES.txt | 1 + .../org/apache/cassandra/cql3/CQL3Type.java | 2 +- .../org/apache/cassandra/cql3/Tuples.java | 7 +- .../org/apache/cassandra/cql3/UserTypes.java | 2 +- .../cql3/conditions/ColumnCondition.java | 4 +- .../cql3/selection/FieldSelector.java | 3 +- .../cassandra/db/BufferDecoratedKey.java | 26 + .../cassandra/db/ClusteringComparator.java | 271 ++++ .../apache/cassandra/db/ClusteringPrefix.java | 57 +- .../org/apache/cassandra/db/DataRange.java | 29 + .../org/apache/cassandra/db/DecoratedKey.java | 72 +- .../cassandra/db/NativeDecoratedKey.java | 36 +- .../cassandra/db/PartitionPosition.java | 25 +- .../apache/cassandra/db/SystemKeyspace.java | 3 +- .../db/columniterator/SSTableIterator.java | 1 + .../SSTableReversedIterator.java | 1 + .../db/marshal/AbstractTimeUUIDType.java | 69 +- .../cassandra/db/marshal/AbstractType.java | 89 +- .../cassandra/db/marshal/BooleanType.java | 27 +- .../db/marshal/ByteArrayAccessor.java | 7 + .../db/marshal/ByteArrayObjectFactory.java | 35 +- .../db/marshal/ByteBufferAccessor.java | 7 + .../db/marshal/ByteBufferObjectFactory.java | 25 +- .../apache/cassandra/db/marshal/ByteType.java | 17 + .../cassandra/db/marshal/CollectionType.java | 90 ++ .../cassandra/db/marshal/CompositeType.java | 95 +- .../apache/cassandra/db/marshal/DateType.java | 16 + .../cassandra/db/marshal/DecimalType.java | 204 +++ .../cassandra/db/marshal/DoubleType.java | 15 + .../db/marshal/DynamicCompositeType.java | 207 +++ .../cassandra/db/marshal/EmptyType.java | 14 + .../cassandra/db/marshal/FloatType.java | 15 + .../cassandra/db/marshal/Int32Type.java | 15 + .../cassandra/db/marshal/IntegerType.java | 307 +++++ .../cassandra/db/marshal/LexicalUUIDType.java | 43 
+ .../apache/cassandra/db/marshal/ListType.java | 55 +- .../apache/cassandra/db/marshal/LongType.java | 25 + .../apache/cassandra/db/marshal/MapType.java | 74 +- .../db/marshal/PartitionerDefinedOrder.java | 31 + .../cassandra/db/marshal/ReversedType.java | 62 + .../apache/cassandra/db/marshal/SetType.java | 18 +- .../cassandra/db/marshal/ShortType.java | 16 + .../cassandra/db/marshal/SimpleDateType.java | 18 + .../apache/cassandra/db/marshal/TimeType.java | 18 + .../cassandra/db/marshal/TimestampType.java | 15 + .../cassandra/db/marshal/TupleType.java | 126 +- .../apache/cassandra/db/marshal/UUIDType.java | 65 + .../apache/cassandra/db/marshal/UserType.java | 2 +- .../cassandra/db/marshal/ValueAccessor.java | 8 +- .../cassandra/db/rows/EncodingStats.java | 2 +- .../cassandra/dht/ByteOrderedPartitioner.java | 14 + .../cassandra/dht/LocalPartitioner.java | 15 + .../cassandra/dht/Murmur3Partitioner.java | 15 + .../dht/OrderPreservingPartitioner.java | 14 + .../cassandra/dht/RandomPartitioner.java | 15 + src/java/org/apache/cassandra/dht/Token.java | 55 +- .../serializers/BooleanSerializer.java | 4 +- .../serializers/CollectionSerializer.java | 5 - .../cassandra/serializers/MapSerializer.java | 4 +- .../cassandra/serializers/SetSerializer.java | 4 +- .../service/paxos/PaxosRepairHistory.java | 3 +- .../utils/bytecomparable/ByteComparable.java | 163 +++ .../utils/bytecomparable/ByteComparable.md | 693 ++++++++++ .../utils/bytecomparable/ByteSource.java | 853 ++++++++++++ .../bytecomparable/ByteSourceInverse.java | 471 +++++++ .../AbstractTypeByteSourceDecodingBench.java | 140 ++ test/unit/org/apache/cassandra/Util.java | 194 +++ .../validation/entities/TupleTypeTest.java | 6 +- .../validation/entities/UserTypesTest.java | 50 + .../db/marshal/DynamicCompositeTypeTest.java | 9 +- .../db/marshal/TypeValidationTest.java | 2 +- .../cassandra/dht/KeyCollisionTest.java | 9 + .../cassandra/dht/LengthPartitioner.java | 7 + .../cassandra/transport/SerDeserTest.java | 2 +- .../AbstractTypeByteSourceTest.java | 1015 ++++++++++++++ .../ByteSourceComparisonTest.java | 1178 +++++++++++++++++ .../ByteSourceConversionTest.java | 784 +++++++++++ .../bytecomparable/ByteSourceInverseTest.java | 397 ++++++ .../ByteSourceSequenceTest.java | 784 +++++++++++ .../bytecomparable/ByteSourceTestBase.java | 513 +++++++ .../DecoratedKeyByteSourceTest.java | 85 ++ 81 files changed, 9730 insertions(+), 145 deletions(-) create mode 100644 src/java/org/apache/cassandra/utils/bytecomparable/ByteComparable.java create mode 100644 src/java/org/apache/cassandra/utils/bytecomparable/ByteComparable.md create mode 100644 src/java/org/apache/cassandra/utils/bytecomparable/ByteSource.java create mode 100644 src/java/org/apache/cassandra/utils/bytecomparable/ByteSourceInverse.java create mode 100644 test/microbench/org/apache/cassandra/test/microbench/AbstractTypeByteSourceDecodingBench.java create mode 100644 test/unit/org/apache/cassandra/utils/bytecomparable/AbstractTypeByteSourceTest.java create mode 100644 test/unit/org/apache/cassandra/utils/bytecomparable/ByteSourceComparisonTest.java create mode 100644 test/unit/org/apache/cassandra/utils/bytecomparable/ByteSourceConversionTest.java create mode 100644 test/unit/org/apache/cassandra/utils/bytecomparable/ByteSourceInverseTest.java create mode 100644 test/unit/org/apache/cassandra/utils/bytecomparable/ByteSourceSequenceTest.java create mode 100644 test/unit/org/apache/cassandra/utils/bytecomparable/ByteSourceTestBase.java create mode 100644 
test/unit/org/apache/cassandra/utils/bytecomparable/DecoratedKeyByteSourceTest.java diff --git a/CHANGES.txt b/CHANGES.txt index 6a7eae004d47..fc0e2dc86400 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -1,4 +1,5 @@ 4.2 + * Add ByteComparable API (CASSANDRA-6936) * Add guardrail for maximum replication factor (CASSANDRA-17500) * Increment CQLSH to version 6.2.0 for release 4.2 (CASSANDRA-17646) * Adding support to perform certificate based internode authentication (CASSANDRA-17661) diff --git a/src/java/org/apache/cassandra/cql3/CQL3Type.java b/src/java/org/apache/cassandra/cql3/CQL3Type.java index 1d792b24531b..1c20e6b0ff1e 100644 --- a/src/java/org/apache/cassandra/cql3/CQL3Type.java +++ b/src/java/org/apache/cassandra/cql3/CQL3Type.java @@ -201,7 +201,7 @@ public String toCQLLiteral(ByteBuffer buffer, ProtocolVersion version) StringBuilder target = new StringBuilder(); buffer = buffer.duplicate(); - int size = CollectionSerializer.readCollectionSize(buffer, version); + int size = CollectionSerializer.readCollectionSize(buffer, ByteBufferAccessor.instance, version); buffer.position(buffer.position() + CollectionSerializer.sizeOfCollectionSize(size, version)); switch (type.kind) diff --git a/src/java/org/apache/cassandra/cql3/Tuples.java b/src/java/org/apache/cassandra/cql3/Tuples.java index b8acd5954af4..6e028c274d31 100644 --- a/src/java/org/apache/cassandra/cql3/Tuples.java +++ b/src/java/org/apache/cassandra/cql3/Tuples.java @@ -154,14 +154,14 @@ public Value(ByteBuffer[] elements) public static Value fromSerialized(ByteBuffer bytes, TupleType type) { - ByteBuffer[] values = type.split(bytes); + ByteBuffer[] values = type.split(ByteBufferAccessor.instance, bytes); if (values.length > type.size()) { throw new InvalidRequestException(String.format( "Tuple value contained too many fields (expected %s, got %s)", type.size(), values.length)); } - return new Value(type.split(bytes)); + return new Value(type.split(ByteBufferAccessor.instance, bytes)); } public ByteBuffer get(ProtocolVersion protocolVersion) @@ -272,7 +272,8 @@ public static InValue fromSerialized(ByteBuffer value, ListType type, QueryOptio // type.split(bytes) List> elements = new ArrayList<>(l.size()); for (Object element : l) - elements.add(Arrays.asList(tupleType.split(type.getElementsType().decompose(element)))); + elements.add(Arrays.asList(tupleType.split(ByteBufferAccessor.instance, + type.getElementsType().decompose(element)))); return new InValue(elements); } catch (MarshalException e) diff --git a/src/java/org/apache/cassandra/cql3/UserTypes.java b/src/java/org/apache/cassandra/cql3/UserTypes.java index b023a8a0b8f6..a63420fca3cd 100644 --- a/src/java/org/apache/cassandra/cql3/UserTypes.java +++ b/src/java/org/apache/cassandra/cql3/UserTypes.java @@ -217,7 +217,7 @@ public Value(UserType type, ByteBuffer[] elements) public static Value fromSerialized(ByteBuffer bytes, UserType type) { type.validate(bytes); - return new Value(type, type.split(bytes)); + return new Value(type, type.split(ByteBufferAccessor.instance, bytes)); } public ByteBuffer get(ProtocolVersion protocolVersion) diff --git a/src/java/org/apache/cassandra/cql3/conditions/ColumnCondition.java b/src/java/org/apache/cassandra/cql3/conditions/ColumnCondition.java index e3f463a25521..68cf2d3782b0 100644 --- a/src/java/org/apache/cassandra/cql3/conditions/ColumnCondition.java +++ b/src/java/org/apache/cassandra/cql3/conditions/ColumnCondition.java @@ -650,8 +650,8 @@ private ByteBuffer rowValue(Row row) Cell cell = getCell(row, column); return cell == null 
- ? null - : userType.split(cell.buffer())[userType.fieldPosition(field)]; + ? null + : userType.split(ByteBufferAccessor.instance, cell.buffer())[userType.fieldPosition(field)]; } private boolean isSatisfiedBy(ByteBuffer rowValue) diff --git a/src/java/org/apache/cassandra/cql3/selection/FieldSelector.java b/src/java/org/apache/cassandra/cql3/selection/FieldSelector.java index 0c623976810e..ddcc868cf3a1 100644 --- a/src/java/org/apache/cassandra/cql3/selection/FieldSelector.java +++ b/src/java/org/apache/cassandra/cql3/selection/FieldSelector.java @@ -27,6 +27,7 @@ import org.apache.cassandra.db.TypeSizes; import org.apache.cassandra.db.filter.ColumnFilter; import org.apache.cassandra.db.marshal.AbstractType; +import org.apache.cassandra.db.marshal.ByteBufferAccessor; import org.apache.cassandra.db.marshal.UserType; import org.apache.cassandra.exceptions.InvalidRequestException; import org.apache.cassandra.io.util.DataInputPlus; @@ -108,7 +109,7 @@ public ByteBuffer getOutput(ProtocolVersion protocolVersion) ByteBuffer value = selected.getOutput(protocolVersion); if (value == null) return null; - ByteBuffer[] buffers = type.split(value); + ByteBuffer[] buffers = type.split(ByteBufferAccessor.instance, value); return field < buffers.length ? buffers[field] : null; } diff --git a/src/java/org/apache/cassandra/db/BufferDecoratedKey.java b/src/java/org/apache/cassandra/db/BufferDecoratedKey.java index d375162240d7..ae3e9d44e08a 100644 --- a/src/java/org/apache/cassandra/db/BufferDecoratedKey.java +++ b/src/java/org/apache/cassandra/db/BufferDecoratedKey.java @@ -19,7 +19,9 @@ import java.nio.ByteBuffer; +import org.apache.cassandra.dht.IPartitioner; import org.apache.cassandra.dht.Token; +import org.apache.cassandra.utils.bytecomparable.ByteComparable; public class BufferDecoratedKey extends DecoratedKey { @@ -36,4 +38,28 @@ public ByteBuffer getKey() { return key; } + + /** + * A factory method that translates the given byte-comparable representation to a {@link BufferDecoratedKey} + * instance. If the given byte comparable doesn't represent the encoding of a buffer decorated key, anything from a + * wide variety of throwables may be thrown (e.g. {@link AssertionError}, {@link IndexOutOfBoundsException}, + * {@link IllegalStateException}, etc.). + * + * @param byteComparable A byte-comparable representation (presumably of a {@link BufferDecoratedKey} instance). + * @param version The encoding version used for the given byte comparable. + * @param partitioner The partitioner of the encoded decorated key. Needed in order to correctly decode the token + * bytes of the key. + * @return A new {@link BufferDecoratedKey} instance, corresponding to the given byte-comparable representation. If + * we were to call {@link #asComparableBytes(Version)} on the returned object, we should get a {@link ByteSource} + * equal to the one of the input byte comparable. 
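+     * For example, since a decorated key is itself a {@link ByteComparable}, feeding an existing key's
+     * byte-comparable form back through this method (with the same version and partitioner) should
+     * reconstruct an equal key.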
+ */ + public static BufferDecoratedKey fromByteComparable(ByteComparable byteComparable, + Version version, + IPartitioner partitioner) + { + return DecoratedKey.fromByteComparable(byteComparable, + version, + partitioner, + (token, keyBytes) -> new BufferDecoratedKey(token, ByteBuffer.wrap(keyBytes))); + } } diff --git a/src/java/org/apache/cassandra/db/ClusteringComparator.java b/src/java/org/apache/cassandra/db/ClusteringComparator.java index fdc450813ff2..c1aebfad775f 100644 --- a/src/java/org/apache/cassandra/db/ClusteringComparator.java +++ b/src/java/org/apache/cassandra/db/ClusteringComparator.java @@ -18,6 +18,7 @@ package org.apache.cassandra.db; import java.nio.ByteBuffer; +import java.util.Arrays; import java.util.Comparator; import java.util.List; import java.util.Objects; @@ -31,6 +32,15 @@ import org.apache.cassandra.serializers.MarshalException; import org.apache.cassandra.io.sstable.IndexInfo; +import org.apache.cassandra.utils.bytecomparable.ByteComparable; +import org.apache.cassandra.utils.bytecomparable.ByteSource; + +import static org.apache.cassandra.utils.bytecomparable.ByteSource.EXCLUDED; +import static org.apache.cassandra.utils.bytecomparable.ByteSource.NEXT_COMPONENT; +import static org.apache.cassandra.utils.bytecomparable.ByteSource.NEXT_COMPONENT_EMPTY; +import static org.apache.cassandra.utils.bytecomparable.ByteSource.NEXT_COMPONENT_EMPTY_REVERSED; +import static org.apache.cassandra.utils.bytecomparable.ByteSource.NEXT_COMPONENT_NULL; +import static org.apache.cassandra.utils.bytecomparable.ByteSource.TERMINATOR; /** * A comparator of clustering prefixes (or more generally of {@link Clusterable}}. @@ -232,6 +242,267 @@ public void validate(ClusteringPrefix clustering) } } + /** + * Produce a prefix-free byte-comparable representation of the given value, i.e. such a sequence of bytes that any + * pair x, y of valid values of this type + * compare(x, y) == compareLexicographicallyUnsigned(asByteComparable(x), asByteComparable(y)) + * and + * asByteComparable(x) is not a prefix of asByteComparable(y) + */ + public ByteComparable asByteComparable(ClusteringPrefix clustering) + { + return new ByteComparableClustering<>(clustering); + } + + /** + * A prefix-free byte-comparable representation for a clustering or prefix. + * + * Adds a NEXT_COMPONENT byte before each component (allowing inclusive/exclusive bounds over incomplete prefixes + * of that length) and finishes with a suitable byte for the clustering kind. Also deals with null entries. + * + * Since all types' encodings are weakly prefix-free, this is guaranteed to be prefix-free as long as the + * bound/ClusteringPrefix terminators are different from the separator byte. It is okay for the terminator for + * Clustering to be the same as the separator, as all Clusterings must be completely specified. + * + * See also {@link AbstractType#asComparableBytes}. 
+ * + * Some examples: + * "A", 0005, Clustering -> 40 4100 40 0005 40 + * "B", 0006, InclusiveEnd -> 40 4200 40 0006 60 + * "A", ExclusiveStart -> 40 4100 60 + * "", null, Clustering -> 40 00 3F 40 + * "", 0000, Clustering -> 40 00 40 0000 40 + * BOTTOM -> 20 + */ + private class ByteComparableClustering implements ByteComparable + { + private final ClusteringPrefix src; + + ByteComparableClustering(ClusteringPrefix src) + { + this.src = src; + } + + @Override + public ByteSource asComparableBytes(Version version) + { + return new ByteSource() + { + private ByteSource current = null; + private int srcnum = -1; + + @Override + public int next() + { + if (current != null) + { + int b = current.next(); + if (b > END_OF_STREAM) + return b; + current = null; + } + + int sz = src.size(); + if (srcnum == sz) + return END_OF_STREAM; + + ++srcnum; + if (srcnum == sz) + return src.kind().asByteComparableValue(version); + + final V nextComponent = src.get(srcnum); + // We can have a null as the clustering component (this is a relic of COMPACT STORAGE, but also + // can appear in indexed partitions with no rows but static content), + if (nextComponent == null) + { + if (version != Version.LEGACY) + return NEXT_COMPONENT_NULL; // always sorts before non-nulls, including for reversed types + else + { + // legacy version did not permit nulls in clustering keys and treated these as null values + return subtype(srcnum).isReversed() ? NEXT_COMPONENT_EMPTY_REVERSED : NEXT_COMPONENT_EMPTY; + } + } + + current = subtype(srcnum).asComparableBytes(src.accessor(), nextComponent, version); + // and also null values for some types (e.g. int, varint but not text) that are encoded as empty + // buffers. + if (current == null) + return subtype(srcnum).isReversed() ? NEXT_COMPONENT_EMPTY_REVERSED : NEXT_COMPONENT_EMPTY; + + return NEXT_COMPONENT; + } + }; + } + + public String toString() + { + return src.clusteringString(subtypes()); + } + } + + /** + * Produces a clustering from the given byte-comparable value. The method will throw an exception if the value + * does not correctly encode a clustering of this type, including if it encodes a position before or after a + * clustering (i.e. a bound/boundary). + * + * @param accessor Accessor to use to construct components. + * @param comparable The clustering encoded as a byte-comparable sequence. + */ + public Clustering clusteringFromByteComparable(ValueAccessor accessor, ByteComparable comparable) + { + ByteComparable.Version version = ByteComparable.Version.OSS42; + ByteSource.Peekable orderedBytes = ByteSource.peekable(comparable.asComparableBytes(version)); + + // First check for special cases (partition key only, static clustering) that can do without buffers. + int sep = orderedBytes.next(); + switch (sep) + { + case TERMINATOR: + assert size() == 0 : "Terminator should be after " + size() + " components, got 0"; + return accessor.factory().clustering(); + case EXCLUDED: + return accessor.factory().staticClustering(); + default: + // continue with processing + } + + int cc = 0; + V[] components = accessor.createArray(size()); + + while (true) + { + switch (sep) + { + case NEXT_COMPONENT_NULL: + components[cc] = null; + break; + case NEXT_COMPONENT_EMPTY: + case NEXT_COMPONENT_EMPTY_REVERSED: + components[cc] = subtype(cc).fromComparableBytes(accessor, null, version); + break; + case NEXT_COMPONENT: + // Decode the next component, consuming bytes from orderedBytes. 
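+                    // (each type's encoding is weakly prefix-free, so the decoder can tell where this
+                    // component's bytes end and the next separator begins)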
+                    components[cc] = subtype(cc).fromComparableBytes(accessor, orderedBytes, version);
+                    break;
+                case TERMINATOR:
+                    assert cc == size() : "Terminator should be after " + size() + " components, got " + cc;
+                    return accessor.factory().clustering(components);
+                case EXCLUDED:
+                    throw new AssertionError("Unexpected static terminator after the first component");
+                default:
+                    throw new AssertionError("Unexpected separator " + Integer.toHexString(sep) + " in Clustering encoding");
+            }
+            ++cc;
+            sep = orderedBytes.next();
+        }
+    }
+
+    /**
+     * Produces a clustering bound from the given byte-comparable value. The method will throw an exception if the value
+     * does not correctly encode a bound position of this type, including if it encodes an exact clustering.
+     *
+     * Note that the encoded clustering position cannot specify the type of bound (i.e. start/end/boundary) because to
+     * correctly compare clustering positions the encoding must be the same for the different types (e.g. the position
+     * for an exclusive end and an inclusive start is the same, before the exact clustering). The type must be supplied
+     * separately (in the bound... vs boundary... call and isEnd argument).
+     *
+     * @param accessor Accessor to use to construct components.
+     * @param comparable The clustering position encoded as a byte-comparable sequence.
+     * @param isEnd true if the bound marks the end of a range, false if it marks the start.
+     */
+    public <V> ClusteringBound<V> boundFromByteComparable(ValueAccessor<V> accessor,
+                                                          ByteComparable comparable,
+                                                          boolean isEnd)
+    {
+        ByteComparable.Version version = ByteComparable.Version.OSS42;
+        ByteSource.Peekable orderedBytes = ByteSource.peekable(comparable.asComparableBytes(version));
+
+        int sep = orderedBytes.next();
+        int cc = 0;
+        V[] components = accessor.createArray(size());
+
+        while (true)
+        {
+            switch (sep)
+            {
+                case NEXT_COMPONENT_NULL:
+                    components[cc] = null;
+                    break;
+                case NEXT_COMPONENT_EMPTY:
+                case NEXT_COMPONENT_EMPTY_REVERSED:
+                    components[cc] = subtype(cc).fromComparableBytes(accessor, null, version);
+                    break;
+                case NEXT_COMPONENT:
+                    // Decode the next component, consuming bytes from orderedBytes.
+                    components[cc] = subtype(cc).fromComparableBytes(accessor, orderedBytes, version);
+                    break;
+                case ByteSource.LT_NEXT_COMPONENT:
+                    return accessor.factory().bound(isEnd ? ClusteringPrefix.Kind.EXCL_END_BOUND
+                                                          : ClusteringPrefix.Kind.INCL_START_BOUND,
+                                                    Arrays.copyOf(components, cc));
+                case ByteSource.GT_NEXT_COMPONENT:
+                    return accessor.factory().bound(isEnd ? ClusteringPrefix.Kind.INCL_END_BOUND
+                                                          : ClusteringPrefix.Kind.EXCL_START_BOUND,
+                                                    Arrays.copyOf(components, cc));
+                default:
+                    throw new AssertionError("Unexpected separator " + Integer.toHexString(sep) + " in ClusteringBound encoding");
+            }
+            ++cc;
+            sep = orderedBytes.next();
+        }
+    }
+
+    /**
+     * Produces a clustering boundary from the given byte-comparable value. The method will throw an exception if the
+     * value does not correctly encode a bound position of this type, including if it encodes an exact clustering.
+     *
+     * Note that the encoded clustering position cannot specify the type of bound (i.e. start/end/boundary) because to
+     * correctly compare clustering positions the encoding must be the same for the different types (e.g. the position
+     * for an exclusive end and an inclusive start is the same, before the exact clustering). The type must be supplied
+     * separately (in the bound... vs boundary... call and isEnd argument).
+     *
+     * @param accessor Accessor to use to construct components.
+     * @param comparable The clustering position encoded as a byte-comparable sequence.
+     */
+    public <V> ClusteringBoundary<V> boundaryFromByteComparable(ValueAccessor<V> accessor, ByteComparable comparable)
+    {
+        ByteComparable.Version version = ByteComparable.Version.OSS42;
+        ByteSource.Peekable orderedBytes = ByteSource.peekable(comparable.asComparableBytes(version));
+
+        int sep = orderedBytes.next();
+        int cc = 0;
+        V[] components = accessor.createArray(size());
+
+        while (true)
+        {
+            switch (sep)
+            {
+                case NEXT_COMPONENT_NULL:
+                    components[cc] = null;
+                    break;
+                case NEXT_COMPONENT_EMPTY:
+                case NEXT_COMPONENT_EMPTY_REVERSED:
+                    components[cc] = subtype(cc).fromComparableBytes(accessor, null, version);
+                    break;
+                case NEXT_COMPONENT:
+                    // Decode the next component, consuming bytes from orderedBytes.
+                    components[cc] = subtype(cc).fromComparableBytes(accessor, orderedBytes, version);
+                    break;
+                case ByteSource.LT_NEXT_COMPONENT:
+                    return accessor.factory().boundary(ClusteringPrefix.Kind.EXCL_END_INCL_START_BOUNDARY,
+                                                       Arrays.copyOf(components, cc));
+                case ByteSource.GT_NEXT_COMPONENT:
+                    return accessor.factory().boundary(ClusteringPrefix.Kind.INCL_END_EXCL_START_BOUNDARY,
+                                                       Arrays.copyOf(components, cc));
+                default:
+                    throw new AssertionError("Unexpected separator " + Integer.toHexString(sep) + " in ClusteringBoundary encoding");
+            }
+            ++cc;
+            sep = orderedBytes.next();
+        }
+    }
+
     /**
      * A comparator for rows.
      *
diff --git a/src/java/org/apache/cassandra/db/ClusteringPrefix.java b/src/java/org/apache/cassandra/db/ClusteringPrefix.java
index a1291c889f1d..c7a2782ecef3 100644
--- a/src/java/org/apache/cassandra/db/ClusteringPrefix.java
+++ b/src/java/org/apache/cassandra/db/ClusteringPrefix.java
@@ -20,6 +20,7 @@
 import java.io.IOException;
 import java.nio.ByteBuffer;
 import java.util.*;
+import java.util.function.ToIntFunction;
 
 import org.apache.cassandra.cache.IMeasurableMemory;
 import org.apache.cassandra.config.*;
@@ -34,6 +35,8 @@
 import org.apache.cassandra.schema.TableMetadata;
 import org.apache.cassandra.utils.ByteArrayUtil;
 import org.apache.cassandra.utils.ByteBufferUtil;
+import org.apache.cassandra.utils.bytecomparable.ByteComparable.Version;
+import org.apache.cassandra.utils.bytecomparable.ByteSource;
 
 /**
  * A clustering prefix is the unit of what a {@link ClusteringComparator} can compare.
@@ -62,14 +65,19 @@
     public enum Kind
     {
         // WARNING: the ordering of that enum matters because we use ordinal() in the serialization
-        EXCL_END_BOUND              (0, -1),
-        INCL_START_BOUND            (0, -1),
-        EXCL_END_INCL_START_BOUNDARY(0, -1),
-        STATIC_CLUSTERING           (1, -1),
-        CLUSTERING                  (2,  0),
-        INCL_END_EXCL_START_BOUNDARY(3,  1),
-        INCL_END_BOUND              (3,  1),
-        EXCL_START_BOUND            (3,  1);
+        EXCL_END_BOUND              (0, -1, v -> ByteSource.LT_NEXT_COMPONENT),
+        INCL_START_BOUND            (0, -1, v -> ByteSource.LT_NEXT_COMPONENT),
+        EXCL_END_INCL_START_BOUNDARY(0, -1, v -> ByteSource.LT_NEXT_COMPONENT),
+        STATIC_CLUSTERING           (1, -1, v -> v == Version.LEGACY
+                                                 ? ByteSource.LT_NEXT_COMPONENT + 1
+                                                 : ByteSource.EXCLUDED),
+        CLUSTERING                  (2,  0, v -> v == Version.LEGACY
+                                                 ? ByteSource.NEXT_COMPONENT
+                                                 : ByteSource.TERMINATOR),
+        INCL_END_EXCL_START_BOUNDARY(3,  1, v -> ByteSource.GT_NEXT_COMPONENT),
+        INCL_END_BOUND              (3,  1, v -> ByteSource.GT_NEXT_COMPONENT),
+        EXCL_START_BOUND            (3,  1, v -> ByteSource.GT_NEXT_COMPONENT);
 
         private final int comparison;
 
@@ -79,10 +87,13 @@ public enum Kind
          */
         public final int comparedToClustering;
 
-        Kind(int comparison, int comparedToClustering)
+        public final ToIntFunction<Version> asByteComparable;
+
+        Kind(int comparison, int comparedToClustering, ToIntFunction<Version> asByteComparable)
         {
             this.comparison = comparison;
             this.comparedToClustering = comparedToClustering;
+            this.asByteComparable = asByteComparable;
         }
 
         /**
@@ -197,6 +208,16 @@ public Kind openBoundOfBoundary(boolean reversed)
                    ? (this == INCL_END_EXCL_START_BOUNDARY ? INCL_END_BOUND : EXCL_END_BOUND)
                    : (this == INCL_END_EXCL_START_BOUNDARY ? EXCL_START_BOUND : INCL_START_BOUND);
         }
+
+        /*
+         * Returns a terminator value for this clustering type that is suitable for byte comparison.
+         * Inclusive starts / exclusive ends need a lower value than ByteSource.NEXT_COMPONENT and the clustering byte,
+         * exclusive starts / inclusive ends -- a higher.
+         */
+        public int asByteComparableValue(Version version)
+        {
+            return asByteComparable.applyAsInt(version);
+        }
     }
 
     default boolean isBottom()
@@ -308,6 +329,24 @@ default ByteBuffer serializeAsPartitionKey()
             values[i] = accessor().toBuffer(get(i));
         return CompositeType.build(ByteBufferAccessor.instance, values);
     }
+
+    /**
+     * Produce a human-readable representation of the clustering given the list of types.
+     * Easier to access than metadata for debugging.
+     */
+    public default String clusteringString(List<AbstractType<?>> types)
+    {
+        StringBuilder sb = new StringBuilder();
+        sb.append(kind()).append('(');
+        for (int i = 0; i < size(); i++)
+        {
+            if (i > 0)
+                sb.append(", ");
+            sb.append(types.get(i).getString(get(i), accessor()));
+        }
+        return sb.append(')').toString();
+    }
+
     /**
      * The values of this prefix as an array.
     *

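To see what the ClusteringComparator additions above provide, here is a short illustrative sketch. It is not part of the patch: the class name and the values are hypothetical, and it assumes only the byte-comparable API introduced here together with existing factories (ClusteringComparator's varargs constructor, Clustering.make and the type singletons). It checks that plain unsigned lexicographic comparison of the emitted bytes agrees with the comparator, and that the translation round-trips.

import java.nio.ByteBuffer;

import org.apache.cassandra.db.Clustering;
import org.apache.cassandra.db.ClusteringComparator;
import org.apache.cassandra.db.marshal.ByteBufferAccessor;
import org.apache.cassandra.db.marshal.Int32Type;
import org.apache.cassandra.db.marshal.UTF8Type;
import org.apache.cassandra.utils.bytecomparable.ByteComparable;
import org.apache.cassandra.utils.bytecomparable.ByteSource;

public class ByteComparableClusteringSketch
{
    public static void main(String[] args)
    {
        // A comparator over two clustering columns: (text, int).
        ClusteringComparator comparator = new ClusteringComparator(UTF8Type.instance, Int32Type.instance);

        Clustering<ByteBuffer> a = Clustering.make(UTF8Type.instance.decompose("A"), Int32Type.instance.decompose(5));
        Clustering<ByteBuffer> b = Clustering.make(UTF8Type.instance.decompose("B"), Int32Type.instance.decompose(6));

        // The byte-comparable translation must order exactly like the comparator itself,
        // using nothing but unsigned lexicographic comparison of the emitted bytes.
        ByteSource sa = comparator.asByteComparable(a).asComparableBytes(ByteComparable.Version.OSS42);
        ByteSource sb = comparator.asByteComparable(b).asComparableBytes(ByteComparable.Version.OSS42);
        int cmp = 0;
        while (cmp == 0)
        {
            int x = sa.next();
            int y = sb.next();
            cmp = Integer.compare(x, y);
            // END_OF_STREAM compares below every valid byte, and the encodings are
            // prefix-free, so two equal streams can only end at the same step.
            if (x == ByteSource.END_OF_STREAM)
                break;
        }
        assert Integer.signum(cmp) == Integer.signum(comparator.compare(a, b));

        // The translation is also invertible: decoding the byte-comparable form
        // produces an equal clustering.
        Clustering<ByteBuffer> roundTrip =
            comparator.clusteringFromByteComparable(ByteBufferAccessor.instance, comparator.asByteComparable(a));
        assert comparator.compare(a, roundTrip) == 0;
    }
}

The terminator weights contributed by ClusteringPrefix.Kind (LT_NEXT_COMPONENT for exclusive ends and inclusive starts, GT_NEXT_COMPONENT for the opposite kinds, TERMINATOR for a fully specified clustering) are what let an incomplete prefix sort correctly relative to the clusterings it bounds.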
diff --git a/src/java/org/apache/cassandra/db/DataRange.java b/src/java/org/apache/cassandra/db/DataRange.java index 52162be72a3f..9912ac56e919 100644 --- a/src/java/org/apache/cassandra/db/DataRange.java +++ b/src/java/org/apache/cassandra/db/DataRange.java @@ -27,6 +27,7 @@ import org.apache.cassandra.dht.*; import org.apache.cassandra.io.util.DataInputPlus; import org.apache.cassandra.io.util.DataOutputPlus; +import org.apache.cassandra.utils.bytecomparable.ByteComparable; /** * Groups both the range of partitions to query, and the clustering index filter to @@ -138,6 +139,34 @@ public PartitionPosition stopKey() return keyRange.right; } + /** + * The start of the partition key range queried by this {@code DataRange}. + * + * @return the start of the partition key range expressed as a ByteComparable. + */ + public ByteComparable startAsByteComparable() + { + PartitionPosition bound = keyRange.left; + if (bound.isMinimum()) + return null; + + return bound.asComparableBound(keyRange.inclusiveLeft()); + } + + /** + * The end of the partition key range queried by this {@code DataRange}. + * + * @return the end of the partition key range expressed as a ByteComparable. + */ + public ByteComparable stopAsByteComparable() + { + PartitionPosition bound = keyRange.right; + if (bound.isMinimum()) + return null; + + return bound.asComparableBound(!keyRange.inclusiveRight()); + } + /** * Whether the underlying clustering index filter is a names filter or not. * diff --git a/src/java/org/apache/cassandra/db/DecoratedKey.java b/src/java/org/apache/cassandra/db/DecoratedKey.java index 4dd87d0e2c85..569c86d9d832 100644 --- a/src/java/org/apache/cassandra/db/DecoratedKey.java +++ b/src/java/org/apache/cassandra/db/DecoratedKey.java @@ -21,6 +21,7 @@ import java.util.Comparator; import java.util.List; import java.util.StringJoiner; +import java.util.function.BiFunction; import org.apache.cassandra.db.marshal.CompositeType; import org.apache.cassandra.dht.IPartitioner; @@ -29,8 +30,11 @@ import org.apache.cassandra.schema.ColumnMetadata; import org.apache.cassandra.schema.TableMetadata; import org.apache.cassandra.utils.ByteBufferUtil; -import org.apache.cassandra.utils.MurmurHash; +import org.apache.cassandra.utils.bytecomparable.ByteComparable; +import org.apache.cassandra.utils.bytecomparable.ByteSource; +import org.apache.cassandra.utils.bytecomparable.ByteSourceInverse; import org.apache.cassandra.utils.IFilter.FilterKey; +import org.apache.cassandra.utils.MurmurHash; /** * Represents a decorated key, handy for certain operations @@ -102,6 +106,37 @@ public static int compareTo(IPartitioner partitioner, ByteBuffer key, PartitionP return cmp == 0 ? ByteBufferUtil.compareUnsigned(key, otherKey.getKey()) : cmp; } + @Override + public ByteSource asComparableBytes(Version version) + { + // Note: In the legacy version one encoding could be a prefix of another as the escaping is only weakly + // prefix-free (see ByteSourceTest.testDecoratedKeyPrefixes()). + // The OSS42 version avoids this by adding a terminator. + return ByteSource.withTerminatorMaybeLegacy(version, + ByteSource.END_OF_STREAM, + token.asComparableBytes(version), + keyComparableBytes(version)); + } + + @Override + public ByteComparable asComparableBound(boolean before) + { + return version -> + { + assert (version != Version.LEGACY) : "Decorated key bounds are not supported by the legacy encoding."; + + return ByteSource.withTerminator( + before ? 
ByteSource.LT_NEXT_COMPONENT : ByteSource.GT_NEXT_COMPONENT,
+                token.asComparableBytes(version),
+                keyComparableBytes(version));
+        };
+    }
+
+    protected ByteSource keyComparableBytes(Version version)
+    {
+        return ByteSource.of(getKey(), version);
+    }
+
     public IPartitioner getPartitioner()
     {
         return getToken().getPartitioner();
@@ -169,4 +204,39 @@ public void filterHash(long[] dest)
         ByteBuffer key = getKey();
         MurmurHash.hash3_x64_128(key, key.position(), key.remaining(), 0, dest);
     }
+
+    /**
+     * A template factory method for creating decorated keys from their byte-comparable representation.
+     */
+    static <T extends DecoratedKey> T fromByteComparable(ByteComparable byteComparable,
+                                                         Version version,
+                                                         IPartitioner partitioner,
+                                                         BiFunction<Token, byte[], T> decoratedKeyFactory)
+    {
+        ByteSource.Peekable peekable = ByteSource.peekable(byteComparable.asComparableBytes(version));
+        // Decode the token from the first component of the multi-component sequence representing the whole decorated key.
+        Token token = partitioner.getTokenFactory().fromComparableBytes(ByteSourceInverse.nextComponentSource(peekable), version);
+        // Decode the key bytes from the second component.
+        byte[] keyBytes = ByteSourceInverse.getUnescapedBytes(ByteSourceInverse.nextComponentSource(peekable));
+        // Consume the terminator byte.
+        int terminator = peekable.next();
+        assert terminator == ByteSource.TERMINATOR : "Decorated key encoding must end in terminator.";
+        // Instantiate a decorated key from the decoded token and key bytes, using the provided factory method.
+        return decoratedKeyFactory.apply(token, keyBytes);
+    }
+
+    public static byte[] keyFromByteSource(ByteSource.Peekable peekableByteSource,
+                                           Version version,
+                                           IPartitioner partitioner)
+    {
+        assert version != Version.LEGACY; // reverse translation is not supported for LEGACY version.
+        // Decode the token from the first component of the multi-component sequence representing the whole decorated key.
+        // We won't use it, but the decoding also positions the byte source after it.
+        partitioner.getTokenFactory().fromComparableBytes(ByteSourceInverse.nextComponentSource(peekableByteSource), version);
+        // Decode the key bytes from the second component.
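+        // (getUnescapedBytes reverses the escaping applied when the key bytes were encoded,
+        // so the original key is recovered exactly)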
+ byte[] keyBytes = ByteSourceInverse.getUnescapedBytes(ByteSourceInverse.nextComponentSource(peekableByteSource)); + int terminator = peekableByteSource.next(); + assert terminator == ByteSource.TERMINATOR : "Decorated key encoding must end in terminator."; + return keyBytes; + } } diff --git a/src/java/org/apache/cassandra/db/NativeDecoratedKey.java b/src/java/org/apache/cassandra/db/NativeDecoratedKey.java index add52189776c..e9a564a5092a 100644 --- a/src/java/org/apache/cassandra/db/NativeDecoratedKey.java +++ b/src/java/org/apache/cassandra/db/NativeDecoratedKey.java @@ -20,7 +20,11 @@ import java.nio.ByteBuffer; import java.nio.ByteOrder; +import net.nicoulaj.compilecommand.annotations.Inline; +import org.apache.cassandra.dht.IPartitioner; import org.apache.cassandra.dht.Token; +import org.apache.cassandra.utils.bytecomparable.ByteComparable; +import org.apache.cassandra.utils.bytecomparable.ByteSource; import org.apache.cassandra.utils.concurrent.OpOrder; import org.apache.cassandra.utils.memory.MemoryUtil; import org.apache.cassandra.utils.memory.NativeAllocator; @@ -41,8 +45,38 @@ public NativeDecoratedKey(Token token, NativeAllocator allocator, OpOrder.Group MemoryUtil.setBytes(peer + 4, key); } + public NativeDecoratedKey(Token token, NativeAllocator allocator, OpOrder.Group writeOp, byte[] keyBytes) + { + super(token); + assert keyBytes != null; + + int size = keyBytes.length; + this.peer = allocator.allocate(4 + size, writeOp); + MemoryUtil.setInt(peer, size); + MemoryUtil.setBytes(peer + 4, keyBytes, 0, size); + } + + @Inline + int length() + { + return MemoryUtil.getInt(peer); + } + + @Inline + long address() + { + return this.peer + 4; + } + + @Override public ByteBuffer getKey() { - return MemoryUtil.getByteBuffer(peer + 4, MemoryUtil.getInt(peer), ByteOrder.BIG_ENDIAN); + return MemoryUtil.getByteBuffer(address(), length(), ByteOrder.BIG_ENDIAN); + } + + @Override + protected ByteSource keyComparableBytes(Version version) + { + return ByteSource.ofMemory(address(), length(), version); } } diff --git a/src/java/org/apache/cassandra/db/PartitionPosition.java b/src/java/org/apache/cassandra/db/PartitionPosition.java index 3b45c6c0e2eb..5e1d6184e22a 100644 --- a/src/java/org/apache/cassandra/db/PartitionPosition.java +++ b/src/java/org/apache/cassandra/db/PartitionPosition.java @@ -24,8 +24,10 @@ import org.apache.cassandra.dht.*; import org.apache.cassandra.io.util.DataOutputPlus; import org.apache.cassandra.utils.ByteBufferUtil; +import org.apache.cassandra.utils.bytecomparable.ByteComparable; +import org.apache.cassandra.utils.bytecomparable.ByteSource; -public interface PartitionPosition extends RingPosition +public interface PartitionPosition extends RingPosition, ByteComparable { public static enum Kind { @@ -54,6 +56,27 @@ public static PartitionPosition get(ByteBuffer key, IPartitioner p) public Kind kind(); public boolean isMinimum(); + /** + * Produce a prefix-free byte-comparable representation of the key, i.e. such a sequence of bytes that any pair x, y + * of valid positions (with the same key column types and partitioner), + * x.compareTo(y) == compareLexicographicallyUnsigned(x.asComparableBytes(), y.asComparableBytes()) + * and + * x.asComparableBytes() is not a prefix of y.asComparableBytes() + * + * We use a two-component tuple for decorated keys, and a one-component tuple for key bounds, where the terminator + * byte is chosen to yield the correct comparison result. 
No decorated key can be a prefix of another (per the tuple + * encoding), and no key bound can be a prefix of one because it uses a terminator byte that is different from the + * tuple separator. + */ + public abstract ByteSource asComparableBytes(Version version); + + /** + * Produce a byte-comparable representation for the position before or after the key. + * This does nothing for token boundaries (which are already at a position between valid keys), and changes + * the terminator byte for keys. + */ + public abstract ByteComparable asComparableBound(boolean before); + public static class RowPositionSerializer implements IPartitionerDependentSerializer { /* diff --git a/src/java/org/apache/cassandra/db/SystemKeyspace.java b/src/java/org/apache/cassandra/db/SystemKeyspace.java index 6fbbc3e621c0..a1013e795515 100644 --- a/src/java/org/apache/cassandra/db/SystemKeyspace.java +++ b/src/java/org/apache/cassandra/db/SystemKeyspace.java @@ -65,6 +65,7 @@ import org.apache.cassandra.cql3.statements.schema.CreateTableStatement; import org.apache.cassandra.db.commitlog.CommitLogPosition; import org.apache.cassandra.db.compaction.CompactionHistoryTabularData; +import org.apache.cassandra.db.marshal.ByteBufferAccessor; import org.apache.cassandra.db.marshal.BytesType; import org.apache.cassandra.db.marshal.LongType; import org.apache.cassandra.db.marshal.TimeUUIDType; @@ -1875,7 +1876,7 @@ public static TopPartitionTracker.StoredTopPartitions getTopPartitions(TableMeta TupleType tupleType = new TupleType(Lists.newArrayList(UTF8Type.instance, LongType.instance)); for (ByteBuffer bb : top) { - ByteBuffer[] components = tupleType.split(bb); + ByteBuffer[] components = tupleType.split(ByteBufferAccessor.instance, bb); String keyStr = UTF8Type.instance.compose(components[0]); long value = LongType.instance.compose(components[1]); topPartitions.add(new TopPartitionTracker.TopPartition(metadata.partitioner.decorateKey(metadata.partitionKeyType.fromString(keyStr)), value)); diff --git a/src/java/org/apache/cassandra/db/columniterator/SSTableIterator.java b/src/java/org/apache/cassandra/db/columniterator/SSTableIterator.java index d4362f775b99..ddcbcf929bb4 100644 --- a/src/java/org/apache/cassandra/db/columniterator/SSTableIterator.java +++ b/src/java/org/apache/cassandra/db/columniterator/SSTableIterator.java @@ -48,6 +48,7 @@ public SSTableIterator(SSTableReader sstable, super(sstable, file, key, indexEntry, slices, columns, ifile); } + @SuppressWarnings("resource") // caller to close protected Reader createReaderInternal(RowIndexEntry indexEntry, FileDataInput file, boolean shouldCloseFile) { return indexEntry.isIndexed() diff --git a/src/java/org/apache/cassandra/db/columniterator/SSTableReversedIterator.java b/src/java/org/apache/cassandra/db/columniterator/SSTableReversedIterator.java index a60aafa77181..37db6d9754fd 100644 --- a/src/java/org/apache/cassandra/db/columniterator/SSTableReversedIterator.java +++ b/src/java/org/apache/cassandra/db/columniterator/SSTableReversedIterator.java @@ -52,6 +52,7 @@ public SSTableReversedIterator(SSTableReader sstable, super(sstable, file, key, indexEntry, slices, columns, ifile); } + @SuppressWarnings("resource") // caller to close protected Reader createReaderInternal(RowIndexEntry indexEntry, FileDataInput file, boolean shouldCloseFile) { return indexEntry.isIndexed() diff --git a/src/java/org/apache/cassandra/db/marshal/AbstractTimeUUIDType.java b/src/java/org/apache/cassandra/db/marshal/AbstractTimeUUIDType.java index b6b10d582d72..38af81237743 100644 --- 
a/src/java/org/apache/cassandra/db/marshal/AbstractTimeUUIDType.java +++ b/src/java/org/apache/cassandra/db/marshal/AbstractTimeUUIDType.java @@ -24,9 +24,11 @@ import org.apache.cassandra.cql3.Constants; import org.apache.cassandra.cql3.Term; import org.apache.cassandra.serializers.MarshalException; -import org.apache.cassandra.serializers.TypeSerializer; import org.apache.cassandra.serializers.UUIDSerializer; import org.apache.cassandra.utils.TimeUUID; +import org.apache.cassandra.utils.bytecomparable.ByteComparable; +import org.apache.cassandra.utils.bytecomparable.ByteSource; +import org.apache.cassandra.utils.bytecomparable.ByteSourceInverse; import static java.util.concurrent.TimeUnit.MILLISECONDS; import static org.apache.cassandra.utils.TimeUUID.Generator.nextTimeUUIDAsBytes; @@ -44,6 +46,7 @@ public boolean isEmptyValueMeaningless() return true; } + @Override public int compareCustom(VL left, ValueAccessor accessorL, VR right, ValueAccessor accessorR) { // Compare for length @@ -58,12 +61,12 @@ public int compareCustom(VL left, ValueAccessor accessorL, VR right long msb1 = accessorL.getLong(left, 0); long msb2 = accessorR.getLong(right, 0); + verifyVersion(msb1); + verifyVersion(msb2); + msb1 = reorderTimestampBytes(msb1); msb2 = reorderTimestampBytes(msb2); - assert (msb1 & topbyte(0xf0L)) == topbyte(0x10L); - assert (msb2 & topbyte(0xf0L)) == topbyte(0x10L); - int c = Long.compare(msb1, msb2); if (c != 0) return c; @@ -75,6 +78,40 @@ public int compareCustom(VL left, ValueAccessor accessorL, VR right return Long.compare(lsb1, lsb2); } + @Override + public ByteSource asComparableBytes(ValueAccessor accessor, V data, ByteComparable.Version version) + { + if (accessor.isEmpty(data)) + return null; + + long hiBits = accessor.getLong(data, 0); + verifyVersion(hiBits); + ByteBuffer swizzled = ByteBuffer.allocate(16); + swizzled.putLong(0, TimeUUIDType.reorderTimestampBytes(hiBits)); + swizzled.putLong(8, accessor.getLong(data, 8) ^ 0x8080808080808080L); + + return ByteSource.fixedLength(swizzled); + } + + @Override + public V fromComparableBytes(ValueAccessor accessor, ByteSource.Peekable comparableBytes, ByteComparable.Version version) + { + // Optional-style encoding of empty values as null sources + if (comparableBytes == null) + return accessor.empty(); + + // The non-lexical UUID bits are stored as an unsigned fixed-length 128-bit integer. + long hiBits = ByteSourceInverse.getUnsignedFixedLengthAsLong(comparableBytes, 8); + long loBits = ByteSourceInverse.getUnsignedFixedLengthAsLong(comparableBytes, 8); + + hiBits = reorderBackTimestampBytes(hiBits); + verifyVersion(hiBits); + // In addition, TimeUUIDType also touches the low bits of the UUID (see CASSANDRA-8730 and DB-1758). 
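+ // A per-byte XOR with 0x80 flips each byte's sign bit, turning the signed-byte ordering used by
+ // timeuuid comparison into plain unsigned lexicographic ordering; the mask is its own inverse, so
+ // the same operation that encoded the low bits in asComparableBytes above also decodes them here.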
+ loBits ^= 0x8080808080808080L; + + return UUIDType.makeUuidBytes(accessor, hiBits, loBits); + } + // takes as input 8 signed bytes in native machine order // returns the first byte unchanged, and the following 7 bytes converted to an unsigned representation // which is the same as a 2's complement long in native format @@ -83,16 +120,30 @@ public static long signedBytesToNativeLong(long signedBytes) return signedBytes ^ 0x0080808080808080L; } - private static long topbyte(long topbyte) + private void verifyVersion(long hiBits) { - return topbyte << 56; + long version = (hiBits >>> 12) & 0xF; + if (version != 1) + throw new MarshalException(String.format("Invalid UUID version %d for timeuuid", + version)); } protected static long reorderTimestampBytes(long input) { - return (input << 48) - | ((input << 16) & 0xFFFF00000000L) - | (input >>> 32); + return (input << 48) + | ((input << 16) & 0xFFFF00000000L) + | (input >>> 32); + } + + protected static long reorderBackTimestampBytes(long input) + { + // In a time-based UUID the high bits are significantly more shuffled than in other UUIDs - if [X] represents a + // 16-bit tuple, [1][2][3][4] should become [3][4][2][1]. + // See the UUID Javadoc (and more specifically the high bits layout of a Leach-Salz UUID) to understand the + // reasoning behind this bit twiddling in the first place (in the context of comparisons). + return (input << 32) + | ((input >>> 16) & 0xFFFF0000L) + | (input >>> 48); } public ByteBuffer fromString(String source) throws MarshalException diff --git a/src/java/org/apache/cassandra/db/marshal/AbstractType.java b/src/java/org/apache/cassandra/db/marshal/AbstractType.java index 74d4006664ae..8f54cb65a6e9 100644 --- a/src/java/org/apache/cassandra/db/marshal/AbstractType.java +++ b/src/java/org/apache/cassandra/db/marshal/AbstractType.java @@ -40,6 +40,9 @@ import org.apache.cassandra.serializers.TypeSerializer; import org.apache.cassandra.transport.ProtocolVersion; import org.apache.cassandra.utils.ByteBufferUtil; +import org.apache.cassandra.utils.bytecomparable.ByteComparable; +import org.apache.cassandra.utils.bytecomparable.ByteSource; +import org.apache.cassandra.utils.bytecomparable.ByteSourceInverse; import org.github.jamm.Unmetered; import static org.apache.cassandra.db.marshal.AbstractType.ComparisonType.CUSTOM; @@ -55,6 +58,8 @@ @Unmetered public abstract class AbstractType implements Comparator, AssignmentTestable { + private final static int VARIABLE_LENGTH = -1; + public final Comparator reverseComparator; public enum ComparisonType @@ -449,11 +454,28 @@ public List> getComponents() } /** - * The length of values for this type if all values are of fixed length, -1 otherwise. + * The length of values for this type if all values are of fixed length, -1 otherwise. This has an impact on + * serialization. + * + *
+ * <ul>
+ * <li> see {@link #writeValue}
+ * <li> see {@link #read}
+ * <li> see {@link #writtenLength}
+ * <li> see {@link #skipValue}
+ * </ul>
 */
 public int valueLengthIfFixed()
 {
- return -1;
+ return VARIABLE_LENGTH;
 }
+
+ /**
+ * Checks if all values are of fixed length.
+ *
+ * @return {@code true} if all values are of fixed length, {@code false} otherwise.
+ */
+ public final boolean isValueLengthFixed()
+ {
+ return valueLengthIfFixed() != VARIABLE_LENGTH;
+ }
 // This assumes that no empty values are passed @@ -598,6 +620,69 @@ public AssignmentTestable.TestResult testAssignment(AbstractType receiverType return AssignmentTestable.TestResult.NOT_ASSIGNABLE; }
+ /**
+ * Produce a byte-comparable representation of the given value, i.e. a sequence of bytes that compares the same way
+ * using lexicographical unsigned byte comparison as the original value using the type's comparator.
+ *
+ * We use a slightly stronger requirement to be able to use the types in tuples. Precisely, for any pair x, y of
+ * non-equal valid values of this type and any bytes b1, b2 between 0x10 and 0xEF,
+ * (+ stands for concatenation)
+ * compare(x, y) == compareLexicographicallyUnsigned(asByteComparable(x)+b1, asByteComparable(y)+b2)
+ * (i.e. the values compare like the original type, and an added 0x10-0xEF byte at the end does not change that) and:
+ * asByteComparable(x)+b1 is not a prefix of asByteComparable(y) (weakly prefix free)
+ * (i.e. a valid representation of a value may be a prefix of another valid representation of a value only if the
+ * following byte in the latter is smaller than 0x10 or larger than 0xEF). These properties are trivially true if
+ * the encoding compares correctly and is prefix free, but also permits a little more freedom that enables somewhat
+ * more efficient encoding of arbitrary-length byte-comparable blobs.
+ *
+ * Depending on the type, this method can be called for null or empty input, in which case the output is allowed to
+ * be null (the clustering/tuple encoding will accept and handle it).
+ */
+ public ByteSource asComparableBytes(ValueAccessor accessor, V value, ByteComparable.Version version)
+ {
+ if (isByteOrderComparable)
+ {
+ // When a type is byte-ordered on its own, we only need to escape it, so that we can include it in
+ // multi-component types and make the encoding weakly-prefix-free.
+ return ByteSource.of(accessor, value, version);
+ }
+ else
+ // default is only good for byte-comparables
+ throw new UnsupportedOperationException(getClass().getSimpleName() + " does not implement asComparableBytes");
+ }
+
+ public final ByteSource asComparableBytes(ByteBuffer byteBuffer, ByteComparable.Version version)
+ {
+ return asComparableBytes(ByteBufferAccessor.instance, byteBuffer, version);
+ }
+
+ /**
+ * Translates the given byte-ordered representation to the common, non-byte-ordered binary representation of a
+ * payload for this abstract type (the latter, common binary representation is what we mostly work with in the
+ * storage engine internals). If the given bytes don't correspond to the encoding of some payload value for this
+ * abstract type, an {@link IllegalArgumentException} may be thrown.
+ *
+ * @param accessor value accessor used to construct the value.
+ * @param comparableBytes A byte-ordered representation (presumably of a payload for this abstract type).
+ * @param version The byte-comparable version used to construct the representation.
+ * @return A value of a payload for this abstract type, corresponding to the given byte-ordered representation,
+ * constructed using the supplied value accessor.
+ * + * @see #asComparableBytes + */ + public V fromComparableBytes(ValueAccessor accessor, ByteSource.Peekable comparableBytes, ByteComparable.Version version) + { + if (isByteOrderComparable) + return accessor.valueOf(ByteSourceInverse.getUnescapedBytes(comparableBytes)); + else + throw new UnsupportedOperationException(getClass().getSimpleName() + " does not implement fromComparableBytes"); + } + + public final ByteBuffer fromComparableBytes(ByteSource.Peekable comparableBytes, ByteComparable.Version version) + { + return fromComparableBytes(ByteBufferAccessor.instance, comparableBytes, version); + } + /** * This must be overriden by subclasses if necessary so that for any * AbstractType, this == TypeParser.parse(toString()). diff --git a/src/java/org/apache/cassandra/db/marshal/BooleanType.java b/src/java/org/apache/cassandra/db/marshal/BooleanType.java index 4ef5f95b0bfc..d144f4ee4d91 100644 --- a/src/java/org/apache/cassandra/db/marshal/BooleanType.java +++ b/src/java/org/apache/cassandra/db/marshal/BooleanType.java @@ -26,14 +26,11 @@ import org.apache.cassandra.serializers.BooleanSerializer; import org.apache.cassandra.serializers.MarshalException; import org.apache.cassandra.transport.ProtocolVersion; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; +import org.apache.cassandra.utils.bytecomparable.ByteComparable; +import org.apache.cassandra.utils.bytecomparable.ByteSource; public class BooleanType extends AbstractType { - private static final Logger logger = LoggerFactory.getLogger(BooleanType.class); - public static final BooleanType instance = new BooleanType(); BooleanType() {super(ComparisonType.CUSTOM);} // singleton @@ -54,6 +51,26 @@ public int compareCustom(VL left, ValueAccessor accessorL, VR right return v1 - v2; } + @Override + public ByteSource asComparableBytes(ValueAccessor accessor, V data, ByteComparable.Version version) + { + if (accessor.isEmpty(data)) + return null; + byte b = accessor.toByte(data); + if (b != 0) + b = 1; + return ByteSource.oneByte(b); + } + + @Override + public V fromComparableBytes(ValueAccessor accessor, ByteSource.Peekable comparableBytes, ByteComparable.Version version) + { + if (comparableBytes == null) + return accessor.empty(); + int b = comparableBytes.next(); + return accessor.valueOf(b == 1); + } + public ByteBuffer fromString(String source) throws MarshalException { diff --git a/src/java/org/apache/cassandra/db/marshal/ByteArrayAccessor.java b/src/java/org/apache/cassandra/db/marshal/ByteArrayAccessor.java index df24a627a41c..d7108992dac8 100644 --- a/src/java/org/apache/cassandra/db/marshal/ByteArrayAccessor.java +++ b/src/java/org/apache/cassandra/db/marshal/ByteArrayAccessor.java @@ -248,6 +248,13 @@ public Ballot toBallot(byte[] value) return Ballot.deserialize(value); } + @Override + public int putByte(byte[] dst, int offset, byte value) + { + dst[offset] = value; + return TypeSizes.BYTE_SIZE; + } + @Override public int putShort(byte[] dst, int offset, short value) { diff --git a/src/java/org/apache/cassandra/db/marshal/ByteArrayObjectFactory.java b/src/java/org/apache/cassandra/db/marshal/ByteArrayObjectFactory.java index ea9bf113833b..9b477aeeeace 100644 --- a/src/java/org/apache/cassandra/db/marshal/ByteArrayObjectFactory.java +++ b/src/java/org/apache/cassandra/db/marshal/ByteArrayObjectFactory.java @@ -18,6 +18,7 @@ package org.apache.cassandra.db.marshal; +import org.apache.cassandra.db.AbstractArrayClusteringPrefix; import org.apache.cassandra.db.ArrayClustering; import 
org.apache.cassandra.db.ArrayClusteringBound; import org.apache.cassandra.db.ArrayClusteringBoundary; @@ -33,7 +34,7 @@ class ByteArrayObjectFactory implements ValueAccessor.ObjectFactory { - private static final Clustering EMPTY_CLUSTERING = new ArrayClustering() + private static final Clustering EMPTY_CLUSTERING = new ArrayClustering(AbstractArrayClusteringPrefix.EMPTY_VALUES_ARRAY) { public String toString(TableMetadata metadata) { @@ -41,14 +42,37 @@ public String toString(TableMetadata metadata) } }; + public static final Clustering STATIC_CLUSTERING = new ArrayClustering(AbstractArrayClusteringPrefix.EMPTY_VALUES_ARRAY) + { + @Override + public Kind kind() + { + return Kind.STATIC_CLUSTERING; + } + + @Override + public String toString() + { + return "STATIC"; + } + + @Override + public String toString(TableMetadata metadata) + { + return toString(); + } + }; + static final ValueAccessor.ObjectFactory instance = new ByteArrayObjectFactory(); private ByteArrayObjectFactory() {} /** The smallest start bound, i.e. the one that starts before any row. */ - private static final ArrayClusteringBound BOTTOM_BOUND = new ArrayClusteringBound(ClusteringPrefix.Kind.INCL_START_BOUND, new byte[0][]); + private static final ArrayClusteringBound BOTTOM_BOUND = new ArrayClusteringBound(ClusteringPrefix.Kind.INCL_START_BOUND, + AbstractArrayClusteringPrefix.EMPTY_VALUES_ARRAY); /** The biggest end bound, i.e. the one that ends after any row. */ - private static final ArrayClusteringBound TOP_BOUND = new ArrayClusteringBound(ClusteringPrefix.Kind.INCL_END_BOUND, new byte[0][]); + private static final ArrayClusteringBound TOP_BOUND = new ArrayClusteringBound(ClusteringPrefix.Kind.INCL_END_BOUND, + AbstractArrayClusteringPrefix.EMPTY_VALUES_ARRAY); public Cell cell(ColumnMetadata column, long timestamp, int ttl, int localDeletionTime, byte[] value, CellPath path) { @@ -65,6 +89,11 @@ public Clustering clustering() return EMPTY_CLUSTERING; } + public Clustering staticClustering() + { + return STATIC_CLUSTERING; + } + public ClusteringBound bound(ClusteringPrefix.Kind kind, byte[]... 
values) { return new ArrayClusteringBound(kind, values); diff --git a/src/java/org/apache/cassandra/db/marshal/ByteBufferAccessor.java b/src/java/org/apache/cassandra/db/marshal/ByteBufferAccessor.java index 40a3bf4b34ce..0712930c3a82 100644 --- a/src/java/org/apache/cassandra/db/marshal/ByteBufferAccessor.java +++ b/src/java/org/apache/cassandra/db/marshal/ByteBufferAccessor.java @@ -252,6 +252,13 @@ public Ballot toBallot(ByteBuffer value) return Ballot.deserialize(value); } + @Override + public int putByte(ByteBuffer dst, int offset, byte value) + { + dst.put(dst.position() + offset, value); + return TypeSizes.BYTE_SIZE; + } + @Override public int putShort(ByteBuffer dst, int offset, short value) { diff --git a/src/java/org/apache/cassandra/db/marshal/ByteBufferObjectFactory.java b/src/java/org/apache/cassandra/db/marshal/ByteBufferObjectFactory.java index 00f4646341d9..0ac3db926553 100644 --- a/src/java/org/apache/cassandra/db/marshal/ByteBufferObjectFactory.java +++ b/src/java/org/apache/cassandra/db/marshal/ByteBufferObjectFactory.java @@ -20,6 +20,7 @@ import java.nio.ByteBuffer; +import org.apache.cassandra.db.AbstractBufferClusteringPrefix; import org.apache.cassandra.db.BufferClustering; import org.apache.cassandra.db.BufferClusteringBound; import org.apache.cassandra.db.BufferClusteringBoundary; @@ -31,24 +32,15 @@ import org.apache.cassandra.db.rows.Cell; import org.apache.cassandra.db.rows.CellPath; import org.apache.cassandra.schema.ColumnMetadata; -import org.apache.cassandra.schema.TableMetadata; class ByteBufferObjectFactory implements ValueAccessor.ObjectFactory { - /** Empty clustering for tables having no clustering columns. */ - private static final Clustering EMPTY_CLUSTERING = new BufferClustering() - { - @Override - public String toString(TableMetadata metadata) - { - return "EMPTY"; - } - }; - /** The smallest start bound, i.e. the one that starts before any row. */ - private static final BufferClusteringBound BOTTOM_BOUND = new BufferClusteringBound(ClusteringPrefix.Kind.INCL_START_BOUND, new ByteBuffer[0]); + private static final BufferClusteringBound BOTTOM_BOUND = new BufferClusteringBound(ClusteringPrefix.Kind.INCL_START_BOUND, + AbstractBufferClusteringPrefix.EMPTY_VALUES_ARRAY); /** The biggest end bound, i.e. the one that ends after any row. */ - private static final BufferClusteringBound TOP_BOUND = new BufferClusteringBound(ClusteringPrefix.Kind.INCL_END_BOUND, new ByteBuffer[0]); + private static final BufferClusteringBound TOP_BOUND = new BufferClusteringBound(ClusteringPrefix.Kind.INCL_END_BOUND, + AbstractBufferClusteringPrefix.EMPTY_VALUES_ARRAY); static final ValueAccessor.ObjectFactory instance = new ByteBufferObjectFactory(); @@ -66,7 +58,12 @@ public Clustering clustering(ByteBuffer... values) public Clustering clustering() { - return EMPTY_CLUSTERING; + return Clustering.EMPTY; + } + + public Clustering staticClustering() + { + return Clustering.STATIC_CLUSTERING; } public ClusteringBound bound(ClusteringPrefix.Kind kind, ByteBuffer... 
values) diff --git a/src/java/org/apache/cassandra/db/marshal/ByteType.java b/src/java/org/apache/cassandra/db/marshal/ByteType.java index f94f4bb01cc5..a910fbba11a1 100644 --- a/src/java/org/apache/cassandra/db/marshal/ByteType.java +++ b/src/java/org/apache/cassandra/db/marshal/ByteType.java @@ -27,6 +27,10 @@ import org.apache.cassandra.serializers.TypeSerializer; import org.apache.cassandra.transport.ProtocolVersion; import org.apache.cassandra.utils.ByteBufferUtil; +import org.apache.cassandra.utils.bytecomparable.ByteComparable; +import org.apache.cassandra.utils.bytecomparable.ByteComparable.Version; +import org.apache.cassandra.utils.bytecomparable.ByteSource; +import org.apache.cassandra.utils.bytecomparable.ByteSourceInverse; public class ByteType extends NumberType { @@ -42,6 +46,19 @@ public int compareCustom(VL left, ValueAccessor accessorL, VR right return accessorL.getByte(left, 0) - accessorR.getByte(right, 0); } + @Override + public ByteSource asComparableBytes(ValueAccessor accessor, V data, Version version) + { + // This type does not allow non-present values, but we do just to avoid future complexity. + return ByteSource.optionalSignedFixedLengthNumber(accessor, data); + } + + @Override + public V fromComparableBytes(ValueAccessor accessor, ByteSource.Peekable comparableBytes, ByteComparable.Version version) + { + return ByteSourceInverse.getOptionalSignedFixedLength(accessor, comparableBytes, 1); + } + public ByteBuffer fromString(String source) throws MarshalException { // Return an empty ByteBuffer for an empty string. diff --git a/src/java/org/apache/cassandra/db/marshal/CollectionType.java b/src/java/org/apache/cassandra/db/marshal/CollectionType.java index c52cddc07f66..5e9916e7e4a2 100644 --- a/src/java/org/apache/cassandra/db/marshal/CollectionType.java +++ b/src/java/org/apache/cassandra/db/marshal/CollectionType.java @@ -19,6 +19,7 @@ import java.nio.ByteBuffer; import java.io.IOException; +import java.util.ArrayList; import java.util.List; import java.util.Iterator; @@ -27,6 +28,7 @@ import org.apache.cassandra.cql3.Lists; import org.apache.cassandra.cql3.Maps; import org.apache.cassandra.cql3.Sets; +import org.apache.cassandra.db.TypeSizes; import org.apache.cassandra.db.rows.Cell; import org.apache.cassandra.db.rows.CellPath; import org.apache.cassandra.io.util.DataInputPlus; @@ -35,6 +37,9 @@ import org.apache.cassandra.serializers.MarshalException; import org.apache.cassandra.transport.ProtocolVersion; import org.apache.cassandra.utils.ByteBufferUtil; +import org.apache.cassandra.utils.bytecomparable.ByteComparable; +import org.apache.cassandra.utils.bytecomparable.ByteSource; +import org.apache.cassandra.utils.bytecomparable.ByteSourceInverse; /** * The abstract validator that is the base for maps, sets and lists (both frozen and non-frozen). 
@@ -245,6 +250,91 @@ public String toString() return this.toString(false); } + static int compareListOrSet(AbstractType elementsComparator, VL left, ValueAccessor accessorL, VR right, ValueAccessor accessorR) + { + // Note that this is only used if the collection is frozen + if (accessorL.isEmpty(left) || accessorR.isEmpty(right)) + return Boolean.compare(accessorR.isEmpty(right), accessorL.isEmpty(left)); + + int sizeL = CollectionSerializer.readCollectionSize(left, accessorL, ProtocolVersion.V3); + int offsetL = CollectionSerializer.sizeOfCollectionSize(sizeL, ProtocolVersion.V3); + int sizeR = CollectionSerializer.readCollectionSize(right, accessorR, ProtocolVersion.V3); + int offsetR = TypeSizes.INT_SIZE; + + for (int i = 0; i < Math.min(sizeL, sizeR); i++) + { + VL v1 = CollectionSerializer.readValue(left, accessorL, offsetL, ProtocolVersion.V3); + offsetL += CollectionSerializer.sizeOfValue(v1, accessorL, ProtocolVersion.V3); + VR v2 = CollectionSerializer.readValue(right, accessorR, offsetR, ProtocolVersion.V3); + offsetR += CollectionSerializer.sizeOfValue(v2, accessorR, ProtocolVersion.V3); + int cmp = elementsComparator.compare(v1, accessorL, v2, accessorR); + if (cmp != 0) + return cmp; + } + + return Integer.compare(sizeL, sizeR); + } + + static ByteSource asComparableBytesListOrSet(AbstractType elementsComparator, + ValueAccessor accessor, + V data, + ByteComparable.Version version) + { + if (accessor.isEmpty(data)) + return null; + + int offset = 0; + int size = CollectionSerializer.readCollectionSize(data, accessor, ProtocolVersion.V3); + offset += CollectionSerializer.sizeOfCollectionSize(size, ProtocolVersion.V3); + ByteSource[] srcs = new ByteSource[size]; + for (int i = 0; i < size; ++i) + { + V v = CollectionSerializer.readValue(data, accessor, offset, ProtocolVersion.V3); + offset += CollectionSerializer.sizeOfValue(v, accessor, ProtocolVersion.V3); + srcs[i] = elementsComparator.asComparableBytes(accessor, v, version); + } + return ByteSource.withTerminatorMaybeLegacy(version, 0x00, srcs); + } + + static V fromComparableBytesListOrSet(ValueAccessor accessor, + ByteSource.Peekable comparableBytes, + ByteComparable.Version version, + AbstractType elementType) + { + if (comparableBytes == null) + return accessor.empty(); + assert version != ByteComparable.Version.LEGACY; // legacy translation is not reversible + + List buffers = new ArrayList<>(); + int separator = comparableBytes.next(); + while (separator != ByteSource.TERMINATOR) + { + if (!ByteSourceInverse.nextComponentNull(separator)) + buffers.add(elementType.fromComparableBytes(accessor, comparableBytes, version)); + else + buffers.add(null); + separator = comparableBytes.next(); + } + return CollectionSerializer.pack(buffers, accessor, buffers.size(), ProtocolVersion.V3); + } + + public static String setOrListToJsonString(ByteBuffer buffer, AbstractType elementsType, ProtocolVersion protocolVersion) + { + ByteBuffer value = buffer.duplicate(); + StringBuilder sb = new StringBuilder("["); + int size = CollectionSerializer.readCollectionSize(value, ByteBufferAccessor.instance, protocolVersion); + int offset = CollectionSerializer.sizeOfCollectionSize(size, protocolVersion); + for (int i = 0; i < size; i++) + { + if (i > 0) + sb.append(", "); + ByteBuffer element = CollectionSerializer.readValue(value, ByteBufferAccessor.instance, offset, protocolVersion); + offset += CollectionSerializer.sizeOfValue(element, ByteBufferAccessor.instance, protocolVersion); + sb.append(elementsType.toJSONString(element, 
protocolVersion)); + } + return sb.append("]").toString(); + } + private static class CollectionPathSerializer implements CellPath.Serializer { public void serialize(CellPath path, DataOutputPlus out) throws IOException diff --git a/src/java/org/apache/cassandra/db/marshal/CompositeType.java b/src/java/org/apache/cassandra/db/marshal/CompositeType.java index bf5e914a9d9e..00cbeb58986c 100644 --- a/src/java/org/apache/cassandra/db/marshal/CompositeType.java +++ b/src/java/org/apache/cassandra/db/marshal/CompositeType.java @@ -24,6 +24,7 @@ import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ConcurrentMap; +import com.google.common.annotations.VisibleForTesting; import com.google.common.collect.ImmutableList; import com.google.common.collect.Lists; @@ -31,6 +32,9 @@ import org.apache.cassandra.exceptions.SyntaxException; import org.apache.cassandra.serializers.MarshalException; import org.apache.cassandra.utils.ByteBufferUtil; +import org.apache.cassandra.utils.bytecomparable.ByteComparable.Version; +import org.apache.cassandra.utils.bytecomparable.ByteSource; +import org.apache.cassandra.utils.bytecomparable.ByteSourceInverse; import static com.google.common.collect.Iterables.any; import static com.google.common.collect.Iterables.transform; @@ -165,6 +169,86 @@ protected AbstractType getAndAppendComparator(int i, V value, ValueAccess return types.get(i); } + @Override + public ByteSource asComparableBytes(ValueAccessor accessor, V data, Version version) + { + if (data == null || accessor.isEmpty(data)) + return null; + + ByteSource[] srcs = new ByteSource[types.size() * 2 + 1]; + int length = accessor.size(data); + + // statics go first + boolean isStatic = readIsStaticInternal(data, accessor); + int offset = startingOffsetInternal(isStatic); + srcs[0] = isStatic ? null : ByteSource.EMPTY; + + int i = 0; + byte lastEoc = 0; + while (offset < length) + { + // Only the end-of-component byte of the last component of this composite can be non-zero, so the + // component before can't have a non-zero end-of-component byte. + assert lastEoc == 0 : lastEoc; + + int componentLength = accessor.getUnsignedShort(data, offset); + offset += 2; + srcs[i * 2 + 1] = types.get(i).asComparableBytes(accessor, accessor.slice(data, offset, componentLength), version); + offset += componentLength; + lastEoc = accessor.getByte(data, offset); + offset += 1; + srcs[i * 2 + 2] = ByteSource.oneByte(lastEoc & 0xFF ^ 0x80); // end-of-component also takes part in comparison as signed byte + ++i; + } + // A composite may be leaving some values unspecified. If this is the case, make sure we terminate early + // so that translations created before an extra field was added match translations that have the field but don't + // specify a value for it. + if (i * 2 + 1 < srcs.length) + srcs = Arrays.copyOfRange(srcs, 0, i * 2 + 1); + + return ByteSource.withTerminatorMaybeLegacy(version, ByteSource.END_OF_STREAM, srcs); + } + + @Override + public V fromComparableBytes(ValueAccessor accessor, ByteSource.Peekable comparableBytes, Version version) + { + // For ByteComparable.Version.LEGACY the terminator byte is ByteSource.END_OF_STREAM. The latter means that it's + // indistinguishable from the END_OF_STREAM byte that gets returned _after_ the terminator byte has already + // been consumed, when the composite is part of a multi-component sequence. 
So if in such a scenario we consume + // the ByteSource.END_OF_STREAM terminator here, this will result in actually consuming the multi-component + // sequence separator after it and jumping directly into the bytes of the next component, when we try to + // consume the (already consumed) separator. + // Instead of trying to find a way around the situation, we can just take advantage of the fact that we don't + // need to decode from Version.LEGACY, assume that we never do that, and assert it here. + assert version != Version.LEGACY; + + if (comparableBytes == null) + return accessor.empty(); + + int separator = comparableBytes.next(); + boolean isStatic = ByteSourceInverse.nextComponentNull(separator); + int i = 0; + V[] buffers = accessor.createArray(types.size()); + byte lastEoc = 0; + + while ((separator = comparableBytes.next()) != ByteSource.TERMINATOR && i < types.size()) + { + // Only the end-of-component byte of the last component of this composite can be non-zero, so the + // component before can't have a non-zero end-of-component byte. + assert lastEoc == 0 : lastEoc; + + // Get the next type and decode its payload. + AbstractType type = types.get(i); + V decoded = type.fromComparableBytes(accessor, + ByteSourceInverse.nextComponentSource(comparableBytes, separator), + version); + buffers[i++] = decoded; + + lastEoc = ByteSourceInverse.getSignedByte(ByteSourceInverse.nextComponentSource(comparableBytes)); + } + return build(accessor, isStatic, Arrays.copyOf(buffers, i), lastEoc); + } + protected ParsedComparator parseComparator(int i, String part) { return new StaticParsedComparator(types.get(i), part); @@ -370,6 +454,12 @@ public static V build(ValueAccessor accessor, V... values) @SafeVarargs public static V build(ValueAccessor accessor, boolean isStatic, V... values) + { + return build(accessor, isStatic, values, (byte) 0); + } + + @VisibleForTesting + public static V build(ValueAccessor accessor, boolean isStatic, V[] values, byte lastEoc) { int totalLength = isStatic ? 2 : 0; for (V v : values) @@ -380,11 +470,12 @@ public static V build(ValueAccessor accessor, boolean isStatic, V... valu if (isStatic) out.putShort((short)STATIC_MARKER); - for (V v : values) + for (int i = 0; i < values.length; ++i) { + V v = values[i]; ByteBufferUtil.writeShortLength(out, accessor.size(v)); accessor.write(v, out); - out.put((byte) 0); + out.put(i != values.length - 1 ? (byte) 0 : lastEoc); } out.flip(); return accessor.valueOf(out); diff --git a/src/java/org/apache/cassandra/db/marshal/DateType.java b/src/java/org/apache/cassandra/db/marshal/DateType.java index 473cedf40795..595106d3d184 100644 --- a/src/java/org/apache/cassandra/db/marshal/DateType.java +++ b/src/java/org/apache/cassandra/db/marshal/DateType.java @@ -31,6 +31,9 @@ import org.apache.cassandra.serializers.MarshalException; import org.apache.cassandra.transport.ProtocolVersion; import org.apache.cassandra.utils.ByteBufferUtil; +import org.apache.cassandra.utils.bytecomparable.ByteComparable; +import org.apache.cassandra.utils.bytecomparable.ByteSource; +import org.apache.cassandra.utils.bytecomparable.ByteSourceInverse; /** * This is the old version of TimestampType, but has been replaced as it wasn't comparing pre-epoch timestamps @@ -50,6 +53,19 @@ public boolean isEmptyValueMeaningless() return true; } + @Override + public ByteSource asComparableBytes(ValueAccessor accessor, V data, ByteComparable.Version version) + { + // While BYTE_ORDER would still work for this type, making use of the fixed length is more efficient. 
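+ // (A fixed-length encoding needs no escaping and no terminator byte; the "optional" variant simply
+ // maps an absent/empty value to a null source, which fromComparableBytes below turns back into an
+ // empty value.)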
+ return ByteSource.optionalFixedLength(accessor, data); + } + + @Override + public V fromComparableBytes(ValueAccessor accessor, ByteSource.Peekable comparableBytes, ByteComparable.Version version) + { + return ByteSourceInverse.getOptionalFixedLength(accessor, comparableBytes, 8); + } + public ByteBuffer fromString(String source) throws MarshalException { // Return an empty ByteBuffer for an empty string. diff --git a/src/java/org/apache/cassandra/db/marshal/DecimalType.java b/src/java/org/apache/cassandra/db/marshal/DecimalType.java index 5740fdcc0fcb..3e02dc9696f2 100644 --- a/src/java/org/apache/cassandra/db/marshal/DecimalType.java +++ b/src/java/org/apache/cassandra/db/marshal/DecimalType.java @@ -24,6 +24,8 @@ import java.nio.ByteBuffer; import java.util.Objects; +import com.google.common.primitives.Ints; + import org.apache.cassandra.cql3.CQL3Type; import org.apache.cassandra.cql3.Constants; import org.apache.cassandra.cql3.Term; @@ -32,6 +34,8 @@ import org.apache.cassandra.serializers.MarshalException; import org.apache.cassandra.transport.ProtocolVersion; import org.apache.cassandra.utils.ByteBufferUtil; +import org.apache.cassandra.utils.bytecomparable.ByteComparable; +import org.apache.cassandra.utils.bytecomparable.ByteSource; public class DecimalType extends NumberType { @@ -41,6 +45,16 @@ public class DecimalType extends NumberType private static final int MAX_SCALE = 1000; private static final MathContext MAX_PRECISION = new MathContext(10000); + // Constants or escaping values needed to encode/decode variable-length floating point numbers (decimals) in our + // custom byte-ordered encoding scheme. + private static final int POSITIVE_DECIMAL_HEADER_MASK = 0x80; + private static final int NEGATIVE_DECIMAL_HEADER_MASK = 0x00; + private static final int DECIMAL_EXPONENT_LENGTH_HEADER_MASK = 0x40; + private static final byte DECIMAL_LAST_BYTE = (byte) 0x00; + private static final BigInteger HUNDRED = BigInteger.valueOf(100); + + private static final ByteBuffer ZERO_BUFFER = instance.decompose(BigDecimal.ZERO); + DecimalType() {super(ComparisonType.CUSTOM);} // singleton public boolean isEmptyValueMeaningless() @@ -59,6 +73,196 @@ public int compareCustom(VL left, ValueAccessor accessorL, VR right return compareComposed(left, accessorL, right, accessorR, this); } + /** + * Constructs a byte-comparable representation. + * This is rather difficult and involves reconstructing the decimal. + * + * To compare, we need a normalized value, i.e. one with a sign, exponent and (0,1) mantissa. To avoid + * loss of precision, both exponent and mantissa need to be base-100. We can't get this directly off the serialized + * bytes, as they have base-10 scale and base-256 unscaled part. + * + * We store: + * - sign bit inverted * 0x80 + 0x40 + signed exponent length, where exponent is negated if value is negative + * - zero or more exponent bytes (as given by length) + * - 0x80 + first pair of decimal digits, negative if value is negative, rounded to -inf + * - zero or more 0x80 + pair of decimal digits, always positive + * - trailing 0x00 + * Zero is special-cased as 0x80. + * + * Because the trailing 00 cannot be produced from a pair of decimal digits (positive or not), no value can be + * a prefix of another. 
+ *
+ * Encoding examples:
+ * 1.1 as c1 = 0x80 (positive number) + 0x40 + (positive exponent) 0x01 (exp length 1)
+ * 01 = exponent 1 (100^1)
+ * 81 = 0x80 + 01 (0.01)
+ * 8a = 0x80 + 10 (....10) 0.0110e2
+ * 00
+ * -1 as 3f = 0x00 (negative number) + 0x40 - (negative exponent) 0x01 (exp length 1)
+ * ff = exponent -1. negative number, thus 100^1
+ * 7f = 0x80 - 01 (-0.01) -0.01e2
+ * 00
+ * -99.9 as 3f = 0x00 (negative number) + 0x40 - (negative exponent) 0x01 (exp length 1)
+ * ff = exponent -1. negative number, thus 100^1
+ * 1c = 0x80 - 100 (-1.00)
+ * 8a = 0x80 + 10 (+....10) -0.999e2
+ * 00
+ *
+ */
+ @Override
+ public ByteSource asComparableBytes(ValueAccessor accessor, V data, ByteComparable.Version version)
+ {
+ BigDecimal value = compose(data, accessor);
+ if (value == null)
+ return null;
+ if (value.compareTo(BigDecimal.ZERO) == 0) // Note: 0.equals(0.0) returns false!
+ return ByteSource.oneByte(POSITIVE_DECIMAL_HEADER_MASK);
+
+ long scale = (((long) value.scale()) - value.precision()) & ~1;
+ boolean negative = value.signum() < 0;
+ // Make a base-100 exponent (this will always fit in an int).
+ int exponent = Math.toIntExact(-scale >> 1);
+ // Flip the exponent sign for negative numbers, so that ones with larger magnitudes are properly treated as smaller.
+ final int modulatedExponent = negative ? -exponent : exponent;
+ // We should never have scale > Integer.MAX_VALUE, as we're always subtracting the non-negative precision of
+ // the encoded BigDecimal, and furthermore we're rounding to negative infinity.
+ assert scale <= Integer.MAX_VALUE;
+ // However, we may end up overflowing on the negative side.
+ if (scale < Integer.MIN_VALUE)
+ {
+ // As scaleByPowerOfTen needs an int scale, do the scaling in two steps.
+ int mv = Integer.MIN_VALUE;
+ value = value.scaleByPowerOfTen(mv);
+ scale -= mv;
+ }
+ final BigDecimal mantissa = value.scaleByPowerOfTen(Ints.checkedCast(scale)).stripTrailingZeros();
+ // We now have a smaller-than-one signed mantissa, and a signed and modulated base-100 exponent.
+ assert mantissa.abs().compareTo(BigDecimal.ONE) < 0;
+
+ return new ByteSource()
+ {
+ // Start with up to 5 bytes for sign + exponent.
+ int exponentBytesLeft = 5;
+ BigDecimal current = mantissa;
+
+ @Override
+ public int next()
+ {
+ if (exponentBytesLeft > 0)
+ {
+ --exponentBytesLeft;
+ if (exponentBytesLeft == 4)
+ {
+ // Skip leading zero bytes in the modulatedExponent.
+ exponentBytesLeft -= Integer.numberOfLeadingZeros(Math.abs(modulatedExponent)) / 8;
+ // Now prepare the leading byte which includes the sign of the number plus the sign and length of the modulatedExponent.
+ int explen = DECIMAL_EXPONENT_LENGTH_HEADER_MASK + (modulatedExponent < 0 ? -exponentBytesLeft : exponentBytesLeft);
+ return explen + (negative ?
NEGATIVE_DECIMAL_HEADER_MASK : POSITIVE_DECIMAL_HEADER_MASK); + } + else + return (modulatedExponent >> (exponentBytesLeft * 8)) & 0xFF; + } + else if (current == null) + { + return END_OF_STREAM; + } + else if (current.compareTo(BigDecimal.ZERO) == 0) + { + current = null; + return 0x00; + } + else + { + BigDecimal v = current.scaleByPowerOfTen(2); + BigDecimal floor = v.setScale(0, RoundingMode.FLOOR); + current = v.subtract(floor); + return floor.byteValueExact() + 0x80; + } + } + }; + } + + @Override + public V fromComparableBytes(ValueAccessor accessor, ByteSource.Peekable comparableBytes, ByteComparable.Version version) + { + if (comparableBytes == null) + return accessor.empty(); + + int headerBits = comparableBytes.next(); + if (headerBits == POSITIVE_DECIMAL_HEADER_MASK) + return accessor.valueOf(ZERO_BUFFER); + + // I. Extract the exponent. + // The sign of the decimal, and the sign and the length (in bytes) of the decimal exponent, are all encoded in + // the first byte. + // Get the sign of the decimal... + boolean isNegative = headerBits < POSITIVE_DECIMAL_HEADER_MASK; + headerBits -= isNegative ? NEGATIVE_DECIMAL_HEADER_MASK : POSITIVE_DECIMAL_HEADER_MASK; + headerBits -= DECIMAL_EXPONENT_LENGTH_HEADER_MASK; + // Get the sign and the length of the exponent (the latter is encoded as its negative if the sign of the + // exponent is negative)... + boolean isExponentNegative = headerBits < 0; + headerBits = isExponentNegative ? -headerBits : headerBits; + // Now consume the exponent bytes. If the exponent is negative and uses less than 4 bytes, the remaining bytes + // should be padded with 1s, in order for the constructed int to contain the correct (negative) exponent value. + // So, if the exponent is negative, we can just start with all bits set to 1 (i.e. we can start with -1). + int exponent = isExponentNegative ? -1 : 0; + for (int i = 0; i < headerBits; ++i) + exponent = (exponent << 8) | comparableBytes.next(); + // The encoded exponent also contains the decimal sign, in order to correctly compare exponents in case of + // negative decimals (e.g. x * 10^y > x * 10^z if x < 0 && y < z). After the decimal sign is "removed", what's + // left is a base-100 exponent following BigDecimal's convention for the exponent sign. + exponent = isNegative ? -exponent : exponent; + + // II. Extract the mantissa as a BigInteger value. It was encoded as a BigDecimal value between 0 and 1, in + // order to be used for comparison (after the sign of the decimal and the sign and the value of the exponent), + // but when decoding we don't need that property on the transient mantissa value. + BigInteger mantissa = BigInteger.ZERO; + int curr = comparableBytes.next(); + while (curr != DECIMAL_LAST_BYTE) + { + // The mantissa value is constructed by a standard positional notation value calculation. + // The value of the next digit is the next most-significant mantissa byte as an unsigned integer, + // offset by a predetermined value (in this case, 0x80)... + int currModified = curr - 0x80; + // ...multiply the current value by the base (in this case, 100)... + mantissa = mantissa.multiply(HUNDRED); + // ...then add the next digit to the modified current value... + mantissa = mantissa.add(BigInteger.valueOf(currModified)); + // ...and finally, adjust the base-100, BigDecimal format exponent accordingly. + --exponent; + curr = comparableBytes.next(); + } + + // III. 
Construct the final BigDecimal value, by combining the mantissa and the exponent, guarding against + // underflow or overflow when exponents are close to their boundary values. + long base10NonBigDecimalFormatExp = 2L * exponent; + // When expressing a sufficiently big decimal, BigDecimal's internal scale value will be negative with very + // big absolute value. To compute the encoded exponent, this internal scale has the number of digits of the + // unscaled value subtracted from it, after which it's divided by 2, rounding down to negative infinity + // (before accounting for the decimal sign). When decoding, this exponent is converted to a base-10 exponent in + // non-BigDecimal format, which means that it can very well overflow Integer.MAX_VALUE. + // For example, see how new BigDecimal(BigInteger.TEN, Integer.MIN_VALUE) is encoded and decoded. + if (base10NonBigDecimalFormatExp > Integer.MAX_VALUE) + { + // If the base-10 exponent will result in an overflow, some of its powers of 10 need to be absorbed by the + // mantissa. How much exactly? As little as needed, in order to avoid complex BigInteger operations, which + // means exactly as much as to have a scale of -Integer.MAX_VALUE. + int exponentReduction = (int) (base10NonBigDecimalFormatExp - Integer.MAX_VALUE); + mantissa = mantissa.multiply(BigInteger.TEN.pow(exponentReduction)); + base10NonBigDecimalFormatExp = Integer.MAX_VALUE; + } + assert base10NonBigDecimalFormatExp >= Integer.MIN_VALUE && base10NonBigDecimalFormatExp <= Integer.MAX_VALUE; + // Here we negate the exponent, as we are not using BigDecimal.scaleByPowerOfTen, where a positive number means + // "multiplying by a positive power of 10", but to BigDecimal's internal scale representation, where a positive + // number means "dividing by a positive power of 10". + byte[] mantissaBytes = mantissa.toByteArray(); + V resultBuf = accessor.allocate(4 + mantissaBytes.length); + accessor.putInt(resultBuf, 0, (int) -base10NonBigDecimalFormatExp); + accessor.copyByteArrayTo(mantissaBytes, 0, resultBuf, 4, mantissaBytes.length); + return resultBuf; + } + public ByteBuffer fromString(String source) throws MarshalException { // Return an empty ByteBuffer for an empty string. 
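For intuition about the scheme above, the normalization that asComparableBytes performs before emitting any bytes can be reproduced with the JDK alone. Below is a minimal, self-contained sketch (the class name and printout are illustrative, not part of this patch); it derives the sign, the base-100 exponent and the smaller-than-one mantissa for the three values used in the javadoc's encoding examples:

import java.math.BigDecimal;

// Illustrative only - not Cassandra code; mirrors the normalization step of DecimalType.asComparableBytes.
public class DecimalNormalizationSketch
{
    public static void main(String[] args)
    {
        BigDecimal[] samples = { new BigDecimal("1.1"), new BigDecimal("-1"), new BigDecimal("-99.9") };
        for (BigDecimal value : samples)
        {
            // Same derivation as in asComparableBytes: round (scale - precision) down to an even
            // number so the exponent is base-100 and the mantissa stays strictly below one.
            long scale = (((long) value.scale()) - value.precision()) & ~1;
            int exponent = Math.toIntExact(-scale >> 1);
            BigDecimal mantissa = value.scaleByPowerOfTen((int) scale).stripTrailingZeros();
            // Prints: 1.1 -> exponent 1, mantissa 0.011; -1 -> exponent 1, mantissa -0.01;
            //         -99.9 -> exponent 1, mantissa -0.999
            System.out.printf("%s -> sign=%d, base-100 exponent=%d, mantissa=%s%n",
                              value.toPlainString(), value.signum(), exponent, mantissa.toPlainString());
        }
    }
}

These (sign, exponent, mantissa) triples are exactly what the javadoc's worked examples serialize, e.g. -99.9 becomes exponent 1 and mantissa -0.999, emitted as the byte sequence 3f ff 1c 8a 00.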
diff --git a/src/java/org/apache/cassandra/db/marshal/DoubleType.java b/src/java/org/apache/cassandra/db/marshal/DoubleType.java index 570d420a75bb..56ae0131b3a9 100644 --- a/src/java/org/apache/cassandra/db/marshal/DoubleType.java +++ b/src/java/org/apache/cassandra/db/marshal/DoubleType.java @@ -27,6 +27,9 @@ import org.apache.cassandra.serializers.MarshalException; import org.apache.cassandra.transport.ProtocolVersion; import org.apache.cassandra.utils.ByteBufferUtil; +import org.apache.cassandra.utils.bytecomparable.ByteComparable; +import org.apache.cassandra.utils.bytecomparable.ByteSource; +import org.apache.cassandra.utils.bytecomparable.ByteSourceInverse; public class DoubleType extends NumberType { @@ -50,6 +53,18 @@ public int compareCustom(VL left, ValueAccessor accessorL, VR right return compareComposed(left, accessorL, right, accessorR, this); } + @Override + public ByteSource asComparableBytes(ValueAccessor accessor, V data, ByteComparable.Version version) + { + return ByteSource.optionalSignedFixedLengthFloat(accessor, data); + } + + @Override + public V fromComparableBytes(ValueAccessor accessor, ByteSource.Peekable comparableBytes, ByteComparable.Version version) + { + return ByteSourceInverse.getOptionalSignedFixedLengthFloat(accessor, comparableBytes, 8); + } + public ByteBuffer fromString(String source) throws MarshalException { // Return an empty ByteBuffer for an empty string. diff --git a/src/java/org/apache/cassandra/db/marshal/DynamicCompositeType.java b/src/java/org/apache/cassandra/db/marshal/DynamicCompositeType.java index 5df36009956e..e7a2360fa990 100644 --- a/src/java/org/apache/cassandra/db/marshal/DynamicCompositeType.java +++ b/src/java/org/apache/cassandra/db/marshal/DynamicCompositeType.java @@ -19,9 +19,16 @@ import java.nio.ByteBuffer; import java.nio.charset.CharacterCodingException; +import java.nio.charset.StandardCharsets; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; import java.util.Map; import java.util.concurrent.ConcurrentHashMap; +import com.google.common.annotations.VisibleForTesting; +import com.google.common.collect.Lists; import com.google.common.collect.Maps; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -34,6 +41,9 @@ import org.apache.cassandra.serializers.TypeSerializer; import org.apache.cassandra.transport.ProtocolVersion; import org.apache.cassandra.utils.ByteBufferUtil; +import org.apache.cassandra.utils.bytecomparable.ByteComparable.Version; +import org.apache.cassandra.utils.bytecomparable.ByteSource; +import org.apache.cassandra.utils.bytecomparable.ByteSourceInverse; import static com.google.common.collect.Iterables.any; @@ -60,7 +70,11 @@ public class DynamicCompositeType extends AbstractCompositeType { private static final Logger logger = LoggerFactory.getLogger(DynamicCompositeType.class); + private static final ByteSource[] EMPTY_BYTE_SOURCE_ARRAY = new ByteSource[0]; + private static final String REVERSED_TYPE = ReversedType.class.getSimpleName(); + private final Map> aliases; + private final Map, Byte> inverseMapping; // interning instances private static final ConcurrentHashMap>, DynamicCompositeType> instances = new ConcurrentHashMap<>(); @@ -81,6 +95,9 @@ public static DynamicCompositeType getInstance(Map> aliase private DynamicCompositeType(Map> aliases) { this.aliases = aliases; + this.inverseMapping = new HashMap<>(); + for (Map.Entry> en : aliases.entrySet()) + this.inverseMapping.put(en.getValue(), en.getKey()); } protected boolean 
readIsStatic(V value, ValueAccessor accessor) @@ -197,6 +214,196 @@ protected AbstractType getAndAppendComparator(int i, V value, ValueAccess } }
+ @Override
+ public ByteSource asComparableBytes(ValueAccessor accessor, V data, Version version)
+ {
+ List srcs = new ArrayList<>();
+ int length = accessor.size(data);
+
+ // statics go first
+ boolean isStatic = readIsStatic(data, accessor);
+ int offset = startingOffset(isStatic);
+ srcs.add(isStatic ? null : ByteSource.EMPTY);
+
+ byte lastEoc = 0;
+ int i = 0;
+ while (offset < length)
+ {
+ // Only the end-of-component byte of the last component of this composite can be non-zero, so the
+ // component before can't have a non-zero end-of-component byte.
+ assert lastEoc == 0 : lastEoc;
+
+ AbstractType comp = getComparator(data, accessor, offset);
+ offset += getComparatorSize(i, data, accessor, offset);
+ // The comparable bytes for the component need to ensure comparisons consistent with
+ // AbstractCompositeType.compareCustom(ByteBuffer, ByteBuffer) and
+ // DynamicCompositeType.getComparator(int, ByteBuffer, ByteBuffer):
+ if (version == Version.LEGACY || !(comp instanceof ReversedType))
+ {
+ // ...most often that means just adding the short name of the type, followed by the full name of the type.
+ srcs.add(ByteSource.of(comp.getClass().getSimpleName(), version));
+ srcs.add(ByteSource.of(comp.getClass().getName(), version));
+ }
+ else
+ {
+ // ...however, sometimes the component uses a complex type (currently the only supported complex type
+ // is ReversedType - we can't have elements that are of MapType, CompositeType, TupleType, etc.)...
+ ReversedType reversedComp = (ReversedType) comp;
+ // ...in this case, we need to add the short name of ReversedType before the short name of the base
+ // type, to ensure consistency with DynamicCompositeType.getComparator(int, ByteBuffer, ByteBuffer).
+ srcs.add(ByteSource.of(REVERSED_TYPE, version));
+ srcs.add(ByteSource.of(reversedComp.baseType.getClass().getSimpleName(), version));
+ srcs.add(ByteSource.of(reversedComp.baseType.getClass().getName(), version));
+ }
+ // Only then the payload of the component gets encoded.
+ int componentLength = accessor.getUnsignedShort(data, offset);
+ offset += 2;
+ srcs.add(comp.asComparableBytes(accessor, accessor.slice(data, offset, componentLength), version));
+ offset += componentLength;
+ // The end-of-component byte also takes part in the comparison, and therefore needs to be encoded.
+ lastEoc = accessor.getByte(data, offset);
+ offset += 1;
+ srcs.add(ByteSource.oneByte(version == Version.LEGACY ? lastEoc : lastEoc & 0xFF ^ 0x80));
+ ++i;
+ }
+
+ return ByteSource.withTerminatorMaybeLegacy(version, ByteSource.END_OF_STREAM, srcs.toArray(EMPTY_BYTE_SOURCE_ARRAY));
+ }
+
+ @Override
+ public V fromComparableBytes(ValueAccessor accessor, ByteSource.Peekable comparableBytes, Version version)
+ {
+ // For ByteComparable.Version.LEGACY the terminator byte is ByteSource.END_OF_STREAM. Just like with
+ // CompositeType, this means that in multi-component sequences the terminator may be transformed to a regular
+ // component separator, but unlike CompositeType (where we have the expected number of types/components),
+ // this can make the end of the whole dynamic composite type indistinguishable from the end of a component
+ // somewhere in the middle of the dynamic composite type. Because of that, DynamicCompositeType elements
+ // cannot always be safely decoded using that encoding version.
+ // Even more so than with CompositeType, we just take advantage of the fact that we don't need to decode from + // Version.LEGACY, assume that we never do that, and assert it here. + assert version != Version.LEGACY; + + if (comparableBytes == null) + return accessor.empty(); + + // The first byte is the isStatic flag which we don't need but must consume to continue past it. + comparableBytes.next(); + + List> types = new ArrayList<>(); + List values = new ArrayList<>(); + byte lastEoc = 0; + + for (int separator = comparableBytes.next(); separator != ByteSource.TERMINATOR; separator = comparableBytes.next()) + { + // Solely the end-of-component byte of the last component of this composite can be non-zero. + assert lastEoc == 0 : lastEoc; + + boolean isReversed = false; + // Decode the next type's simple class name that is encoded before its fully qualified class name (in order + // for comparisons to work correctly). + String simpleClassName = ByteSourceInverse.getString(ByteSourceInverse.nextComponentSource(comparableBytes, separator)); + if (REVERSED_TYPE.equals(simpleClassName)) + { + // Special-handle if the type is reversed (and decode the actual base type simple class name). + isReversed = true; + simpleClassName = ByteSourceInverse.getString(ByteSourceInverse.nextComponentSource(comparableBytes)); + } + + // Decode the type's fully qualified class name and parse the actual type from it. + String fullClassName = ByteSourceInverse.getString(ByteSourceInverse.nextComponentSource(comparableBytes)); + assert fullClassName.endsWith(simpleClassName); + if (isReversed) + fullClassName = REVERSED_TYPE + '(' + fullClassName + ')'; + AbstractType type = TypeParser.parse(fullClassName); + assert type != null; + types.add(type); + + // Decode the payload from this type. + V value = type.fromComparableBytes(accessor, ByteSourceInverse.nextComponentSource(comparableBytes), version); + values.add(value); + + // Also decode the corresponding end-of-component byte - the last one we decode will be taken into + // account when we deserialize the decoded data into an object. + lastEoc = ByteSourceInverse.getSignedByte(ByteSourceInverse.nextComponentSource(comparableBytes)); + } + return build(accessor, types, inverseMapping, values, lastEoc); + } + + public static ByteBuffer build(List types, List values) + { + return build(ByteBufferAccessor.instance, + Lists.transform(types, TypeParser::parse), + Collections.emptyMap(), + values, + (byte) 0); + } + + @VisibleForTesting + public static V build(ValueAccessor accessor, + List> types, + Map, Byte> inverseMapping, + List values, + byte lastEoc) + { + assert types.size() == values.size(); + + int numComponents = types.size(); + // Compute the total number of bytes that we'll need to store the types and their payloads. + int totalLength = 0; + for (int i = 0; i < numComponents; ++i) + { + AbstractType type = types.get(i); + Byte alias = inverseMapping.get(type); + int typeNameLength = alias == null ? type.toString().getBytes(StandardCharsets.UTF_8).length : 0; + // The type data will be stored by means of the type's fully qualified name, not by aliasing, so: + // 1. The type data header should be the fully qualified name length in bytes. + // 2. The length should be small enough so that it fits in 15 bits (2 bytes with the first bit zero). + assert typeNameLength <= 0x7FFF; + int valueLength = accessor.size(values.get(i)); + // The value length should also expect its first bit to be 0, as the length should be stored as a signed + // 2-byte value (short). 
+ assert valueLength <= 0x7FFF; + totalLength += 2 + typeNameLength + 2 + valueLength + 1; + } + + V result = accessor.allocate(totalLength); + int offset = 0; + for (int i = 0; i < numComponents; ++i) + { + AbstractType type = types.get(i); + Byte alias = inverseMapping.get(type); + if (alias == null) + { + // Write the type data (2-byte length header + the fully qualified type name in UTF-8). + byte[] typeNameBytes = type.toString().getBytes(StandardCharsets.UTF_8); + accessor.putShort(result, + offset, + (short) typeNameBytes.length); // this should work fine also if length >= 32768 + offset += 2; + accessor.copyByteArrayTo(typeNameBytes, 0, result, offset, typeNameBytes.length); + offset += typeNameBytes.length; + } + else + { + accessor.putShort(result, offset, (short) (alias | 0x8000)); + offset += 2; + } + + // Write the type payload data (2-byte length header + the payload). + V value = values.get(i); + int bytesToCopy = accessor.size(value); + accessor.putShort(result, offset, (short) bytesToCopy); + offset += 2; + accessor.copyTo(value, 0, result, accessor, offset, bytesToCopy); + offset += bytesToCopy; + + // Write the end-of-component byte. + accessor.putByte(result, offset, i != numComponents - 1 ? (byte) 0 : lastEoc); + offset += 1; + } + return result; + } + protected ParsedComparator parseComparator(int i, String part) { return new DynamicParsedComparator(part); diff --git a/src/java/org/apache/cassandra/db/marshal/EmptyType.java b/src/java/org/apache/cassandra/db/marshal/EmptyType.java index 357b6e85ad15..dcc57b7c4a75 100644 --- a/src/java/org/apache/cassandra/db/marshal/EmptyType.java +++ b/src/java/org/apache/cassandra/db/marshal/EmptyType.java @@ -33,6 +33,8 @@ import org.apache.cassandra.serializers.TypeSerializer; import org.apache.cassandra.transport.ProtocolVersion; import org.apache.cassandra.utils.ByteBufferUtil; +import org.apache.cassandra.utils.bytecomparable.ByteComparable; +import org.apache.cassandra.utils.bytecomparable.ByteSource; import org.apache.cassandra.utils.NoSpamLogger; /** @@ -68,6 +70,18 @@ private static NonEmptyWriteBehavior parseNonEmptyWriteBehavior() private EmptyType() {super(ComparisonType.CUSTOM);} // singleton + @Override + public ByteSource asComparableBytes(ValueAccessor accessor, V data, ByteComparable.Version version) + { + return null; + } + + @Override + public V fromComparableBytes(ValueAccessor accessor, ByteSource.Peekable comparableBytes, ByteComparable.Version version) + { + return accessor.empty(); + } + public int compareCustom(VL left, ValueAccessor accessorL, VR right, ValueAccessor accessorR) { return 0; diff --git a/src/java/org/apache/cassandra/db/marshal/FloatType.java b/src/java/org/apache/cassandra/db/marshal/FloatType.java index 35abee0f98ed..2adb127d4194 100644 --- a/src/java/org/apache/cassandra/db/marshal/FloatType.java +++ b/src/java/org/apache/cassandra/db/marshal/FloatType.java @@ -27,6 +27,9 @@ import org.apache.cassandra.serializers.MarshalException; import org.apache.cassandra.transport.ProtocolVersion; import org.apache.cassandra.utils.ByteBufferUtil; +import org.apache.cassandra.utils.bytecomparable.ByteComparable; +import org.apache.cassandra.utils.bytecomparable.ByteSource; +import org.apache.cassandra.utils.bytecomparable.ByteSourceInverse; public class FloatType extends NumberType @@ -51,6 +54,18 @@ public int compareCustom(VL left, ValueAccessor accessorL, VR right return compareComposed(left, accessorL, right, accessorR, this); } + @Override + public ByteSource asComparableBytes(ValueAccessor 
accessor, V data, ByteComparable.Version version) + { + return ByteSource.optionalSignedFixedLengthFloat(accessor, data); + } + + @Override + public V fromComparableBytes(ValueAccessor accessor, ByteSource.Peekable comparableBytes, ByteComparable.Version version) + { + return ByteSourceInverse.getOptionalSignedFixedLengthFloat(accessor, comparableBytes, 4); + } + public ByteBuffer fromString(String source) throws MarshalException { // Return an empty ByteBuffer for an empty string. diff --git a/src/java/org/apache/cassandra/db/marshal/Int32Type.java b/src/java/org/apache/cassandra/db/marshal/Int32Type.java index 98f4c83cf64c..6dee26e22423 100644 --- a/src/java/org/apache/cassandra/db/marshal/Int32Type.java +++ b/src/java/org/apache/cassandra/db/marshal/Int32Type.java @@ -28,6 +28,9 @@ import org.apache.cassandra.transport.ProtocolVersion; import org.apache.cassandra.serializers.TypeSerializer; import org.apache.cassandra.utils.ByteBufferUtil; +import org.apache.cassandra.utils.bytecomparable.ByteComparable; +import org.apache.cassandra.utils.bytecomparable.ByteSource; +import org.apache.cassandra.utils.bytecomparable.ByteSourceInverse; public class Int32Type extends NumberType { @@ -55,6 +58,18 @@ public int compareCustom(VL left, ValueAccessor accessorL, VR right return ValueAccessor.compare(left, accessorL, right, accessorR); } + @Override + public ByteSource asComparableBytes(ValueAccessor accessor, V data, ByteComparable.Version version) + { + return ByteSource.optionalSignedFixedLengthNumber(accessor, data); + } + + @Override + public V fromComparableBytes(ValueAccessor accessor, ByteSource.Peekable comparableBytes, ByteComparable.Version version) + { + return ByteSourceInverse.getOptionalSignedFixedLength(accessor, comparableBytes, 4); + } + public ByteBuffer fromString(String source) throws MarshalException { // Return an empty ByteBuffer for an empty string. diff --git a/src/java/org/apache/cassandra/db/marshal/IntegerType.java b/src/java/org/apache/cassandra/db/marshal/IntegerType.java index 4c913d50afee..b52bda890017 100644 --- a/src/java/org/apache/cassandra/db/marshal/IntegerType.java +++ b/src/java/org/apache/cassandra/db/marshal/IntegerType.java @@ -30,11 +30,23 @@ import org.apache.cassandra.serializers.MarshalException; import org.apache.cassandra.transport.ProtocolVersion; import org.apache.cassandra.utils.ByteBufferUtil; +import org.apache.cassandra.utils.bytecomparable.ByteComparable; +import org.apache.cassandra.utils.bytecomparable.ByteSource; +import org.apache.cassandra.utils.bytecomparable.ByteSourceInverse; public final class IntegerType extends NumberType { public static final IntegerType instance = new IntegerType(); + // Constants or escaping values needed to encode/decode variable-length integers in our custom byte-ordered + // encoding scheme. + private static final int POSITIVE_VARINT_HEADER = 0x80; + private static final int NEGATIVE_VARINT_LENGTH_HEADER = 0x00; + private static final int POSITIVE_VARINT_LENGTH_HEADER = 0xFF; + private static final byte BIG_INTEGER_NEGATIVE_LEADING_ZERO = (byte) 0xFF; + private static final byte BIG_INTEGER_POSITIVE_LEADING_ZERO = (byte) 0x00; + public static final int FULL_FORM_THRESHOLD = 7; + private static int findMostSignificantByte(V value, ValueAccessor accessor) { int len = accessor.size(value) - 1; @@ -131,6 +143,301 @@ public static int compareIntegers(VL lhs, ValueAccessor accessorL, return 0; } + /** + * Constructs a byte-comparable representation of the number. 
+     *
+     * In the current format we represent it:
+     *    directly as varint, if the length is 6 or smaller (the encoding has non-00/FF first byte)
+     *    <sign><length - 7><7 or more bytes>, otherwise
+     * where <sign> is 00 for negative numbers and FF for positive ones, and the length's bytes are inverted if
+     * the number is negative (so that longer length sorts smaller).
+     *
+     * Because we present the sign separately, we don't need to include 0x00 prefix for positive integers whose first
+     * byte is >= 0x80 or 0xFF prefix for negative integers whose first byte is < 0x80. Note that we do this before
+     * taking the length for the purposes of choosing between varint and full-form encoding.
+     *
+     * The representations are prefix-free, because the choice between varint and full-form encoding is determined by
+     * the first byte where varints are properly ordered between full-form negative and full-form positive, varint
+     * encoding is prefix-free, and full-form representations of different length always have length bytes that differ.
+     *
+     * Examples:
+     *      -1 as 7F
+     *      0 as 80
+     *      1 as 81
+     *      127 as C07F
+     *      255 as C0FF
+     *      2^32-1 as F8FFFFFFFF
+     *      2^32 as F900000000
+     *      2^56-1 as FEFFFFFFFFFFFFFF
+     *      2^56 as FF000100000000000000
+     *
+     * See {@link #asComparableBytesLegacy} for description of the legacy format.
+     */
+    @Override
+    public <V> ByteSource asComparableBytes(ValueAccessor<V> accessor, V data, ByteComparable.Version version)
+    {
+        final int limit = accessor.size(data);
+        if (limit == 0)
+            return null;
+
+        // skip any leading sign-only byte(s)
+        int p = 0;
+        final byte signbyte = accessor.getByte(data, p);
+        if (signbyte == BIG_INTEGER_NEGATIVE_LEADING_ZERO || signbyte == BIG_INTEGER_POSITIVE_LEADING_ZERO)
+        {
+            while (p + 1 < limit)
+            {
+                if (accessor.getByte(data, ++p) != signbyte)
+                    break;
+            }
+        }
+
+        if (version != ByteComparable.Version.LEGACY)
+            return (limit - p < FULL_FORM_THRESHOLD)
+                   ? encodeAsVarInt(accessor, data, limit)
+                   : asComparableBytesCurrent(accessor, data, p, limit, (signbyte >> 7) & 0xFF);
+        else
+            return asComparableBytesLegacy(accessor, data, p, limit, signbyte);
+    }
+
+    /**
+     * Encode the BigInteger stored in the given buffer as a variable-length signed integer.
+     * The length of the number is given in the limit argument, and must be <= 8.
+     */
+    private <V> ByteSource encodeAsVarInt(ValueAccessor<V> accessor, V data, int limit)
+    {
+        long v;
+        switch (limit)
+        {
+            case 1:
+                v = accessor.getByte(data, 0);
+                break;
+            case 2:
+                v = accessor.getShort(data, 0);
+                break;
+            case 3:
+                v = (accessor.getShort(data, 0) << 8) | (accessor.getByte(data, 2) & 0xFF);
+                break;
+            case 4:
+                v = accessor.getInt(data, 0);
+                break;
+            case 5:
+                v = ((long) accessor.getInt(data, 0) << 8) | (accessor.getByte(data, 4) & 0xFF);
+                break;
+            case 6:
+                v = ((long) accessor.getInt(data, 0) << 16) | (accessor.getShort(data, 4) & 0xFFFF);
+                break;
+            case 7:
+                v = ((long) accessor.getInt(data, 0) << 24) | ((accessor.getShort(data, 4) & 0xFFFF) << 8) | (accessor.getByte(data, 6) & 0xFF);
+                break;
+            case 8:
+                // This is not reachable within the encoding; added for completeness.
+                v = accessor.getLong(data, 0);
+                break;
+            default:
+                throw new AssertionError();
+        }
+        return ByteSource.variableLengthInteger(v);
+    }
+
+    /**
+     * Constructs a full-form byte-comparable representation of the number in the current format.
+     *
+     * This contains:
+     *    <sign><length - 7><7 or more bytes>
+     * where <sign> is 00 for negative numbers and FF for positive ones, and the length's bytes are inverted if
+     * the number is negative (so that longer length sorts smaller).
+     *
+     * Because we present the sign separately, we don't need to include 0x00 prefix for positive integers whose first
+     * byte is >= 0x80 or 0xFF prefix for negative integers whose first byte is < 0x80.
+     *
+     * The representations are prefix-free, because representations of different length always have length bytes that
+     * differ.
+     */
+    private <V> ByteSource asComparableBytesCurrent(ValueAccessor<V> accessor, V data, int startpos, int limit, int signbyte)
+    {
+        // start with sign as a byte, then variable-length-encoded length, then bytes (stripped leading sign)
+        return new ByteSource()
+        {
+            int pos = -2;
+            ByteSource lengthEncoding = new VariableLengthUnsignedInteger(limit - startpos - FULL_FORM_THRESHOLD);
+
+            @Override
+            public int next()
+            {
+                if (pos == -2)
+                {
+                    ++pos;
+                    return signbyte ^ 0xFF; // 00 for negative/FF for positive (01-FE for direct varint encoding)
+                }
+                else if (pos == -1)
+                {
+                    int nextByte = lengthEncoding.next();
+                    if (nextByte != END_OF_STREAM)
+                        return nextByte ^ signbyte;
+                    pos = startpos;
+                }
+
+                if (pos == limit)
+                    return END_OF_STREAM;
+
+                return accessor.getByte(data, pos++) & 0xFF;
+            }
+        };
+    }
+
+    /**
+     * Constructs a byte-comparable representation of the number in the legacy format.
+     * We represent it as
+     *    <length_byte><bytes>
+     * where a length_byte is:
+     * - 0x80 + (length - 1) for positive numbers (so that longer length sorts bigger)
+     * - 0x7F - (length - 1) for negative numbers (so that longer length sorts smaller)
+     *
+     * Because we include the sign in the length byte:
+     * - unlike fixed-length ints, we don't need to sign-invert the first significant byte,
+     * - unlike BigInteger, we don't need to include 0x00 prefix for positive integers whose first byte is >= 0x80
+     *   or 0xFF prefix for negative integers whose first byte is < 0x80.
+     *
+     * The representations are prefix-free, because representations of different length always have length bytes that
+     * differ.
+     *
+     * Examples:
+     *      0 as 8000
+     *      1 as 8001
+     *      127 as 807F
+     *      255 as 80FF
+     *      2^31-1 as 837FFFFFFF
+     *      2^31 as 8380000000
+     *      2^32 as 840100000000
+     */
+    private <V> ByteSource asComparableBytesLegacy(ValueAccessor<V> accessor, V data, int startpos, int limit, int signbyte)
+    {
+        return new ByteSource()
+        {
+            int pos = startpos;
+            int sizeToReport = limit - startpos;
+            boolean sizeReported = false;
+
+            public int next()
+            {
+                if (!sizeReported)
+                {
+                    if (sizeToReport >= 128)
+                    {
+                        sizeToReport -= 128;
+                        return signbyte >= 0
+                               ? POSITIVE_VARINT_LENGTH_HEADER
+                               : NEGATIVE_VARINT_LENGTH_HEADER;
+                    }
+                    else
+                    {
+                        sizeReported = true;
+                        return signbyte >= 0
+                               ? POSITIVE_VARINT_HEADER + (sizeToReport - 1)
+                               : POSITIVE_VARINT_HEADER - sizeToReport;
+                    }
+                }
+
+                if (pos == limit)
+                    return END_OF_STREAM;
+
+                return accessor.getByte(data, pos++) & 0xFF;
+            }
+        };
+    }
+
+    @Override
+    public <V> V fromComparableBytes(ValueAccessor<V> accessor, ByteSource.Peekable comparableBytes, ByteComparable.Version version)
+    {
+        assert version != ByteComparable.Version.LEGACY;
+        if (comparableBytes == null)
+            return accessor.empty();
+
+        // Consume the first byte to determine whether the encoded number is positive and
+        // start iterating through the length header bytes and collecting the number of value bytes.
+ int sign = comparableBytes.peek() ^ 0xFF; // FF if negative, 00 if positive + if (sign != 0xFF && sign != 0x00) + return extractVarIntBytes(accessor, ByteSourceInverse.getVariableLengthInteger(comparableBytes)); + + // consume the sign byte + comparableBytes.next(); + + // Read the length (inverted if the number is negative) + int valueBytes = Math.toIntExact(ByteSourceInverse.getVariableLengthUnsignedIntegerXoring(comparableBytes, sign) + FULL_FORM_THRESHOLD); + // Get the bytes. + return extractBytes(accessor, comparableBytes, sign, valueBytes); + } + + private V extractVarIntBytes(ValueAccessor accessor, long value) + { + int length = (64 - Long.numberOfLeadingZeros(value ^ (value >> 63)) + 8) / 8; // number of bytes needed: 7 bits -> one byte, 8 bits -> 2 bytes + V buf = accessor.allocate(length); + switch (length) + { + case 1: + accessor.putByte(buf, 0, (byte) value); + break; + case 2: + accessor.putShort(buf, 0, (short) value); + break; + case 3: + accessor.putShort(buf, 0, (short) (value >> 8)); + accessor.putByte(buf, 2, (byte) value); + break; + case 4: + accessor.putInt(buf, 0, (int) value); + break; + case 5: + accessor.putInt(buf, 0, (int) (value >> 8)); + accessor.putByte(buf, 4, (byte) value); + break; + case 6: + accessor.putInt(buf, 0, (int) (value >> 16)); + accessor.putShort(buf, 4, (short) value); + break; + case 7: + accessor.putInt(buf, 0, (int) (value >> 24)); + accessor.putShort(buf, 4, (short) (value >> 8)); + accessor.putByte(buf, 6, (byte) value); + break; + case 8: + // This is not reachable within the encoding; added for completeness. + accessor.putLong(buf, 0, value); + break; + default: + throw new AssertionError(); + } + return buf; + } + + private V extractBytes(ValueAccessor accessor, ByteSource.Peekable comparableBytes, int sign, int valueBytes) + { + int writtenBytes = 0; + V buf; + // Add "leading zero" if needed (i.e. in case the leading byte of a positive number corresponds to a negative + // value, or in case the leading byte of a negative number corresponds to a non-negative value). + // Size the array containing all the value bytes accordingly. + int curr = comparableBytes.next(); + if ((curr & 0x80) != (sign & 0x80)) + { + ++valueBytes; + buf = accessor.allocate(valueBytes); + accessor.putByte(buf, writtenBytes++, (byte) sign); + } + else + buf = accessor.allocate(valueBytes); + // Don't forget to add the first consumed value byte after determining whether leading zero should be added + // and sizing the value bytes array. + accessor.putByte(buf, writtenBytes++, (byte) curr); + + // Consume exactly the number of expected value bytes. + while (writtenBytes < valueBytes) + accessor.putByte(buf, writtenBytes++, (byte) comparableBytes.next()); + + return buf; + } + public ByteBuffer fromString(String source) throws MarshalException { // Return an empty ByteBuffer for an empty string. 
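A quick standalone illustration of the sign-byte stripping above (separate from the patch; the class name is invented): java.math.BigInteger's serialized form keeps a leading sign-only byte whenever the first significant byte would otherwise flip the sign, and since this encoding carries the sign separately, those bytes can be dropped before choosing between the varint and full-form paths.

import java.math.BigInteger;

public class LeadingSignByteDemo
{
    public static void main(String[] args)
    {
        // 255 needs a 0x00 sign byte in two's complement: {0x00, 0xFF}.
        byte[] bytes = BigInteger.valueOf(255).toByteArray();
        System.out.println(bytes.length + " bytes; first = " + bytes[0]); // 2 bytes; first = 0
        // After skipping the sign-only byte the significant length is 1, well under
        // FULL_FORM_THRESHOLD (7), so 255 takes the compact varint path (C0FF in the examples).
        System.out.println("significant length = " + (bytes.length - 1));
    }
}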
diff --git a/src/java/org/apache/cassandra/db/marshal/LexicalUUIDType.java b/src/java/org/apache/cassandra/db/marshal/LexicalUUIDType.java index 6dd41616f04d..81ec9d9a566e 100644 --- a/src/java/org/apache/cassandra/db/marshal/LexicalUUIDType.java +++ b/src/java/org/apache/cassandra/db/marshal/LexicalUUIDType.java @@ -26,6 +26,9 @@ import org.apache.cassandra.serializers.MarshalException; import org.apache.cassandra.serializers.UUIDSerializer; import org.apache.cassandra.utils.ByteBufferUtil; +import org.apache.cassandra.utils.bytecomparable.ByteComparable; +import org.apache.cassandra.utils.bytecomparable.ByteSource; +import org.apache.cassandra.utils.bytecomparable.ByteSourceInverse; public class LexicalUUIDType extends AbstractType { @@ -48,6 +51,46 @@ public int compareCustom(VL left, ValueAccessor accessorL, VR right return accessorL.toUUID(left).compareTo(accessorR.toUUID(right)); } + @Override + public ByteSource asComparableBytes(ValueAccessor accessor, V data, ByteComparable.Version version) + { + if (data == null || accessor.isEmpty(data)) + return null; + + // fixed-length (hence prefix-free) representation, but + // we have to sign-flip the highest bytes of the two longs + return new ByteSource() + { + int bufpos = 0; + + public int next() + { + if (bufpos >= accessor.size(data)) + return END_OF_STREAM; + int v = accessor.getByte(data, bufpos) & 0xFF; + if (bufpos == 0 || bufpos == 8) + v ^= 0x80; + ++bufpos; + return v; + } + }; + } + + @Override + public V fromComparableBytes(ValueAccessor accessor, ByteSource.Peekable comparableBytes, ByteComparable.Version version) + { + // Optional-style encoding of empty values as null sources + if (comparableBytes == null) + return accessor.empty(); + + long hiBits = ByteSourceInverse.getSignedLong(comparableBytes); + long loBits = ByteSourceInverse.getSignedLong(comparableBytes); + + // Lexical UUIDs are stored as just two signed longs. The decoding of these longs flips their sign bit back, so + // they can directly be used for constructing the original UUID. + return UUIDType.makeUuidBytes(accessor, hiBits, loBits); + } + public ByteBuffer fromString(String source) throws MarshalException { // Return an empty ByteBuffer for an empty string. 
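To see why the two sign-bit flips above suffice, here is a minimal self-contained sketch (not part of the patch; names are invented) showing that flipping bit 63 of each long makes an unsigned byte-by-byte scan of the 16 bytes agree with java.util.UUID.compareTo, which compares the two halves as signed longs.

import java.nio.ByteBuffer;
import java.util.UUID;

public class LexicalUuidOrderDemo
{
    // Encode a UUID as 16 bytes with the sign bit of each long flipped,
    // mirroring the special-casing of byte offsets 0 and 8 above.
    static byte[] encode(UUID u)
    {
        ByteBuffer b = ByteBuffer.allocate(16);
        b.putLong(0, u.getMostSignificantBits() ^ Long.MIN_VALUE);
        b.putLong(8, u.getLeastSignificantBits() ^ Long.MIN_VALUE);
        return b.array();
    }

    // Unsigned lexicographic comparison, as a byte-ordered store would do.
    static int compareUnsigned(byte[] a, byte[] b)
    {
        for (int i = 0; i < a.length; ++i)
        {
            int cmp = Integer.compare(a[i] & 0xFF, b[i] & 0xFF);
            if (cmp != 0)
                return cmp;
        }
        return 0;
    }

    public static void main(String[] args)
    {
        UUID small = new UUID(-1L, 0L); // negative msb sorts first under signed comparison
        UUID big = new UUID(1L, 0L);
        if (small.compareTo(big) >= 0 || compareUnsigned(encode(small), encode(big)) >= 0)
            throw new AssertionError("orders disagree");
        System.out.println("byte order matches UUID.compareTo");
    }
}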
diff --git a/src/java/org/apache/cassandra/db/marshal/ListType.java b/src/java/org/apache/cassandra/db/marshal/ListType.java index 281f7ee4cbb2..f795def3a784 100644 --- a/src/java/org/apache/cassandra/db/marshal/ListType.java +++ b/src/java/org/apache/cassandra/db/marshal/ListType.java @@ -18,21 +18,24 @@ package org.apache.cassandra.db.marshal; import java.nio.ByteBuffer; -import java.util.*; +import java.util.ArrayList; +import java.util.Collections; +import java.util.Iterator; +import java.util.List; import java.util.concurrent.ConcurrentHashMap; import org.apache.cassandra.cql3.Json; import org.apache.cassandra.cql3.Lists; import org.apache.cassandra.cql3.Term; -import org.apache.cassandra.db.TypeSizes; import org.apache.cassandra.db.rows.Cell; import org.apache.cassandra.exceptions.ConfigurationException; import org.apache.cassandra.exceptions.SyntaxException; -import org.apache.cassandra.serializers.CollectionSerializer; import org.apache.cassandra.serializers.ListSerializer; import org.apache.cassandra.serializers.MarshalException; import org.apache.cassandra.utils.TimeUUID; import org.apache.cassandra.transport.ProtocolVersion; +import org.apache.cassandra.utils.bytecomparable.ByteComparable.Version; +import org.apache.cassandra.utils.bytecomparable.ByteSource; public class ListType extends CollectionType> { @@ -171,29 +174,16 @@ public int compareCustom(VL left, ValueAccessor accessorL, VR right return compareListOrSet(elements, left, accessorL, right, accessorR); } - static int compareListOrSet(AbstractType elementsComparator, VL left, ValueAccessor accessorL, VR right, ValueAccessor accessorR) + @Override + public ByteSource asComparableBytes(ValueAccessor accessor, V data, Version version) { - // Note that this is only used if the collection is frozen - if (accessorL.isEmpty(left) || accessorR.isEmpty(right)) - return Boolean.compare(accessorR.isEmpty(right), accessorL.isEmpty(left)); - - int sizeL = CollectionSerializer.readCollectionSize(left, accessorL, ProtocolVersion.V3); - int offsetL = CollectionSerializer.sizeOfCollectionSize(sizeL, ProtocolVersion.V3); - int sizeR = CollectionSerializer.readCollectionSize(right, accessorR, ProtocolVersion.V3); - int offsetR = TypeSizes.INT_SIZE; - - for (int i = 0; i < Math.min(sizeL, sizeR); i++) - { - VL v1 = CollectionSerializer.readValue(left, accessorL, offsetL, ProtocolVersion.V3); - offsetL += CollectionSerializer.sizeOfValue(v1, accessorL, ProtocolVersion.V3); - VR v2 = CollectionSerializer.readValue(right, accessorR, offsetR, ProtocolVersion.V3); - offsetR += CollectionSerializer.sizeOfValue(v2, accessorR, ProtocolVersion.V3); - int cmp = elementsComparator.compare(v1, accessorL, v2, accessorR); - if (cmp != 0) - return cmp; - } + return asComparableBytesListOrSet(getElementsType(), accessor, data, version); + } - return sizeL == sizeR ? 0 : (sizeL < sizeR ? 
-1 : 1); + @Override + public V fromComparableBytes(ValueAccessor accessor, ByteSource.Peekable comparableBytes, Version version) + { + return fromComparableBytesListOrSet(accessor, comparableBytes, version, getElementsType()); } @Override @@ -242,23 +232,6 @@ public Term fromJSONObject(Object parsed) throws MarshalException return new Lists.DelayedValue(terms); } - public static String setOrListToJsonString(ByteBuffer buffer, AbstractType elementsType, ProtocolVersion protocolVersion) - { - ByteBuffer value = buffer.duplicate(); - StringBuilder sb = new StringBuilder("["); - int size = CollectionSerializer.readCollectionSize(value, protocolVersion); - int offset = CollectionSerializer.sizeOfCollectionSize(size, protocolVersion); - for (int i = 0; i < size; i++) - { - if (i > 0) - sb.append(", "); - ByteBuffer element = CollectionSerializer.readValue(value, ByteBufferAccessor.instance, offset, protocolVersion); - offset += CollectionSerializer.sizeOfValue(element, ByteBufferAccessor.instance, protocolVersion); - sb.append(elementsType.toJSONString(element, protocolVersion)); - } - return sb.append("]").toString(); - } - public ByteBuffer getSliceFromSerialized(ByteBuffer collection, ByteBuffer from, ByteBuffer to) { // We don't support slicing on lists so we don't need that function diff --git a/src/java/org/apache/cassandra/db/marshal/LongType.java b/src/java/org/apache/cassandra/db/marshal/LongType.java index ad539f70de70..6bf5e9e66990 100644 --- a/src/java/org/apache/cassandra/db/marshal/LongType.java +++ b/src/java/org/apache/cassandra/db/marshal/LongType.java @@ -28,6 +28,9 @@ import org.apache.cassandra.serializers.MarshalException; import org.apache.cassandra.transport.ProtocolVersion; import org.apache.cassandra.utils.ByteBufferUtil; +import org.apache.cassandra.utils.bytecomparable.ByteComparable; +import org.apache.cassandra.utils.bytecomparable.ByteSource; +import org.apache.cassandra.utils.bytecomparable.ByteSourceInverse; public class LongType extends NumberType { @@ -57,6 +60,28 @@ public static int compareLongs(VL left, ValueAccessor accessorL, VR return ValueAccessor.compare(left, accessorL, right, accessorR); } + @Override + public ByteSource asComparableBytes(ValueAccessor accessor, V data, ByteComparable.Version version) + { + if (accessor.isEmpty(data)) + return null; + if (version == ByteComparable.Version.LEGACY) + return ByteSource.signedFixedLengthNumber(accessor, data); + else + return ByteSource.variableLengthInteger(accessor.getLong(data, 0)); + } + + @Override + public V fromComparableBytes(ValueAccessor accessor, ByteSource.Peekable comparableBytes, ByteComparable.Version version) + { + if (comparableBytes == null) + return accessor.empty(); + if (version == ByteComparable.Version.LEGACY) + return ByteSourceInverse.getSignedFixedLength(accessor, comparableBytes, 8); + else + return accessor.valueOf(ByteSourceInverse.getVariableLengthInteger(comparableBytes)); + } + public ByteBuffer fromString(String source) throws MarshalException { // Return an empty ByteBuffer for an empty string. 
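The fixed-length (LEGACY) branch above rests on a standard trick worth spelling out: XOR-ing a signed long with Long.MIN_VALUE flips the sign bit, turning signed order into unsigned order, which is exactly what a big-endian byte-by-byte comparison sees. A small self-contained check (illustration only, not part of the patch):

public class SignFlipOrderDemo
{
    public static void main(String[] args)
    {
        long[] ascending = { Long.MIN_VALUE, -2, -1, 0, 1, 42, Long.MAX_VALUE };
        for (int i = 0; i + 1 < ascending.length; ++i)
        {
            long a = ascending[i] ^ Long.MIN_VALUE;     // flip the sign bit
            long b = ascending[i + 1] ^ Long.MIN_VALUE;
            if (Long.compareUnsigned(a, b) >= 0)
                throw new AssertionError("order not preserved at index " + i);
        }
        System.out.println("signed order preserved under unsigned comparison");
    }
}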
diff --git a/src/java/org/apache/cassandra/db/marshal/MapType.java b/src/java/org/apache/cassandra/db/marshal/MapType.java index 9473e2913618..be74ff1626fe 100644 --- a/src/java/org/apache/cassandra/db/marshal/MapType.java +++ b/src/java/org/apache/cassandra/db/marshal/MapType.java @@ -28,9 +28,13 @@ import org.apache.cassandra.exceptions.ConfigurationException; import org.apache.cassandra.exceptions.SyntaxException; import org.apache.cassandra.serializers.CollectionSerializer; -import org.apache.cassandra.serializers.MarshalException; import org.apache.cassandra.serializers.MapSerializer; +import org.apache.cassandra.serializers.MarshalException; import org.apache.cassandra.transport.ProtocolVersion; +import org.apache.cassandra.utils.bytecomparable.ByteComparable; +import org.apache.cassandra.utils.bytecomparable.ByteComparable.Version; +import org.apache.cassandra.utils.bytecomparable.ByteSource; +import org.apache.cassandra.utils.bytecomparable.ByteSourceInverse; import org.apache.cassandra.utils.Pair; public class MapType extends CollectionType> @@ -215,7 +219,71 @@ public static int compareMaps(AbstractType keysComparator, AbstractT return cmp; } - return sizeL == sizeR ? 0 : (sizeL < sizeR ? -1 : 1); + return Integer.compare(sizeL, sizeR); + } + + @Override + public ByteSource asComparableBytes(ValueAccessor accessor, V data, Version version) + { + return asComparableBytesMap(getKeysType(), getValuesType(), accessor, data, version); + } + + @Override + public V fromComparableBytes(ValueAccessor accessor, ByteSource.Peekable comparableBytes, Version version) + { + return fromComparableBytesMap(accessor, comparableBytes, version, getKeysType(), getValuesType()); + } + + static ByteSource asComparableBytesMap(AbstractType keysComparator, + AbstractType valuesComparator, + ValueAccessor accessor, + V data, + Version version) + { + if (accessor.isEmpty(data)) + return null; + + ProtocolVersion protocolVersion = ProtocolVersion.V3; + int offset = 0; + int size = CollectionSerializer.readCollectionSize(data, accessor, protocolVersion); + offset += CollectionSerializer.sizeOfCollectionSize(size, protocolVersion); + ByteSource[] srcs = new ByteSource[size * 2]; + for (int i = 0; i < size; ++i) + { + V k = CollectionSerializer.readValue(data, accessor, offset, protocolVersion); + offset += CollectionSerializer.sizeOfValue(k, accessor, protocolVersion); + srcs[i * 2 + 0] = keysComparator.asComparableBytes(accessor, k, version); + V v = CollectionSerializer.readValue(data, accessor, offset, protocolVersion); + offset += CollectionSerializer.sizeOfValue(v, accessor, protocolVersion); + srcs[i * 2 + 1] = valuesComparator.asComparableBytes(accessor, v, version); + } + return ByteSource.withTerminatorMaybeLegacy(version, 0x00, srcs); + } + + static V fromComparableBytesMap(ValueAccessor accessor, + ByteSource.Peekable comparableBytes, + Version version, + AbstractType keysComparator, + AbstractType valuesComparator) + { + if (comparableBytes == null) + return accessor.empty(); + assert version != ByteComparable.Version.LEGACY; // legacy translation is not reversible + + List buffers = new ArrayList<>(); + int separator = comparableBytes.next(); + while (separator != ByteSource.TERMINATOR) + { + buffers.add(ByteSourceInverse.nextComponentNull(separator) + ? null + : keysComparator.fromComparableBytes(accessor, comparableBytes, version)); + separator = comparableBytes.next(); + buffers.add(ByteSourceInverse.nextComponentNull(separator) + ? 
null + : valuesComparator.fromComparableBytes(accessor, comparableBytes, version)); + separator = comparableBytes.next(); + } + return CollectionSerializer.pack(buffers, accessor,buffers.size() / 2, ProtocolVersion.V3); } @Override @@ -286,7 +354,7 @@ public String toJSONString(ByteBuffer buffer, ProtocolVersion protocolVersion) { ByteBuffer value = buffer.duplicate(); StringBuilder sb = new StringBuilder("{"); - int size = CollectionSerializer.readCollectionSize(value, protocolVersion); + int size = CollectionSerializer.readCollectionSize(value, ByteBufferAccessor.instance, protocolVersion); int offset = CollectionSerializer.sizeOfCollectionSize(size, protocolVersion); for (int i = 0; i < size; i++) { diff --git a/src/java/org/apache/cassandra/db/marshal/PartitionerDefinedOrder.java b/src/java/org/apache/cassandra/db/marshal/PartitionerDefinedOrder.java index 89241b416bb4..02c28e751348 100644 --- a/src/java/org/apache/cassandra/db/marshal/PartitionerDefinedOrder.java +++ b/src/java/org/apache/cassandra/db/marshal/PartitionerDefinedOrder.java @@ -22,11 +22,15 @@ import org.apache.cassandra.config.DatabaseDescriptor; import org.apache.cassandra.cql3.Term; +import org.apache.cassandra.db.DecoratedKey; import org.apache.cassandra.db.PartitionPosition; import org.apache.cassandra.serializers.TypeSerializer; import org.apache.cassandra.serializers.MarshalException; import org.apache.cassandra.dht.IPartitioner; import org.apache.cassandra.transport.ProtocolVersion; +import org.apache.cassandra.utils.bytecomparable.ByteComparable; +import org.apache.cassandra.utils.bytecomparable.ByteComparable.Version; +import org.apache.cassandra.utils.bytecomparable.ByteSource; import org.apache.cassandra.utils.FBUtilities; /** for sorting columns representing row keys in the row ordering as determined by a partitioner. @@ -93,6 +97,33 @@ public int compareCustom(VL left, ValueAccessor accessorL, VR right return PartitionPosition.ForKey.get(accessorL.toBuffer(left), partitioner).compareTo(PartitionPosition.ForKey.get(accessorR.toBuffer(right), partitioner)); } + @Override + public ByteSource asComparableBytes(ValueAccessor accessor, V data, Version version) + { + // Partitioners work with ByteBuffers only. + ByteBuffer buf = ByteBufferAccessor.instance.convert(data, accessor); + if (version != Version.LEGACY) + { + // For ByteComparable.Version.OSS42 and above we encode an empty key with a null byte source. This + // way we avoid the need to special-handle a sentinel value when we decode the byte source for such a key + // (e.g. for ByteComparable.Version.Legacy we use the minimum key bound of the partitioner's minimum token as + // a sentinel value, and that results in the need to go twice through the byte source that is being + // decoded). + return buf.hasRemaining() ? 
partitioner.decorateKey(buf).asComparableBytes(version) : null; + } + return PartitionPosition.ForKey.get(buf, partitioner).asComparableBytes(version); + } + + @Override + public V fromComparableBytes(ValueAccessor accessor, ByteSource.Peekable comparableBytes, ByteComparable.Version version) + { + assert version != Version.LEGACY; + if (comparableBytes == null) + return accessor.empty(); + byte[] keyBytes = DecoratedKey.keyFromByteSource(comparableBytes, version, partitioner); + return accessor.valueOf(keyBytes); + } + @Override public void validate(ByteBuffer bytes) throws MarshalException { diff --git a/src/java/org/apache/cassandra/db/marshal/ReversedType.java b/src/java/org/apache/cassandra/db/marshal/ReversedType.java index ceea84a39f0e..eac800aec4a2 100644 --- a/src/java/org/apache/cassandra/db/marshal/ReversedType.java +++ b/src/java/org/apache/cassandra/db/marshal/ReversedType.java @@ -28,6 +28,8 @@ import org.apache.cassandra.serializers.MarshalException; import org.apache.cassandra.serializers.TypeSerializer; import org.apache.cassandra.transport.ProtocolVersion; +import org.apache.cassandra.utils.bytecomparable.ByteComparable; +import org.apache.cassandra.utils.bytecomparable.ByteSource; public class ReversedType extends AbstractType { @@ -63,6 +65,32 @@ public boolean isEmptyValueMeaningless() return baseType.isEmptyValueMeaningless(); } + @Override + public ByteSource asComparableBytes(ValueAccessor accessor, V data, ByteComparable.Version version) + { + ByteSource src = baseType.asComparableBytes(accessor, data, version); + if (src == null) // Note: this will only compare correctly if used within a sequence + return null; + // Invert all bytes. + // The comparison requirements for the original type ensure that this encoding will compare correctly with + // respect to the reversed comparator function (and, specifically, prefixes of escaped byte-ordered types will + // compare as larger). Additionally, the weak prefix-freedom requirement ensures this encoding will also be + // weakly prefix-free. + return () -> + { + int v = src.next(); + if (v == ByteSource.END_OF_STREAM) + return v; + return v ^ 0xFF; + }; + } + + @Override + public V fromComparableBytes(ValueAccessor accessor, ByteSource.Peekable comparableBytes, ByteComparable.Version version) + { + return baseType.fromComparableBytes(accessor, ReversedPeekableByteSource.of(comparableBytes), version); + } + public int compareCustom(VL left, ValueAccessor accessorL, VR right, ValueAccessor accessorR) { return baseType.compare(right, accessorR, left, accessorL); @@ -156,4 +184,38 @@ public String toString() { return getClass().getName() + "(" + baseType + ")"; } + + private static final class ReversedPeekableByteSource extends ByteSource.Peekable + { + private final ByteSource.Peekable original; + + static ByteSource.Peekable of(ByteSource.Peekable original) + { + return original != null ? 
new ReversedPeekableByteSource(original) : null; + } + + private ReversedPeekableByteSource(ByteSource.Peekable original) + { + super(null); + this.original = original; + } + + @Override + public int next() + { + int v = original.next(); + if (v != END_OF_STREAM) + return v ^ 0xFF; + return END_OF_STREAM; + } + + @Override + public int peek() + { + int v = original.peek(); + if (v != END_OF_STREAM) + return v ^ 0xFF; + return END_OF_STREAM; + } + } } diff --git a/src/java/org/apache/cassandra/db/marshal/SetType.java b/src/java/org/apache/cassandra/db/marshal/SetType.java index e5bdadab25f8..67699ac3daf5 100644 --- a/src/java/org/apache/cassandra/db/marshal/SetType.java +++ b/src/java/org/apache/cassandra/db/marshal/SetType.java @@ -30,6 +30,8 @@ import org.apache.cassandra.serializers.MarshalException; import org.apache.cassandra.serializers.SetSerializer; import org.apache.cassandra.transport.ProtocolVersion; +import org.apache.cassandra.utils.bytecomparable.ByteComparable; +import org.apache.cassandra.utils.bytecomparable.ByteSource; public class SetType extends CollectionType> { @@ -154,7 +156,19 @@ public boolean isValueCompatibleWithFrozen(CollectionType previous) public int compareCustom(VL left, ValueAccessor accessorL, VR right, ValueAccessor accessorR) { - return ListType.compareListOrSet(elements, left, accessorL, right, accessorR); + return compareListOrSet(elements, left, accessorL, right, accessorR); + } + + @Override + public ByteSource asComparableBytes(ValueAccessor accessor, V data, ByteComparable.Version version) + { + return asComparableBytesListOrSet(getElementsType(), accessor, data, version); + } + + @Override + public V fromComparableBytes(ValueAccessor accessor, ByteSource.Peekable comparableBytes, ByteComparable.Version version) + { + return fromComparableBytesListOrSet(accessor, comparableBytes, version, getElementsType()); } public SetSerializer getSerializer() @@ -210,6 +224,6 @@ public Term fromJSONObject(Object parsed) throws MarshalException @Override public String toJSONString(ByteBuffer buffer, ProtocolVersion protocolVersion) { - return ListType.setOrListToJsonString(buffer, elements, protocolVersion); + return setOrListToJsonString(buffer, elements, protocolVersion); } } diff --git a/src/java/org/apache/cassandra/db/marshal/ShortType.java b/src/java/org/apache/cassandra/db/marshal/ShortType.java index 03dcf5d31446..013fa959497a 100644 --- a/src/java/org/apache/cassandra/db/marshal/ShortType.java +++ b/src/java/org/apache/cassandra/db/marshal/ShortType.java @@ -28,6 +28,9 @@ import org.apache.cassandra.serializers.TypeSerializer; import org.apache.cassandra.transport.ProtocolVersion; import org.apache.cassandra.utils.ByteBufferUtil; +import org.apache.cassandra.utils.bytecomparable.ByteComparable; +import org.apache.cassandra.utils.bytecomparable.ByteSource; +import org.apache.cassandra.utils.bytecomparable.ByteSourceInverse; public class ShortType extends NumberType { @@ -46,6 +49,19 @@ public int compareCustom(VL left, ValueAccessor accessorL, VR right return ValueAccessor.compare(left, accessorL, right, accessorR); } + @Override + public ByteSource asComparableBytes(ValueAccessor accessor, V data, ByteComparable.Version version) + { + // This type does not allow non-present values, but we do just to avoid future complexity. 
+ return ByteSource.optionalSignedFixedLengthNumber(accessor, data); + } + + @Override + public V fromComparableBytes(ValueAccessor accessor, ByteSource.Peekable comparableBytes, ByteComparable.Version version) + { + return ByteSourceInverse.getOptionalSignedFixedLength(accessor, comparableBytes, 2); + } + public ByteBuffer fromString(String source) throws MarshalException { // Return an empty ByteBuffer for an empty string. diff --git a/src/java/org/apache/cassandra/db/marshal/SimpleDateType.java b/src/java/org/apache/cassandra/db/marshal/SimpleDateType.java index 8f1d677f035b..a0de2c20892e 100644 --- a/src/java/org/apache/cassandra/db/marshal/SimpleDateType.java +++ b/src/java/org/apache/cassandra/db/marshal/SimpleDateType.java @@ -28,6 +28,10 @@ import org.apache.cassandra.serializers.TypeSerializer; import org.apache.cassandra.transport.ProtocolVersion; import org.apache.cassandra.utils.ByteBufferUtil; +import org.apache.cassandra.utils.bytecomparable.ByteComparable; +import org.apache.cassandra.utils.bytecomparable.ByteComparable.Version; +import org.apache.cassandra.utils.bytecomparable.ByteSource; +import org.apache.cassandra.utils.bytecomparable.ByteSourceInverse; import static org.apache.cassandra.cql3.statements.RequestValidations.invalidRequest; @@ -37,6 +41,20 @@ public class SimpleDateType extends TemporalType SimpleDateType() {super(ComparisonType.BYTE_ORDER);} // singleton + @Override + public ByteSource asComparableBytes(ValueAccessor accessor, V data, Version version) + { + // While BYTE_ORDER would still work for this type, making use of the fixed length is more efficient. + // This type does not allow non-present values, but we do just to avoid future complexity. + return ByteSource.optionalFixedLength(accessor, data); + } + + @Override + public V fromComparableBytes(ValueAccessor accessor, ByteSource.Peekable comparableBytes, ByteComparable.Version version) + { + return ByteSourceInverse.getOptionalFixedLength(accessor, comparableBytes, 4); + } + public ByteBuffer fromString(String source) throws MarshalException { return ByteBufferUtil.bytes(SimpleDateSerializer.dateStringToDays(source)); diff --git a/src/java/org/apache/cassandra/db/marshal/TimeType.java b/src/java/org/apache/cassandra/db/marshal/TimeType.java index fd8fac47454e..f029b8bb94a8 100644 --- a/src/java/org/apache/cassandra/db/marshal/TimeType.java +++ b/src/java/org/apache/cassandra/db/marshal/TimeType.java @@ -28,6 +28,10 @@ import org.apache.cassandra.serializers.TypeSerializer; import org.apache.cassandra.serializers.MarshalException; import org.apache.cassandra.transport.ProtocolVersion; +import org.apache.cassandra.utils.bytecomparable.ByteComparable; +import org.apache.cassandra.utils.bytecomparable.ByteComparable.Version; +import org.apache.cassandra.utils.bytecomparable.ByteSource; +import org.apache.cassandra.utils.bytecomparable.ByteSourceInverse; /** * Nanosecond resolution time values @@ -42,6 +46,20 @@ public ByteBuffer fromString(String source) throws MarshalException return decompose(TimeSerializer.timeStringToLong(source)); } + @Override + public ByteSource asComparableBytes(ValueAccessor accessor, V data, Version version) + { + // While BYTE_ORDER would still work for this type, making use of the fixed length is more efficient. + // This type does not allow non-present values, but we do just to avoid future complexity. 
+ return ByteSource.optionalFixedLength(accessor, data); + } + + @Override + public V fromComparableBytes(ValueAccessor accessor, ByteSource.Peekable comparableBytes, ByteComparable.Version version) + { + return ByteSourceInverse.getOptionalFixedLength(accessor, comparableBytes, 8); + } + @Override public boolean isValueCompatibleWithInternal(AbstractType otherType) { diff --git a/src/java/org/apache/cassandra/db/marshal/TimestampType.java b/src/java/org/apache/cassandra/db/marshal/TimestampType.java index ccf1da3e20be..5bca7b1f56db 100644 --- a/src/java/org/apache/cassandra/db/marshal/TimestampType.java +++ b/src/java/org/apache/cassandra/db/marshal/TimestampType.java @@ -32,6 +32,9 @@ import org.apache.cassandra.serializers.TimestampSerializer; import org.apache.cassandra.transport.ProtocolVersion; import org.apache.cassandra.utils.ByteBufferUtil; +import org.apache.cassandra.utils.bytecomparable.ByteComparable; +import org.apache.cassandra.utils.bytecomparable.ByteSource; +import org.apache.cassandra.utils.bytecomparable.ByteSourceInverse; import static org.apache.cassandra.cql3.statements.RequestValidations.invalidRequest; @@ -60,6 +63,18 @@ public int compareCustom(VL left, ValueAccessor accessorL, VR right return LongType.compareLongs(left, accessorL, right, accessorR); } + @Override + public ByteSource asComparableBytes(ValueAccessor accessor, V data, ByteComparable.Version version) + { + return ByteSource.optionalSignedFixedLengthNumber(accessor, data); + } + + @Override + public V fromComparableBytes(ValueAccessor accessor, ByteSource.Peekable comparableBytes, ByteComparable.Version version) + { + return ByteSourceInverse.getOptionalSignedFixedLength(accessor, comparableBytes, 8); + } + public ByteBuffer fromString(String source) throws MarshalException { // Return an empty ByteBuffer for an empty string. 
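Several of the types above use the "optional" encoding variants, which map an absent (zero-length) value to a null byte source rather than to any byte sequence, so that empty values stay distinguishable from present ones on the way back. A minimal sketch of the pattern (not the actual ByteSource API; all names here are invented):

import java.nio.ByteBuffer;

public class OptionalFixedLengthDemo
{
    // Encode: an absent (zero-length) value becomes null, a present one its raw bytes.
    static byte[] encodeOptional(ByteBuffer value)
    {
        if (value.remaining() == 0)
            return null;
        byte[] bytes = new byte[value.remaining()];
        value.duplicate().get(bytes);
        return bytes;
    }

    // Decode: null means the value was absent; otherwise the bytes are the value.
    static ByteBuffer decodeOptional(byte[] encoded)
    {
        return encoded == null ? ByteBuffer.allocate(0) : ByteBuffer.wrap(encoded);
    }

    public static void main(String[] args)
    {
        ByteBuffer present = ByteBuffer.allocate(8).putLong(0, 42L);
        if (decodeOptional(encodeOptional(ByteBuffer.allocate(0))).remaining() != 0)
            throw new AssertionError("empty value did not round-trip");
        if (decodeOptional(encodeOptional(present)).getLong(0) != 42L)
            throw new AssertionError("present value did not round-trip");
        System.out.println("optional round-trip ok");
    }
}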
diff --git a/src/java/org/apache/cassandra/db/marshal/TupleType.java b/src/java/org/apache/cassandra/db/marshal/TupleType.java index cc0848765867..c203770bc7b7 100644 --- a/src/java/org/apache/cassandra/db/marshal/TupleType.java +++ b/src/java/org/apache/cassandra/db/marshal/TupleType.java @@ -30,11 +30,12 @@ import org.apache.cassandra.cql3.*; import org.apache.cassandra.db.TypeSizes; import org.apache.cassandra.exceptions.ConfigurationException; -import org.apache.cassandra.exceptions.InvalidRequestException; import org.apache.cassandra.exceptions.SyntaxException; import org.apache.cassandra.serializers.*; import org.apache.cassandra.transport.ProtocolVersion; -import org.apache.cassandra.utils.ByteBufferUtil; +import org.apache.cassandra.utils.bytecomparable.ByteComparable; +import org.apache.cassandra.utils.bytecomparable.ByteSource; +import org.apache.cassandra.utils.bytecomparable.ByteSourceInverse; import static com.google.common.collect.Iterables.any; import static com.google.common.collect.Iterables.transform; @@ -200,47 +201,136 @@ private boolean allRemainingComponentsAreNull(T v, ValueAccessor accessor return true; } + @Override + public ByteSource asComparableBytes(ValueAccessor accessor, V data, ByteComparable.Version version) + { + switch (version) + { + case LEGACY: + return asComparableBytesLegacy(accessor, data); + case OSS42: + return asComparableBytesNew(accessor, data, version); + default: + throw new AssertionError(); + } + } + + private ByteSource asComparableBytesLegacy(ValueAccessor accessor, V data) + { + if (accessor.isEmpty(data)) + return null; + + V[] bufs = split(accessor, data); // this may be shorter than types.size -- other srcs remain null in that case + ByteSource[] srcs = new ByteSource[types.size()]; + for (int i = 0; i < bufs.length; ++i) + srcs[i] = bufs[i] != null ? types.get(i).asComparableBytes(accessor, bufs[i], ByteComparable.Version.LEGACY) : null; + + // We always have a fixed number of sources, with the trailing ones possibly being nulls. + // This can only result in a prefix if the last type in the tuple allows prefixes. Since that type is required + // to be weakly prefix-free, so is the tuple. + return ByteSource.withTerminatorLegacy(ByteSource.END_OF_STREAM, srcs); + } + + private ByteSource asComparableBytesNew(ValueAccessor accessor, V data, ByteComparable.Version version) + { + if (accessor.isEmpty(data)) + return null; + + V[] bufs = split(accessor, data); + int lengthWithoutTrailingNulls = 0; + for (int i = 0; i < bufs.length; ++i) + if (bufs[i] != null) + lengthWithoutTrailingNulls = i + 1; + + ByteSource[] srcs = new ByteSource[lengthWithoutTrailingNulls]; + for (int i = 0; i < lengthWithoutTrailingNulls; ++i) + srcs[i] = bufs[i] != null ? types.get(i).asComparableBytes(accessor, bufs[i], version) : null; + + // Because we stop early when there are trailing nulls, there needs to be an explicit terminator to make the + // type prefix-free. + return ByteSource.withTerminator(ByteSource.TERMINATOR, srcs); + } + + @Override + public V fromComparableBytes(ValueAccessor accessor, ByteSource.Peekable comparableBytes, ByteComparable.Version version) + { + assert version == ByteComparable.Version.OSS42; // Reverse translation is not supported for the legacy version. 
+ if (comparableBytes == null) + return accessor.empty(); + + V[] componentBuffers = accessor.createArray(types.size()); + for (int i = 0; i < types.size(); ++i) + { + if (comparableBytes.peek() == ByteSource.TERMINATOR) + break; // the rest of the fields remain null + AbstractType componentType = types.get(i); + ByteSource.Peekable component = ByteSourceInverse.nextComponentSource(comparableBytes); + if (component != null) + componentBuffers[i] = componentType.fromComparableBytes(accessor, component, version); + else + componentBuffers[i] = null; + } + // consume terminator + int terminator = comparableBytes.next(); + assert terminator == ByteSource.TERMINATOR : String.format("Expected TERMINATOR (0x%2x) after %d components", + ByteSource.TERMINATOR, + types.size()); + return buildValue(accessor, componentBuffers); + } + /** * Split a tuple value into its component values. */ - public ByteBuffer[] split(ByteBuffer value) + public V[] split(ValueAccessor accessor, V value) { - return split(value, size(), this); + return split(accessor, value, size(), this); } /** * Split a tuple value into its component values. */ - public static ByteBuffer[] split(ByteBuffer value, int numberOfElements, TupleType type) + public static V[] split(ValueAccessor accessor, V value, int numberOfElements, TupleType type) { - ByteBuffer[] components = new ByteBuffer[numberOfElements]; - ByteBuffer input = value.duplicate(); + V[] components = accessor.createArray(numberOfElements); + int length = accessor.size(value); + int position = 0; for (int i = 0; i < numberOfElements; i++) { - if (!input.hasRemaining()) + if (position == length) return Arrays.copyOfRange(components, 0, i); - int size = input.getInt(); - - if (input.remaining() < size) + if (position + 4 > length) throw new MarshalException(String.format("Not enough bytes to read %dth component", i)); + int size = accessor.getInt(value, position); + position += 4; + // size < 0 means null value - components[i] = size < 0 ? null : ByteBufferUtil.readBytes(input, size); + if (size >= 0) + { + if (position + size > length) + throw new MarshalException(String.format("Not enough bytes to read %dth component", i)); + + components[i] = accessor.slice(value, position, size); + position += size; + } + else + components[i] = null; } // error out if we got more values in the tuple/UDT than we expected - if (input.hasRemaining()) + if (position < length) { - throw new InvalidRequestException(String.format( - "Expected %s %s for %s column, but got more", - numberOfElements, numberOfElements == 1 ? "value" : "values", type.asCQL3Type())); + throw new MarshalException(String.format("Expected %s %s for %s column, but got more", + numberOfElements, numberOfElements == 1 ? "value" : "values", + type.asCQL3Type())); } return components; } - public static V buildValue(ValueAccessor accessor, V[] components) + @SafeVarargs + public static V buildValue(ValueAccessor accessor, V... components) { int totalLength = 0; for (V component : components) @@ -264,7 +354,7 @@ public static V buildValue(ValueAccessor accessor, V[] components) return result; } - public static ByteBuffer buildValue(ByteBuffer[] components) + public static ByteBuffer buildValue(ByteBuffer... 
components) { return buildValue(ByteBufferAccessor.instance, components); } diff --git a/src/java/org/apache/cassandra/db/marshal/UUIDType.java b/src/java/org/apache/cassandra/db/marshal/UUIDType.java index 55ce59dae798..9ec8063fae62 100644 --- a/src/java/org/apache/cassandra/db/marshal/UUIDType.java +++ b/src/java/org/apache/cassandra/db/marshal/UUIDType.java @@ -30,6 +30,9 @@ import org.apache.cassandra.serializers.MarshalException; import org.apache.cassandra.serializers.UUIDSerializer; import org.apache.cassandra.utils.ByteBufferUtil; +import org.apache.cassandra.utils.bytecomparable.ByteComparable; +import org.apache.cassandra.utils.bytecomparable.ByteSource; +import org.apache.cassandra.utils.bytecomparable.ByteSourceInverse; import org.apache.cassandra.utils.UUIDGen; /** @@ -96,9 +99,71 @@ public int compareCustom(VL left, ValueAccessor accessorL, VR right return c; } + // Amusingly (or not so much), although UUIDType freely takes time UUIDs (UUIDs with version 1), it compares + // them differently than TimeUUIDType. This is evident in the least significant bytes comparison (the code + // below for UUIDType), where UUIDType treats them as unsigned bytes, while TimeUUIDType compares the bytes + // signed. See CASSANDRA-8730 for details around this discrepancy. return UnsignedLongs.compare(accessorL.getLong(left, 8), accessorR.getLong(right, 8)); } + @Override + public ByteSource asComparableBytes(ValueAccessor accessor, V data, ByteComparable.Version v) + { + if (accessor.isEmpty(data)) + return null; + + long msb = accessor.getLong(data, 0); + long version = ((msb >>> 12) & 0xf); + ByteBuffer swizzled = ByteBuffer.allocate(16); + + if (version == 1) + swizzled.putLong(0, TimeUUIDType.reorderTimestampBytes(msb)); + else + swizzled.putLong(0, (version << 60) | ((msb >>> 4) & 0x0FFFFFFFFFFFF000L) | (msb & 0xFFFL)); + + swizzled.putLong(8, accessor.getLong(data, 8)); + + // fixed-length thus prefix-free + return ByteSource.fixedLength(swizzled); + } + + @Override + public V fromComparableBytes(ValueAccessor accessor, ByteSource.Peekable comparableBytes, ByteComparable.Version version) + { + // Optional-style encoding of empty values as null sources + if (comparableBytes == null) + return accessor.empty(); + + // The UUID bits are stored as an unsigned fixed-length 128-bit integer. + long hiBits = ByteSourceInverse.getUnsignedFixedLengthAsLong(comparableBytes, 8); + long loBits = ByteSourceInverse.getUnsignedFixedLengthAsLong(comparableBytes, 8); + + long uuidVersion = hiBits >>> 60 & 0xF; + if (uuidVersion == 1) + { + // If the version bits are set to 1, this is a time-based UUID, and its high bits are significantly more + // shuffled than in other UUIDs. Revert the shuffle. + hiBits = TimeUUIDType.reorderBackTimestampBytes(hiBits); + } + else + { + // For non-time UUIDs, the only thing that's needed is to put the version bits back where they were originally. 
+ hiBits = hiBits << 4 & 0xFFFFFFFFFFFF0000L + | uuidVersion << 12 + | hiBits & 0x0000000000000FFFL; + } + + return makeUuidBytes(accessor, hiBits, loBits); + } + + static V makeUuidBytes(ValueAccessor accessor, long high, long low) + { + V buffer = accessor.allocate(16); + accessor.putLong(buffer, 0, high); + accessor.putLong(buffer, 8, low); + return buffer; + } + @Override public boolean isValueCompatibleWithInternal(AbstractType otherType) { diff --git a/src/java/org/apache/cassandra/db/marshal/UserType.java b/src/java/org/apache/cassandra/db/marshal/UserType.java index 29afad9583d4..24c05e255967 100644 --- a/src/java/org/apache/cassandra/db/marshal/UserType.java +++ b/src/java/org/apache/cassandra/db/marshal/UserType.java @@ -258,7 +258,7 @@ public Term fromJSONObject(Object parsed) throws MarshalException @Override public String toJSONString(ByteBuffer buffer, ProtocolVersion protocolVersion) { - ByteBuffer[] buffers = split(buffer); + ByteBuffer[] buffers = split(ByteBufferAccessor.instance, buffer); StringBuilder sb = new StringBuilder("{"); for (int i = 0; i < types.size(); i++) { diff --git a/src/java/org/apache/cassandra/db/marshal/ValueAccessor.java b/src/java/org/apache/cassandra/db/marshal/ValueAccessor.java index a51836e65aaa..d454c5e1883f 100644 --- a/src/java/org/apache/cassandra/db/marshal/ValueAccessor.java +++ b/src/java/org/apache/cassandra/db/marshal/ValueAccessor.java @@ -68,6 +68,7 @@ public interface ObjectFactory Cell cell(ColumnMetadata column, long timestamp, int ttl, int localDeletionTime, V value, CellPath path); Clustering clustering(V... values); Clustering clustering(); + Clustering staticClustering(); ClusteringBound bound(ClusteringPrefix.Kind kind, V... values); ClusteringBound bound(ClusteringPrefix.Kind kind); ClusteringBoundary boundary(ClusteringPrefix.Kind kind, V... values); @@ -105,7 +106,6 @@ default ClusteringBoundary exclusiveCloseInclusiveOpen(boolean reversed, V[] { return boundary(reversed ? 
INCL_END_EXCL_START_BOUNDARY : EXCL_END_INCL_START_BOUNDARY, boundValues); } - } /** * @return the size of the given value @@ -330,6 +330,12 @@ default boolean getBoolean(V value, int offset) /** returns a TimeUUID from offset 0 */ Ballot toBallot(V value); + /** + * writes the byte value {@param value} to {@param dst} at offset {@param offset} + * @return the number of bytes written to {@param value} + */ + int putByte(V dst, int offset, byte value); + /** * writes the short value {@param value} to {@param dst} at offset {@param offset} * @return the number of bytes written to {@param value} diff --git a/src/java/org/apache/cassandra/db/rows/EncodingStats.java b/src/java/org/apache/cassandra/db/rows/EncodingStats.java index 37dd34e92185..518285d6715c 100644 --- a/src/java/org/apache/cassandra/db/rows/EncodingStats.java +++ b/src/java/org/apache/cassandra/db/rows/EncodingStats.java @@ -67,7 +67,7 @@ public class EncodingStats implements IMeasurableMemory // We should use this sparingly obviously public static final EncodingStats NO_STATS = new EncodingStats(TIMESTAMP_EPOCH, DELETION_TIME_EPOCH, TTL_EPOCH); - public static long HEAP_SIZE = ObjectSizes.measure(NO_STATS); + public static final long HEAP_SIZE = ObjectSizes.measure(NO_STATS); public static final Serializer serializer = new Serializer(); diff --git a/src/java/org/apache/cassandra/dht/ByteOrderedPartitioner.java b/src/java/org/apache/cassandra/dht/ByteOrderedPartitioner.java index 3a5db52bd776..2b0e2a286147 100644 --- a/src/java/org/apache/cassandra/dht/ByteOrderedPartitioner.java +++ b/src/java/org/apache/cassandra/dht/ByteOrderedPartitioner.java @@ -26,6 +26,9 @@ import org.apache.cassandra.exceptions.ConfigurationException; import org.apache.cassandra.service.StorageService; import org.apache.cassandra.utils.ByteBufferUtil; +import org.apache.cassandra.utils.bytecomparable.ByteComparable; +import org.apache.cassandra.utils.bytecomparable.ByteSource; +import org.apache.cassandra.utils.bytecomparable.ByteSourceInverse; import org.apache.cassandra.utils.FBUtilities; import org.apache.cassandra.utils.Hex; import org.apache.cassandra.utils.ObjectSizes; @@ -101,6 +104,12 @@ public boolean equals(Object obj) return Arrays.equals(token, other.token); } + @Override + public ByteSource asComparableBytes(ByteComparable.Version version) + { + return ByteSource.of(token, version); + } + @Override public IPartitioner getPartitioner() { @@ -222,6 +231,11 @@ public BytesToken getRandomToken(Random random) private final Token.TokenFactory tokenFactory = new Token.TokenFactory() { + public Token fromComparableBytes(ByteSource.Peekable comparableBytes, ByteComparable.Version version) + { + return new BytesToken(ByteSourceInverse.getUnescapedBytes(comparableBytes)); + } + public ByteBuffer toByteArray(Token token) { BytesToken bytesToken = (BytesToken) token; diff --git a/src/java/org/apache/cassandra/dht/LocalPartitioner.java b/src/java/org/apache/cassandra/dht/LocalPartitioner.java index c7c6df0d73b5..df976701aa90 100644 --- a/src/java/org/apache/cassandra/dht/LocalPartitioner.java +++ b/src/java/org/apache/cassandra/dht/LocalPartitioner.java @@ -26,7 +26,10 @@ import org.apache.cassandra.db.DecoratedKey; import org.apache.cassandra.db.CachedHashDecoratedKey; import org.apache.cassandra.db.marshal.AbstractType; +import org.apache.cassandra.db.marshal.ByteBufferAccessor; import org.apache.cassandra.utils.ByteBufferUtil; +import org.apache.cassandra.utils.bytecomparable.ByteComparable; +import org.apache.cassandra.utils.bytecomparable.ByteSource; 
import org.apache.cassandra.utils.ObjectSizes; import org.apache.cassandra.utils.memory.HeapAllocator; @@ -83,6 +86,12 @@ public Token.TokenFactory getTokenFactory() private final Token.TokenFactory tokenFactory = new Token.TokenFactory() { + public Token fromComparableBytes(ByteSource.Peekable comparableBytes, ByteComparable.Version version) + { + ByteBuffer tokenData = comparator.fromComparableBytes(ByteBufferAccessor.instance, comparableBytes, version); + return new LocalToken(tokenData); + } + public ByteBuffer toByteArray(Token token) { return ((LocalToken)token).token; @@ -174,6 +183,12 @@ public boolean equals(Object obj) return token.equals(other.token); } + @Override + public ByteSource asComparableBytes(ByteComparable.Version version) + { + return comparator.asComparableBytes(ByteBufferAccessor.instance, token, version); + } + @Override public IPartitioner getPartitioner() { diff --git a/src/java/org/apache/cassandra/dht/Murmur3Partitioner.java b/src/java/org/apache/cassandra/dht/Murmur3Partitioner.java index e2daac412cca..015610fb5346 100644 --- a/src/java/org/apache/cassandra/dht/Murmur3Partitioner.java +++ b/src/java/org/apache/cassandra/dht/Murmur3Partitioner.java @@ -33,6 +33,9 @@ import org.apache.cassandra.exceptions.ConfigurationException; import org.apache.cassandra.io.util.DataOutputPlus; import org.apache.cassandra.utils.ByteBufferUtil; +import org.apache.cassandra.utils.bytecomparable.ByteComparable; +import org.apache.cassandra.utils.bytecomparable.ByteSource; +import org.apache.cassandra.utils.bytecomparable.ByteSourceInverse; import org.apache.cassandra.utils.MurmurHash; import org.apache.cassandra.utils.ObjectSizes; @@ -176,6 +179,12 @@ public int compareTo(Token o) return Long.compare(token, ((LongToken) o).token); } + @Override + public ByteSource asComparableBytes(ByteComparable.Version version) + { + return ByteSource.of(token); + } + @Override public IPartitioner getPartitioner() { @@ -326,6 +335,12 @@ public Token.TokenFactory getTokenFactory() private final Token.TokenFactory tokenFactory = new Token.TokenFactory() { + public Token fromComparableBytes(ByteSource.Peekable comparableBytes, ByteComparable.Version version) + { + long tokenData = ByteSourceInverse.getSignedLong(comparableBytes); + return new LongToken(tokenData); + } + public ByteBuffer toByteArray(Token token) { LongToken longToken = (LongToken) token; diff --git a/src/java/org/apache/cassandra/dht/OrderPreservingPartitioner.java b/src/java/org/apache/cassandra/dht/OrderPreservingPartitioner.java index 16c5db17a448..2d4def95d0f0 100644 --- a/src/java/org/apache/cassandra/dht/OrderPreservingPartitioner.java +++ b/src/java/org/apache/cassandra/dht/OrderPreservingPartitioner.java @@ -33,6 +33,9 @@ import org.apache.cassandra.schema.TableMetadata; import org.apache.cassandra.service.StorageService; import org.apache.cassandra.utils.ByteBufferUtil; +import org.apache.cassandra.utils.bytecomparable.ByteComparable; +import org.apache.cassandra.utils.bytecomparable.ByteSource; +import org.apache.cassandra.utils.bytecomparable.ByteSourceInverse; import org.apache.cassandra.utils.FBUtilities; import org.apache.cassandra.utils.ObjectSizes; import org.apache.cassandra.utils.Pair; @@ -128,6 +131,11 @@ public StringToken getRandomToken(Random random) private final Token.TokenFactory tokenFactory = new Token.TokenFactory() { + public Token fromComparableBytes(ByteSource.Peekable comparableBytes, ByteComparable.Version version) + { + return new StringToken(ByteSourceInverse.getString(comparableBytes)); + } + 
public ByteBuffer toByteArray(Token token) { StringToken stringToken = (StringToken) token; @@ -194,6 +202,12 @@ public long getHeapSize() { return EMPTY_SIZE + ObjectSizes.sizeOf(token); } + + @Override + public ByteSource asComparableBytes(ByteComparable.Version version) + { + return ByteSource.of(token, version); + } } public StringToken getToken(ByteBuffer key) diff --git a/src/java/org/apache/cassandra/dht/RandomPartitioner.java b/src/java/org/apache/cassandra/dht/RandomPartitioner.java index 241b7850fdf7..d02cfd58adfd 100644 --- a/src/java/org/apache/cassandra/dht/RandomPartitioner.java +++ b/src/java/org/apache/cassandra/dht/RandomPartitioner.java @@ -27,6 +27,8 @@ import com.google.common.annotations.VisibleForTesting; import org.apache.cassandra.db.CachedHashDecoratedKey; +import org.apache.cassandra.db.marshal.ByteArrayAccessor; +import org.apache.cassandra.db.marshal.ByteBufferAccessor; import org.apache.cassandra.exceptions.ConfigurationException; import org.apache.cassandra.db.DecoratedKey; import org.apache.cassandra.db.marshal.AbstractType; @@ -34,6 +36,8 @@ import org.apache.cassandra.db.marshal.PartitionerDefinedOrder; import org.apache.cassandra.io.util.DataOutputPlus; import org.apache.cassandra.utils.ByteBufferUtil; +import org.apache.cassandra.utils.bytecomparable.ByteComparable; +import org.apache.cassandra.utils.bytecomparable.ByteSource; import org.apache.cassandra.utils.FBUtilities; import org.apache.cassandra.utils.GuidGenerator; import org.apache.cassandra.utils.ObjectSizes; @@ -158,6 +162,11 @@ private boolean isValidToken(BigInteger token) { private final Token.TokenFactory tokenFactory = new Token.TokenFactory() { + public Token fromComparableBytes(ByteSource.Peekable comparableBytes, ByteComparable.Version version) + { + return fromByteArray(IntegerType.instance.fromComparableBytes(ByteBufferAccessor.instance, comparableBytes, version)); + } + public ByteBuffer toByteArray(Token token) { BigIntegerToken bigIntegerToken = (BigIntegerToken) token; @@ -244,6 +253,12 @@ public BigIntegerToken(String token) this(new BigInteger(token)); } + @Override + public ByteSource asComparableBytes(ByteComparable.Version version) + { + return IntegerType.instance.asComparableBytes(ByteArrayAccessor.instance, token.toByteArray(), version); + } + @Override public IPartitioner getPartitioner() { diff --git a/src/java/org/apache/cassandra/dht/Token.java b/src/java/org/apache/cassandra/dht/Token.java index d8e82f82c510..3543dabc0e3a 100644 --- a/src/java/org/apache/cassandra/dht/Token.java +++ b/src/java/org/apache/cassandra/dht/Token.java @@ -26,6 +26,8 @@ import org.apache.cassandra.db.TypeSizes; import org.apache.cassandra.exceptions.ConfigurationException; import org.apache.cassandra.io.util.DataOutputPlus; +import org.apache.cassandra.utils.bytecomparable.ByteComparable; +import org.apache.cassandra.utils.bytecomparable.ByteSource; public abstract class Token implements RingPosition, Serializable { @@ -37,8 +39,31 @@ public static abstract class TokenFactory { public abstract ByteBuffer toByteArray(Token token); public abstract Token fromByteArray(ByteBuffer bytes); + + /** + * Produce a byte-comparable representation of the token. + * See {@link Token#asComparableBytes} + */ + public ByteSource asComparableBytes(Token token, ByteComparable.Version version) + { + return token.asComparableBytes(version); + } + + /** + * Translates the given byte-comparable representation to a token instance. 
If the given bytes don't correspond + * to the encoding of an instance of the expected token type, an {@link IllegalArgumentException} may be thrown. + * + * @param comparableBytes A byte-comparable representation (presumably of a token of some expected token type). + * @return A new {@link Token} instance, corresponding to the given byte-ordered representation. If we were + * to call {@link #asComparableBytes(ByteComparable.Version)} on the returned object, we should get a + * {@link ByteSource} equal to the input one as a result. + * @throws IllegalArgumentException if the bytes do not encode a valid token. + */ + public abstract Token fromComparableBytes(ByteSource.Peekable comparableBytes, ByteComparable.Version version); + public abstract String toString(Token token); // serialize as string, not necessarily human-readable public abstract Token fromString(String string); // deserialize + public abstract void validate(String token) throws ConfigurationException; public void serialize(Token token, DataOutputPlus out) throws IOException @@ -99,6 +124,20 @@ public long serializedSize(Token object, int version) abstract public long getHeapSize(); abstract public Object getTokenValue(); + /** + * Produce a weakly prefix-free byte-comparable representation of the token, i.e. such a sequence of bytes that any + * pair x, y of valid tokens of this type and any bytes b1, b2 between 0x10 and 0xEF, + * (+ stands for concatenation) + * compare(x, y) == compareLexicographicallyUnsigned(asByteComparable(x)+b1, asByteComparable(y)+b2) + * (i.e. the values compare like the original type, and an added 0x10-0xEF byte at the end does not change that) and: + * asByteComparable(x)+b1 is not a prefix of asByteComparable(y) (weakly prefix free) + * (i.e. a valid representation of a value may be a prefix of another valid representation of a value only if the + * following byte in the latter is smaller than 0x10 or larger than 0xEF). These properties are trivially true if + * the encoding compares correctly and is prefix free, but also permits a little more freedom that enables somewhat + * more efficient encoding of arbitrary-length byte-comparable blobs. + */ + abstract public ByteSource asComparableBytes(ByteComparable.Version version); + /** * Returns a measure for the token space covered between this token and next. * Used by the token allocation algorithm (see CASSANDRA-7032). @@ -128,7 +167,7 @@ public boolean isMinimum() /* * A token corresponds to the range of all the keys having this token. - * A token is thus no comparable directly to a key. But to be able to select + * A token is thus not comparable directly to a key. But to be able to select * keys given tokens, we introduce two "fake" keys for each token T: * - lowerBoundKey: a "fake" key representing the lower bound T represents. * In other words, lowerBoundKey is the smallest key that @@ -190,6 +229,20 @@ public int compareTo(PartitionPosition pos) return ((pos instanceof KeyBound) && !((KeyBound)pos).isMinimumBound) ? 0 : 1; } + @Override + public ByteSource asComparableBytes(Version version) + { + int terminator = isMinimumBound ? 
ByteSource.LT_NEXT_COMPONENT : ByteSource.GT_NEXT_COMPONENT; + return ByteSource.withTerminator(terminator, token.asComparableBytes(version)); + } + + @Override + public ByteComparable asComparableBound(boolean before) + { + // This class is already a bound thus nothing needs to be changed from its representation + return this; + } + public IPartitioner getPartitioner() { return getToken().getPartitioner(); diff --git a/src/java/org/apache/cassandra/serializers/BooleanSerializer.java b/src/java/org/apache/cassandra/serializers/BooleanSerializer.java index d372a2ad7736..403e6b75b0f4 100644 --- a/src/java/org/apache/cassandra/serializers/BooleanSerializer.java +++ b/src/java/org/apache/cassandra/serializers/BooleanSerializer.java @@ -24,8 +24,8 @@ public class BooleanSerializer extends TypeSerializer { - private static final ByteBuffer TRUE = ByteBuffer.wrap(new byte[] {1}); - private static final ByteBuffer FALSE = ByteBuffer.wrap(new byte[] {0}); + public static final ByteBuffer TRUE = ByteBuffer.wrap(new byte[] {1}); + public static final ByteBuffer FALSE = ByteBuffer.wrap(new byte[] {0}); public static final BooleanSerializer instance = new BooleanSerializer(); diff --git a/src/java/org/apache/cassandra/serializers/CollectionSerializer.java b/src/java/org/apache/cassandra/serializers/CollectionSerializer.java index eb2991b8d78c..204261d46fd7 100644 --- a/src/java/org/apache/cassandra/serializers/CollectionSerializer.java +++ b/src/java/org/apache/cassandra/serializers/CollectionSerializer.java @@ -91,11 +91,6 @@ protected static void writeCollectionSize(ByteBuffer output, int elements, Proto output.putInt(elements); } - public static int readCollectionSize(ByteBuffer input, ProtocolVersion version) - { - return readCollectionSize(input, ByteBufferAccessor.instance, version); - } - public static int readCollectionSize(V value, ValueAccessor accessor, ProtocolVersion version) { return accessor.toInt(value); diff --git a/src/java/org/apache/cassandra/serializers/MapSerializer.java b/src/java/org/apache/cassandra/serializers/MapSerializer.java index 13468fc0aafc..400a8e7cc5fb 100644 --- a/src/java/org/apache/cassandra/serializers/MapSerializer.java +++ b/src/java/org/apache/cassandra/serializers/MapSerializer.java @@ -148,7 +148,7 @@ public ByteBuffer getSerializedValue(ByteBuffer collection, ByteBuffer key, Abst try { ByteBuffer input = collection.duplicate(); - int n = readCollectionSize(input, ProtocolVersion.V3); + int n = readCollectionSize(input, ByteBufferAccessor.instance, ProtocolVersion.V3); int offset = sizeOfCollectionSize(n, ProtocolVersion.V3); for (int i = 0; i < n; i++) { @@ -184,7 +184,7 @@ public ByteBuffer getSliceFromSerialized(ByteBuffer collection, try { ByteBuffer input = collection.duplicate(); - int n = readCollectionSize(input, ProtocolVersion.V3); + int n = readCollectionSize(input, ByteBufferAccessor.instance, ProtocolVersion.V3); input.position(input.position() + sizeOfCollectionSize(n, ProtocolVersion.V3)); int startPos = input.position(); int count = 0; diff --git a/src/java/org/apache/cassandra/serializers/SetSerializer.java b/src/java/org/apache/cassandra/serializers/SetSerializer.java index 6be919ff553f..54b849609682 100644 --- a/src/java/org/apache/cassandra/serializers/SetSerializer.java +++ b/src/java/org/apache/cassandra/serializers/SetSerializer.java @@ -156,7 +156,7 @@ public ByteBuffer getSerializedValue(ByteBuffer input, ByteBuffer key, AbstractT { try { - int n = readCollectionSize(input, ProtocolVersion.V3); + int n = readCollectionSize(input, 
ByteBufferAccessor.instance, ProtocolVersion.V3); int offset = sizeOfCollectionSize(n, ProtocolVersion.V3); for (int i = 0; i < n; i++) @@ -192,7 +192,7 @@ public ByteBuffer getSliceFromSerialized(ByteBuffer collection, try { ByteBuffer input = collection.duplicate(); - int n = readCollectionSize(input, ProtocolVersion.V3); + int n = readCollectionSize(input, ByteBufferAccessor.instance, ProtocolVersion.V3); input.position(input.position() + sizeOfCollectionSize(n, ProtocolVersion.V3)); int startPos = input.position(); int count = 0; diff --git a/src/java/org/apache/cassandra/service/paxos/PaxosRepairHistory.java b/src/java/org/apache/cassandra/service/paxos/PaxosRepairHistory.java index a88e83136e10..5e9fad130885 100644 --- a/src/java/org/apache/cassandra/service/paxos/PaxosRepairHistory.java +++ b/src/java/org/apache/cassandra/service/paxos/PaxosRepairHistory.java @@ -29,6 +29,7 @@ import org.apache.cassandra.config.DatabaseDescriptor; import org.apache.cassandra.db.TypeSizes; +import org.apache.cassandra.db.marshal.ByteBufferAccessor; import org.apache.cassandra.db.marshal.BytesType; import org.apache.cassandra.db.marshal.TimeUUIDType; import org.apache.cassandra.db.marshal.TupleType; @@ -181,7 +182,7 @@ public static PaxosRepairHistory fromTupleBufferList(List tuples) Ballot[] ballotLowBounds = new Ballot[tuples.size()]; for (int i = 0 ; i < tuples.size() ; ++i) { - ByteBuffer[] split = TYPE.split(tuples.get(i)); + ByteBuffer[] split = TYPE.split(ByteBufferAccessor.instance, tuples.get(i)); if (i < tokenInclusiveUpperBounds.length) tokenInclusiveUpperBounds[i] = TOKEN_FACTORY.fromByteArray(split[0]); ballotLowBounds[i] = Ballot.deserialize(split[1]); diff --git a/src/java/org/apache/cassandra/utils/bytecomparable/ByteComparable.java b/src/java/org/apache/cassandra/utils/bytecomparable/ByteComparable.java new file mode 100644 index 000000000000..4bccb40c4c6e --- /dev/null +++ b/src/java/org/apache/cassandra/utils/bytecomparable/ByteComparable.java @@ -0,0 +1,163 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.utils.bytecomparable; + +import java.nio.ByteBuffer; + +/** + * Interface indicating a value can be represented/identified by a comparable {@link ByteSource}. + * + * All Cassandra types that can be used as part of a primary key have a corresponding byte-comparable translation, + * detailed in ByteComparable.md. Byte-comparable representations are used in some memtable as well as primary and + * secondary index implementations. + */ +public interface ByteComparable +{ + /** + * Returns a source that generates the byte-comparable representation of the value byte by byte. 
+ */ + ByteSource asComparableBytes(Version version); + + enum Version + { + LEGACY, // Encoding used in legacy sstable format; forward (value to byte-comparable) translation only + OSS42, // CASSANDRA 4.2 encoding + } + + ByteComparable EMPTY = (Version version) -> ByteSource.EMPTY; + + /** + * Construct a human-readable string from the byte-comparable representation. Used for debugging. + */ + default String byteComparableAsString(Version version) + { + StringBuilder builder = new StringBuilder(); + ByteSource stream = asComparableBytes(version); + if (stream == null) + return "null"; + for (int b = stream.next(); b != ByteSource.END_OF_STREAM; b = stream.next()) + builder.append(Integer.toHexString((b >> 4) & 0xF)).append(Integer.toHexString(b & 0xF)); + return builder.toString(); + } + + // Simple factories used for testing + + static ByteComparable of(String s) + { + return v -> ByteSource.of(s, v); + } + + static ByteComparable of(long value) + { + return v -> ByteSource.of(value); + } + + static ByteComparable of(int value) + { + return v -> ByteSource.of(value); + } + + static ByteComparable fixedLength(ByteBuffer bytes) + { + return v -> ByteSource.fixedLength(bytes); + } + + static ByteComparable fixedLength(byte[] bytes) + { + return v -> ByteSource.fixedLength(bytes); + } + + /** + * Returns a separator for two byte sources, i.e. something that is definitely > prevMax, and <= currMin, assuming + * prevMax < currMin. + * This returns the shortest prefix of currMin that is greater than prevMax. + */ + static ByteComparable separatorPrefix(ByteComparable prevMax, ByteComparable currMin) + { + return version -> ByteSource.separatorPrefix(prevMax.asComparableBytes(version), currMin.asComparableBytes(version)); + } + + /** + * Returns a separator for two byte comparable, i.e. something that is definitely > prevMax, and <= currMin, assuming + * prevMax < currMin. + * This is a stream of length 1 longer than the common prefix of the two streams, with last byte one higher than the + * prevMax stream. + */ + static ByteComparable separatorGt(ByteComparable prevMax, ByteComparable currMin) + { + return version -> ByteSource.separatorGt(prevMax.asComparableBytes(version), currMin.asComparableBytes(version)); + } + + static ByteComparable cut(ByteComparable src, int cutoff) + { + return version -> ByteSource.cut(src.asComparableBytes(version), cutoff); + } + + /** + * Return the length of a byte comparable, not including the terminator byte. + */ + static int length(ByteComparable src, Version version) + { + int l = 0; + ByteSource s = src.asComparableBytes(version); + while (s.next() != ByteSource.END_OF_STREAM) + ++l; + return l; + } + + /** + * Compare two byte-comparable values by their byte-comparable representation. Used for tests. + * + * @return the result of the lexicographic unsigned byte comparison of the byte-comparable representations of the + * two arguments + */ + static int compare(ByteComparable bytes1, ByteComparable bytes2, Version version) + { + ByteSource s1 = bytes1.asComparableBytes(version); + ByteSource s2 = bytes2.asComparableBytes(version); + + if (s1 == null || s2 == null) + return Boolean.compare(s1 != null, s2 != null); + + while (true) + { + int b1 = s1.next(); + int b2 = s2.next(); + int cmp = Integer.compare(b1, b2); + if (cmp != 0) + return cmp; + if (b1 == ByteSource.END_OF_STREAM) + return 0; + } + } + + /** + * Returns the length of the minimum prefix that differentiates the two given byte-comparable representations. 
+ */ + static int diffPoint(ByteComparable bytes1, ByteComparable bytes2, Version version) + { + ByteSource s1 = bytes1.asComparableBytes(version); + ByteSource s2 = bytes2.asComparableBytes(version); + int pos = 1; + int b; + while ((b = s1.next()) == s2.next() && b != ByteSource.END_OF_STREAM) + ++pos; + return pos; + } +} diff --git a/src/java/org/apache/cassandra/utils/bytecomparable/ByteComparable.md b/src/java/org/apache/cassandra/utils/bytecomparable/ByteComparable.md new file mode 100644 index 000000000000..f3606353813f --- /dev/null +++ b/src/java/org/apache/cassandra/utils/bytecomparable/ByteComparable.md @@ -0,0 +1,693 @@ + + +# Byte-comparable translation of types (ByteComparable/ByteSource) + +## Problem / Motivation + +Cassandra has a very heavy reliance on comparisons — they are used throughout read and write paths, coordination, +compaction, etc. to be able to order and merge results. It also supports a range of types which often require the +compared object to be completely in memory to order correctly, which in turn has necessitated interfaces where +comparisons can only be applied if the compared objects are completely loaded. + +This has some rather painful implications on the performance of the database, both in terms of the time it takes to load, +compare and garbage collect, as well as in terms of the space required to hold complete keys in on-disk indices and +deserialized versions in in-memory data structures. In addition to this, the reliance on comparisons forces Cassandra to +use only comparison-based structures, which aren’t the most efficient. + +There is no way to escape the need to compare and order objects in Cassandra, but the machinery for doing this can be +done much more smartly if we impose some simple structure in the objects we deal with — byte ordering. + +The term “byte order” as used in this document refers to the property of being ordered via lexicographic compare on the +unsigned values of the byte contents. Some of the types in Cassandra already have this property (e.g. strings, blobs), +but other most heavily used ones (e.g. integers, uuids) don’t. + +When byte order is universally available for the types used for keys, several key advantages can be put to use: +- Comparisons can be done using a single simple method, core machinery doesn’t need to know anything about types. +- Prefix differences are enough to define order; unique prefixes can be used instead of complete keys. +- Tries can be used to store, query and iterate over ranges of keys, providing fast lookup and prefix compression. +- Merging can be performed by merging tries, significantly reducing the number of necessary comparisons. + +## Ordering the types + +As we want to keep all existing functionality in Cassandra, we need to be able to deal with existing +non-byte-order-comparable types. This requires some form of conversion of each value to a sequence of bytes that can be +byte-order compared (also called "byte-comparable"), as well as the inverse conversion from byte-comparable to value. + +As one of the main advantages of byte order is the ability to decide comparisons early, without having to read the whole +of the input sequence, byte-ordered interpretations of values are represented as sources of bytes with unknown length, +using the interface `ByteSource`. The interface declares one method, `next()` which produces the next byte of the +stream, or `ByteSource.END_OF_STREAM` if the stream is exhausted. 
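+
+As a small illustration (a sketch, not part of this patch), draining a `ByteSource` is a simple loop over `next()`;
+`ByteSource.of(long)` below is one of the factory methods the interface provides:
+
+```java
+// Collect the byte-ordered translation of the long 42 (8 bytes, big-endian, sign bit flipped).
+java.util.List<Integer> bytes = new java.util.ArrayList<>();
+ByteSource src = ByteSource.of(42L);
+for (int b = src.next(); b != ByteSource.END_OF_STREAM; b = src.next())
+    bytes.add(b);   // each value is an unsigned byte, 0..255
+```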
+
+`END_OF_STREAM` is chosen as `-1` (`(int) -1`, which is outside the range of possible byte values), to make comparing
+two byte sources as trivial (and thus fast) as possible.
+
+To be able to completely abstract type information away from the storage machinery, we also flatten complex types into
+single byte sequences. To do this, we add separator bytes in front, between components, and at the end, and do some
+encoding of variable-length sequences.
+
+The other interface provided by this package, `ByteComparable`, is an entity whose byte-ordered interpretation can be
+requested. The interface is implemented by `DecoratedKey`, and can be requested for clustering keys and bounds using
+`ClusteringComparator.asByteComparable`. The inverse translation is provided by
+`Buffer/NativeDecoratedKey.fromByteComparable` and `ClusteringComparator.clustering/bound/boundaryFromByteComparable`.
+
+The (rather technical) paragraphs below detail the encoding we have chosen for the various types. For simplicity we
+only discuss the bidirectional `OSS42` version of the translation. The implementations in code of the various mappings
+are in the relevant `AbstractType` subclass.
+
+### Desired properties
+
+Generally, we desire the following two properties from the byte-ordered translations of values we use in the database:
+
+- Comparison equivalence (1):
+
+      ∀ x, y ∈ T: compareBytesUnsigned(T.byteOrdered(x), T.byteOrdered(y)) = T.compare(x, y)
+
+- Prefix-freedom (2):
+
+      ∀ x, y ∈ T: T.byteOrdered(x) is not a prefix of T.byteOrdered(y)
+
+The former is the essential requirement, and the latter allows construction of encodings of sequences of multiple
+values, as well as a little more efficiency in the data structures.
+
+To more efficiently encode byte-ordered blobs, however, we use a slightly tweaked version of the above requirements,
+where `+` stands for concatenation:
+
+- Comparison equivalence (3):
+
+      ∀ x, y ∈ T, ∀ b1, b2 ∈ [0x10-0xEF]:
+      compareBytesUnsigned(T.byteOrdered(x) + b1, T.byteOrdered(y) + b2) = T.compare(x, y)
+
+- Weak prefix-freedom (4):
+
+      ∀ x, y ∈ T, ∀ b ∈ [0x10-0xEF]:
+      (T.byteOrdered(x) + b) is not a prefix of T.byteOrdered(y)
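+
+In code, property (1) can be sanity-checked with the helpers in this package (a sketch, not part of this patch; it
+assumes `Int32Type` implements `asComparableBytes` like the other fixed-length types):
+
+```java
+// The byte-ordered translations of -1 and 0 must compare the same way Int32Type compares the values.
+ByteComparable x = v -> Int32Type.instance.asComparableBytes(ByteBufferAccessor.instance,
+                                                             Int32Type.instance.decompose(-1), v);
+ByteComparable y = v -> Int32Type.instance.asComparableBytes(ByteBufferAccessor.instance,
+                                                             Int32Type.instance.decompose(0), v);
+assert ByteComparable.compare(x, y, ByteComparable.Version.OSS42) < 0;
+```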
+
+These versions allow the addition of a separator byte after each value, and guarantee that the combination with
+separator fulfills the original requirements. (3) is somewhat stronger than (1) but is necessarily true if (2) is also
+in force, while (4) trivially follows from (2).
+
+## Fixed-length unsigned integers (Murmur token, date/time)
+
+This is the trivial case, as we can simply use the input bytes in big-endian order. The comparison result is the same,
+and fixed-length values are trivially prefix-free, i.e. (1) and (2) are satisfied, and thus (3) and (4) follow from the
+observation above.
+
+## Fixed-length signed integers (byte, short, int, legacy bigint)
+
+As above, but we need to invert the sign bit of the number to put negative numbers before positives. This maps
+`MIN_VALUE` to `0x00`..., `-1` to `0x7F…`, `0` to `0x80…`, and `MAX_VALUE` to `0xFF…`; comparing the resulting number
+as an unsigned integer has the same effect as comparing the source signed.
+
+Examples:
+
+| Type and value | bytes |encodes as|
+|----------------|-------------------------|----------|
+| int 1 | 00 00 00 01 | 80 00 00 01
+| short -1 | FF FF | 7F FF
+| byte 0 | 00 | 80
+| byte -2 | FE | 7E
+| int MAX_VALUE | 7F FF FF FF | FF FF FF FF
+| long MIN_VALUE | 80 00 00 00 00 00 00 00 | 00 00 00 00 00 00 00 00
+
+## Variable-length encoding of integers (current bigint)
+
+Another way to encode integers that may save significant amounts of space when smaller numbers are often in use, but
+still permits large values to be efficiently encoded, is to use an encoding scheme similar to UTF-8.
+
+For unsigned numbers this can be done by starting the number with as many 1s in most significant bits as there are
+additional bytes in the encoding, followed by a 0, and the bits of the number. Numbers between 0 and 127 are encoded
+in one byte, and each additional byte adds 7 more bits. Values that use all 8 bytes do not need a 9th bit of 0 and can
+thus fit in 9 bytes. Because longer numbers have more 1s in their MSBs, they compare
+higher than shorter ones (and we always use the shortest representation). Because the length is specified through these
+initial bits, no value can be a prefix of another.
+
+| Value | bytes |encodes as|
+|------------------|-------------------------|----------|
+| 0 | 00 00 00 00 00 00 00 00 | 00
+| 1 | 00 00 00 00 00 00 00 01 | 01
+| 127 (2^7-1) | 00 00 00 00 00 00 00 7F | 7F
+| 128 (2^7) | 00 00 00 00 00 00 00 80 | 80 80
+| 16383 (2^14 - 1) | 00 00 00 00 00 00 3F FF | BF FF
+| 16384 (2^14) | 00 00 00 00 00 00 40 00 | C0 40 00
+| 2^31 - 1 | 00 00 00 00 7F FF FF FF | F0 7F FF FF FF
+| 2^31 | 00 00 00 00 80 00 00 00 | F0 80 00 00 00
+| 2^56 - 1 | 00 FF FF FF FF FF FF FF | FE FF FF FF FF FF FF FF
+| 2^56 | 01 00 00 00 00 00 00 00 | FF 01 00 00 00 00 00 00 00
+| 2^64 - 1 | FF FF FF FF FF FF FF FF | FF FF FF FF FF FF FF FF FF
+
+
+To encode signed numbers, we must start with the sign bit, and must also ensure that longer negative numbers sort
+smaller than shorter ones. The first bit of the encoding is the inverted sign (i.e. 1 for positive, 0 for negative),
+followed by the length encoded as a sequence of bits that matches the inverted sign, followed by a bit that differs
+(like above, not necessary for 9-byte encodings) and the bits of the number's two's complement.
+ +| Value | bytes |encodes as| +|-------------------|--------------------------|----------| +| 1 | 00 00 00 00 00 00 00 01 | 01 +| -1 | FF FF FF FF FF FF FF FF | 7F +| 0 | 00 00 00 00 00 00 00 00 | 80 +| 63 | 00 00 00 00 00 00 00 3F | BF +| -64 | FF FF FF FF FF FF FF C0 | 40 +| 64 | 00 00 00 00 00 00 00 40 | C0 40 +| -65 | FF FF FF FF FF FF FF BF | 3F BF +| 8191 | 00 00 00 00 00 00 1F FF | DF FF +| 8192 | 00 00 00 00 00 00 20 00 | E0 20 00 +| Integer.MAX_VALUE | 00 00 00 00 7F FF FF FF | F8 7F FF FF FF +| Long.MIN_VALUE | 80 00 00 00 00 00 00 00 | 00 00 00 00 00 00 00 00 00 + + +## Fixed-size floating-point numbers (float, double) + +IEEE-754 was designed with byte-by-byte comparisons in mind, and provides an important guarantee about the bytes of a +floating point number: +* If x and y are of the same sign, bytes(x) ≥ bytes(y) ⇔ |x| ≥ |y|. + +Thus, to be able to order floating point numbers as unsigned integers, we can: +* Flip the sign bit so negatives are smaller than positive numbers. +* If the number was negative, also flip all the other bits so larger magnitudes become smaller integers. + +This matches exactly the behaviour of `Double.compare`, which doesn’t fully agree with numerical comparisons (see spec) +in order to define a natural order over the floating point numbers. + +Examples: + +|Type and value|bytes|encodes as| +|---|---|---| +|float +1.0| 3F 80 00 00| BF 80 00 00| +|float +0.0| 00 00 00 00| 80 00 00 00| +|float -0.0| 80 00 00 00| 7F FF FF FF| +|float -1.0| BF 80 00 00| 40 7F FF FF| +|double +1.0| 3F F0 00 00 00 00 00 00| BF F0 00 00 00 00 00 00| +|double +Inf| 7F F0 00 00 00 00 00 00| FF F0 00 00 00 00 00 00| +|double -Inf| FF F0 00 00 00 00 00 00| 00 0F FF FF FF FF FF FF| +|double NaN| 7F F8 00 00 00 00 00 00| FF F8 00 00 00 00 00 00| + +## UUIDs +UUIDs are fixed-length unsigned integers, where the UUID version/type is compared first, and where bits need to be +reordered for the time UUIDs. To create a byte-ordered representation, we reorder the bytes: pull the version digit +first, then the rest of the digits, using the special time order if the version is equal to one. + +Examples: + +|Type and value|bytes|encodes as| +|---|---|---| +|Random (v4)| cc520882-9507-44fb-8fc9-b349ecdee658 | 4cc52088295074fb8fc9b349ecdee658 +|Time (v1) | 2a92d750-d8dc-11e6-a2de-cf8ecd4cf053 | 11e6d8dc2a92d750a2decf8ecd4cf053 + +## Multi-component sequences (Partition or Clustering keys, tuples), bounds and nulls + +As mentioned above, we encode sequences by adding separator bytes in front, between components, and a terminator at the +end. The values we chose for the separator and terminator are `0x40` and `0x38`, and they serve several purposes: +- Permits partially specified bounds, with strict/exclusive or non-strict/inclusive semantics. This is done by finishing + a bound with a terminator value that is smaller/greater than the separator and terminator. We can use `0x20` for `<`/`≥` + and `0x60` for `≤`/`>`. +- Permits encoding of `null` and `empty` values. We use `0x3E` as the separator for nulls and `0x3F` for empty, + followed by no value bytes. This is always smaller than a sequence with non-null value for this component, but not + smaller than a sequence that ends in this component. +- Helps identify the ending of variable-length components (see below). 
+ +Examples: + +|Types and values|bytes|encodes as| +|---|---|---| +|(short 1, float 1.0) | 00 01, 3F 80 00 00 | 40·80 01·40·BF 80 00 00·38 +|(short -1, null) | FF FF, — | 40·7F FF·3E·38 +|≥ (short 0, float -Inf) | 00 00, FF 80 00 00, >=| 40·80 00·40·00 7F FF FF·20 +|< (short MIN) | 80 00, <= | 40·00 00·20 +|\> (null) | | 3E·60 +|BOTTOM | | 20 +|TOP | | 60 + +(The middle dot · doesn't exist in the encoding, it’s just a visualisation of the boundaries in the examples.) + +Since: +- all separators in use are within `0x10`-`0xEF`, and +- we use the same separator for internal components, with the exception of nulls which we encode with a smaller + separator +- the sequence has a fixed number of components or we use a different trailing value whenever it can be shorter + +the properties (3) and (4) guarantee that the byte comparison of the encoding goes in the same direction as the +lexicographical comparison of the sequence. In combination with the third point above, (4) also ensures that no encoding +is a prefix of another. Since we have (1) and (2), (3) and (4) are also satisfied. + +Note that this means that the encodings of all partition and clustering keys used in the database will be prefix-free. + +## Variable-length byte comparables (ASCII, UTF-8 strings, blobs, InetAddress) + +In isolation, these can be compared directly without reinterpretation. However, once we place these inside a flattened +sequence of values we need to clearly define the boundaries between values while maintaining order. To do this we use an +end-of-value marker; since shorter values must be smaller than longer, this marker must be 0 and we need to find a way +to encode/escape actual 0s in the input sequence. + +The method we chose for this is the following: +- If the input does not end on `00`, a `00` byte is appended at the end. +- If the input contains a `00` byte, it is encoded as `00 FF`. +- If the input contains a sequence of *n* `00` bytes, they are encoded as `00` `FE` (*n*-1 times) `FF` + (so that we don’t double the size of `00` blobs). +- If the input ends in `00`, the last `FF` is changed to `FE` + (to ensure it’s smaller than the same value with `00` appended). + +Examples: + +|bytes/sequence|encodes as| +|---|----| +|22 00 | 22 00 FE +|22 00 00 33 | 22 00 FE FF 33 00 +|22 00 11 | 22 00 FF 11 00 +|(blob 22, short 0) | 40·22 00·40·80 00·40 +| ≥ (blob 22 00) | 40·22 00 FE·20 +| ≤ (blob 22 00 00) | 40·22 00 FE FE·60 + +Within the encoding, a `00` byte can only be followed by a `FE` or `FF` byte, and hence if an encoding is a prefix of +another, the latter has to have a `FE` or `FF` as the next byte, which ensures both (4) (adding `10`-`EF` to the former +makes it no longer a prefix of the latter) and (3) (adding `10`-`EF` to the former makes it smaller than the latter; in +this case the original value of the former is a prefix of the original value of the latter). + +## Variable-length integers (varint, RandomPartitioner token), legacy encoding + +If integers of unbounded length are guaranteed to start with a non-zero digit, to compare them we can first use a signed +length, as numbers with longer representations have higher magnitudes. Only if the lengths match we need to compare the +sequence of digits, which now has a known length. + +(Note: The meaning of “digit” here is not the same as “decimal digit”. We operate with numbers stored as bytes, thus it +makes most sense to treat the numbers as encoded in base-256, where each digit is a byte.) 
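+
+The length-first principle can be sketched as follows (an illustration, not code from this patch), for non-negative
+numbers already stripped of leading zero bytes:
+
+```java
+// Longer representations always mean larger magnitude; only equal lengths need a digit-by-digit pass.
+static int compareByLengthThenDigits(byte[] a, byte[] b)
+{
+    if (a.length != b.length)
+        return Integer.compare(a.length, b.length);
+    for (int i = 0; i < a.length; ++i)
+    {
+        int cmp = Integer.compare(a[i] & 0xFF, b[i] & 0xFF);
+        if (cmp != 0)
+            return cmp;
+    }
+    return 0;
+}
+```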
+ +This translates to the following encoding of varints: +- Strip any leading zeros. Note that for negative numbers, `BigInteger` encodes leading 0 as `0xFF`. +- If the length is 128 or greater, lead with a byte of `0xFF` (positive) or `0x00` (negative) for every 128 until there + are less than 128 left. +- Encode the sign and (remaining) length of the number as a byte: + - `0x80 + (length - 1)` for positive numbers (so that greater magnitude is higher); + - `0x7F - (length - 1)` for negative numbers (so that greater magnitude is lower, and all negatives are lower than + positives). +- Paste the bytes of the number, 2’s complement encoded for negative numbers (`BigInteger` already applies the 2’s + complement). + +Since when comparing two numbers we either have a difference in the length prefix, or the lengths are the same if we +need to compare the content bytes, there is no risk that a longer number can be confused with a shorter combined in a +multi-component sequence. In other words, no value can be a prefix of another, thus we have (1) and (2) and thus (3) and (4) +as well. + +Examples: + +| value | bytes |encodes as| +|--------:|------------------|---| +| 0 | 00 | 80·00 +| 1 | 01 | 80·01 +| -1 | FF | 7F·FF +| 255 | 00 FF | 80·FF +| -256 | FF 00 | 7F·00 +| 256 | 01 00 | 81·01 00 +| 2^16 | 01 00 00 | 82·01 00 00 +| -2^32 | FF 00 00 00 00 | 7C·00 00 00 00 +| 2^1024 | 01 00(128 times) | FF 80·01 00(128 times) +| -2^2048 | FF 00(256 times) | 00 00 80·00(256 times) + +(Middle dot · shows the transition point between length and digits.) + +## Variable-length integers, current encoding + +Because variable-length integers are also often used to store smaller range integers, it makes sense to also apply +the variable-length integer encoding. Thus, the current varint scheme chooses to: +- Strip any leading zeros. Note that for negative numbers, `BigInteger` encodes leading 0 as `0xFF`. +- Map numbers directly to their [variable-length integer encoding](#variable-length-encoding-of-integers-current-bigint), + if they have 6 bytes or less. +- Otherwise, encode as: + - a sign byte (00 for negative numbers, FF for positive, distinct from the leading byte of the variable-length + encoding above) + - a variable-length encoded number of bytes adjusted by -7 (so that the smallest length this encoding uses maps to + 0), inverted for negative numbers (so that greater length compares smaller) + - the bytes of the number, two's complement encoded. +We never use a longer encoding (e.g. using the second method if variable-length suffices or with added 00 leading +bytes) if a shorter one suffices. + +By the same reasoning as above, and the fact that the sign byte cannot be confused with a variable-length encoding +first byte, no value can be a prefix of another. As the sign byte compares smaller for negative (respectively bigger +for positive numbers) than any variable-length encoded integer, the comparison order is maintained when one number +uses variable-length encoding, and the other doesn't. Longer numbers compare smaller when negative (because of the +inverted length bytes), and bigger when positive. 
+ +Examples: + +| value | bytes |encodes as| +|---------:|-------------------------|---| +| 0 | 00 | 80 +| 1 | 01 | 81 +| -1 | FF | 7F +| 255 | 00 FF | C0 FF +| -256 | FF 00 | 3F 00 +| 256 | 01 00 | C1 00 +| 2^16 | 01 00 00 | E1 00 00 +| -2^32 | FF 00 00 00 00 | 07 00 00 00 00 +| 2^56-1 | 00 FF FF FF FF FF FF FF | FE FF FF FF FF FF FF FF +| -2^56 | FF 00 00 00 00 00 00 00 | 01 00 00 00 00 00 00 00 +| 2^56 | 01 00 00 00 00 00 00 00 | FF·00·01 00 00 00 00 00 00 00 +| -2^56-1 | FE FF FF FF FF FF FF FF | 00·FF·FE FF FF FF FF FF FF FF +| 2^1024 | 01 00(128 times) | FF·7A·01 00(128 times) +| -2^2048 | FF 00(256 times) | 00·7F 06·00(256 times) + +(Middle dot · shows the transition point between length and digits.) + +## Variable-length floating-point decimals (decimal) + +Variable-length floats are more complicated, but we can treat them similarly to IEEE-754 floating point numbers, by +normalizing them by splitting them into sign, mantissa and signed exponent such that the mantissa is a number below 1 +with a non-zero leading digit. We can then compare sign, exponent and mantissa in sequence (where the comparison of +exponent and mantissa are with reversed meaning if the sign is negative) and that gives us the decimal ordering. + +A bit of extra care must be exercised when encoding decimals. Since fractions like `0.1` cannot be perfectly encoded in +binary, decimals (and mantissas) cannot be encoded in binary or base-256 correctly. A decimal base must be used; since +we deal with bytes, it makes most sense to make things a little more efficient by using base-100. Floating-point +encoding and the comparison idea from the previous paragraph work in any number base. + +`BigDecimal` presents a further challenge, as it encodes decimals using a mixture of bases: numbers have a binary- +encoded integer part and a decimal power-of-ten scale. The bytes produced by a `BigDecimal` are thus not suitable for +direct conversion to byte comparable and we must first instantiate the bytes as a `BigDecimal`, and then apply the +class’s methods to operate on it as a number. + +We then use the following encoding: +- If the number is 0, the encoding is a single `0x80` byte. +- Convert the input to signed mantissa and signed exponent in base-100. If the value is negative, invert the sign of the + exponent to form the "modulated exponent". +- Output a byte encoding: + - the sign of the number encoded as `0x80` if positive and `0x00` if negative, + - the exponent length (stripping leading 0s) in bytes as `0x40 + modulated_exponent_length`, where the length is given + with the sign of the modulated exponent. +- Output `exponent_length` bytes of modulated exponent, 2’s complement encoded so that negative values are correctly + ordered. +- Output `0x80 + leading signed byte of mantissa`, which is obtained by multiplying the mantissa by 100 and rounding to + -∞. The rounding is done so that the remainder of the mantissa becomes positive, and thus every new byte adds some + value to it, making shorter sequences lower in value. +- Update the mantissa to be the remainder after the rounding above. The result is guaranteed to be 0 or greater. +- While the mantissa is non-zero, output `0x80 + leading byte` as above and update the mantissa to be the remainder. +- Output `0x00`. 
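+
+The normalization described above can be sketched as follows (illustrative only, not the patch's code):
+
+```java
+// Split a non-zero BigDecimal into sign and a base-100 exponent/mantissa pair with the mantissa in
+// [0.01, 1), so that value = sign * mantissa * 100^exponent; the modulated exponent is exponent * sign.
+static void normalize(java.math.BigDecimal value)
+{
+    if (value.signum() == 0)
+        return;                                     // zero is simply encoded as the single byte 0x80
+    int sign = value.signum();
+    java.math.BigDecimal mantissa = value.abs();
+    int exponent = 0;
+    while (mantissa.compareTo(java.math.BigDecimal.ONE) >= 0)
+    {
+        mantissa = mantissa.movePointLeft(2);       // divide by 100
+        ++exponent;
+    }
+    while (mantissa.compareTo(new java.math.BigDecimal("0.01")) < 0)
+    {
+        mantissa = mantissa.movePointRight(2);      // multiply by 100
+        --exponent;
+    }
+    System.out.printf("sign=%d, modulated exponent=%d, mantissa=%s%n", sign, exponent * sign, mantissa);
+}
+```
+
+For 1.1 this prints sign=1, modulated exponent=1, mantissa=0.011, matching the 1.1 row of the Examples table below.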
+ +As a description of how this produces the correct ordering, consider the result of comparison in the first differing +byte: +- Difference in the first byte can be caused by: + - Difference in sign of the number or being zero, which yields the correct ordering because + - Negative numbers start with `0x3c` - `0x44` + - Zero starts with `0x80` + - Positive numbers start with `0xbc` - `0xc4` + - Difference in sign of the exponent modulated with the sign of the number. In a positive number negative exponents + mean smaller values, while in a negative number it’s the opposite, thus the modulation with the number’s sign + ensures the correct ordering. + - Difference in modulated length of the exponent: again, since we gave the length a sign that is formed from both + the sign of the exponent and the sign of the number, smaller numbers mean smaller exponent in the positive number + case, and bigger exponent in the negative number case. In either case this provides the correct ordering. +- Difference in one of the bytes of the modulated exponent (whose length and sign are now equal for both compared + numbers): + - Smaller byte means a smaller modulated exponent. In the positive case this means a smaller exponent, thus a smaller + number. In the negative case this means the exponent is bigger, the absolute value of the number as well, and thus + the number is smaller. +- It is not possible for the difference to mix one number’s exponent with another’s mantissa (as such numbers would have + different leading bytes). +- Difference in a mantissa byte present in both inputs: + - Smaller byte means smaller signed mantissa and hence smaller number when the exponents are equal. +- One mantissa ending before another: + - This will result in the shorter being treated as smaller (since the trailing byte is `00`). + - Since all mantissas have at least one byte, this can’t happen in the leading mantissa byte. + - Thus the other number’s bytes from here on are not negative, and at least one of them must be non-zero, which means + its mantissa is bigger and thus it encodes a bigger number. + +Examples: + +|value|mexp|mantissa|mantissa in bytes|encodes as| +|---:|---:|---|---|---| +|1.1 | 1 | 0.0110 |. 01 10 | C1·01·81 8A·00 +|1 | 1 | 0.01 |. 01 | C1·01·81·00 +|0.01 | 0 | 0.01 |. 01 | C0·81·00 +|0 | | | | 80 +|-0.01 | 0 | -0.01 |. -01 | 40·81·00 +|-1 | -1 | -0.01 |. -01 | 3F·FF·7F·00 +|-1.1 | -1 | -0.0110|. -02 90 | 3F·FF·7E DA·00 +|-98.9 | -1 | -0.9890|. -99 10 | 3F·FF·1D 8A·00 +|-99 | -1 | -0.99 |. -99 | 3F·FF·1D·00 +|-99.9 | -1 | -0.9990|.-100 10 | 3F·FF·1C 8A·00 +|-8.1e2000 | -1001| -0.0810|. -09 90 | 3E·FC 17·77 DA·00 +|-8.1e-2000 | 999 | -0.0810|. -09 90 | 42·03 E7·77 DA·00 +|8.1e-2000 | -999 | 0.0810 |. 08 10 | BE·FC 19·88 8A·00 +|8.1e2000 | 1001 | 0.0810 |. 08 10 | C2·03 E9·88 8A·00 +(mexp stands for “modulated exponent”, i.e. exponent * sign) + +The values are prefix-free, because no exponent’s encoding can be a prefix of another, and the mantissas can never have +a `00` byte at any place other than the last byte, and thus all (1)-(4) are satisfied. + +## Nulls and empty encodings + +Some types in Cassandra (e.g. numbers) admit null values that are represented as empty byte buffers. This is +distinct from null byte buffers, which can also appear in some cases. Particularly, null values in clustering +columns, when allowed by the type, are interpreted as empty byte buffers, encoded with the empty separator `0x3F`. 
+Unspecified clustering columns (at the end of a clustering specification), possible with `COMPACT STORAGE` or secondary +indexes, use the null separator `0x3E`. + +## Reversed types + +Reversing a type is straightforward: flip all bits of the encoded byte sequence. Since the source type encoding must +satisfy (3) and (4), the flipped bits also do for the reversed comparator. (It is also true that if the source type +satisfies (1)-(2), the reversed will satisfy these too.) + +In a sequence we also must correct the empty encoding for a reversed type (since it must be greater than all values). +Instead of `0x3F` we use `0x41` as the separator byte. Null encodings are not modified, as nulls compare smaller even +in reversed types. + diff --git a/src/java/org/apache/cassandra/utils/bytecomparable/ByteSource.java b/src/java/org/apache/cassandra/utils/bytecomparable/ByteSource.java new file mode 100644 index 000000000000..be4cec0eae9e --- /dev/null +++ b/src/java/org/apache/cassandra/utils/bytecomparable/ByteSource.java @@ -0,0 +1,853 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.cassandra.utils.bytecomparable; + +import java.nio.ByteBuffer; +import java.nio.charset.StandardCharsets; + +import org.apache.cassandra.db.marshal.ValueAccessor; +import org.apache.cassandra.utils.bytecomparable.ByteComparable.Version; +import org.apache.cassandra.utils.memory.MemoryUtil; + +import static com.google.common.base.Preconditions.checkArgument; + +/** + * A stream of bytes, used for byte-order-comparable representations of data, and utilities to convert various values + * to their byte-ordered translation. + * See ByteComparable.md for details about the encoding scheme. + */ +public interface ByteSource +{ + /** Consume the next byte, unsigned. Must be between 0 and 255, or END_OF_STREAM if there are no more bytes. */ + int next(); + + /** Value returned if at the end of the stream. */ + int END_OF_STREAM = -1; + + ByteSource EMPTY = () -> END_OF_STREAM; + + /** + * Escape value. Used, among other things, to mark the end of subcomponents (so that shorter compares before anything longer). + * Actual zeros in input need to be escaped if this is in use (see {@link AbstractEscaper}). + */ + int ESCAPE = 0x00; + + // Zeros are encoded as a sequence of ESCAPE, 0 or more of ESCAPED_0_CONT, ESCAPED_0_DONE so zeroed spaces only grow by 1 byte + int ESCAPED_0_CONT = 0xFE; + int ESCAPED_0_DONE = 0xFF; + + // All separators must be within these bounds + int MIN_SEPARATOR = 0x10; + int MAX_SEPARATOR = 0xEF; + + // Next component marker. + int NEXT_COMPONENT = 0x40; + // Marker used to present null values represented by empty buffers (e.g. 
by Int32Type) + int NEXT_COMPONENT_EMPTY = 0x3F; + int NEXT_COMPONENT_EMPTY_REVERSED = 0x41; + // Marker for null components in tuples, maps, sets and clustering keys. + int NEXT_COMPONENT_NULL = 0x3E; + + // Section for next component markers which is not allowed for use + int MIN_NEXT_COMPONENT = 0x3C; + int MAX_NEXT_COMPONENT = 0x44; + + // Default terminator byte in sequences. Smaller than NEXT_COMPONENT_NULL, but larger than LT_NEXT_COMPONENT to + // ensure lexicographic compares go in the correct direction + int TERMINATOR = 0x38; + // These are special endings, for exclusive/inclusive bounds (i.e. smaller than anything with more components, + // bigger than anything with more components) + int LT_NEXT_COMPONENT = 0x20; + int GT_NEXT_COMPONENT = 0x60; + + // Special value for components that should be excluded from the normal min/max span. (static rows) + int EXCLUDED = 0x18; + + /** + * Encodes byte-accessible data as a byte-comparable source that has 0s escaped and finishes in an escaped + * state. + * This provides a weakly-prefix-free byte-comparable version of the content to use in sequences. + * (See {@link AbstractEscaper} for a detailed explanation.) + */ + static ByteSource of(ValueAccessor accessor, V data, Version version) + { + return new AccessorEscaper<>(accessor, data, version); + } + + /** + * Encodes a byte buffer as a byte-comparable source that has 0s escaped and finishes in an escape. + * This provides a weakly-prefix-free byte-comparable version of the content to use in sequences. + * (See ByteSource.BufferEscaper/Multi for explanation.) + */ + static ByteSource of(ByteBuffer buf, Version version) + { + return new BufferEscaper(buf, version); + } + + /** + * Encodes a byte array as a byte-comparable source that has 0s escaped and finishes in an escape. + * This provides a prefix-free byte-comparable version of the content to use in sequences. + * (See ByteSource.BufferEscaper/Multi for explanation.) + */ + static ByteSource of(byte[] buf, Version version) + { + return new ArrayEscaper(buf, version); + } + + /** + * Encodes a memory range as a byte-comparable source that has 0s escaped and finishes in an escape. + * This provides a weakly-prefix-free byte-comparable version of the content to use in sequences. + * (See ByteSource.BufferEscaper/Multi for explanation.) + */ + static ByteSource ofMemory(long address, int length, ByteComparable.Version version) + { + return new MemoryEscaper(address, length, version); + } + + /** + * Combines a chain of sources, turning their weak-prefix-free byte-comparable representation into the combination's + * prefix-free byte-comparable representation, with the included terminator character. + * For correctness, the terminator must be within MIN-MAX_SEPARATOR and outside the range reserved for + * NEXT_COMPONENT markers. + * Typically TERMINATOR, or LT/GT_NEXT_COMPONENT if used for partially specified bounds. + */ + static ByteSource withTerminator(int terminator, ByteSource... srcs) + { + assert terminator >= MIN_SEPARATOR && terminator <= MAX_SEPARATOR; + assert terminator < MIN_NEXT_COMPONENT || terminator > MAX_NEXT_COMPONENT; + return new Multi(srcs, terminator); + } + + /** + * As above, but permits any separator. The legacy format wasn't using weak prefix freedom and has some + * non-reversible transformations. + */ + static ByteSource withTerminatorLegacy(int terminator, ByteSource... 
srcs) + { + return new Multi(srcs, terminator); + } + + static ByteSource withTerminatorMaybeLegacy(Version version, int legacyTerminator, ByteSource... srcs) + { + return version == Version.LEGACY ? withTerminatorLegacy(legacyTerminator, srcs) + : withTerminator(TERMINATOR, srcs); + } + + static ByteSource of(String s, Version version) + { + return new ArrayEscaper(s.getBytes(StandardCharsets.UTF_8), version); + } + + static ByteSource of(long value) + { + return new Number(value ^ (1L<<63), 8); + } + + static ByteSource of(int value) + { + return new Number(value ^ (1L<<31), 4); + } + + /** + * Produce a source for a signed fixed-length number, also translating empty to null. + * The first byte has its sign bit inverted, and the rest are passed unchanged. + * Presumes that the length of the buffer is always either 0 or constant for the type, which permits decoding and + * ensures the representation is prefix-free. + */ + static ByteSource optionalSignedFixedLengthNumber(ValueAccessor accessor, V data) + { + return !accessor.isEmpty(data) ? signedFixedLengthNumber(accessor, data) : null; + } + + /** + * Produce a source for a signed fixed-length number. + * The first byte has its sign bit inverted, and the rest are passed unchanged. + * Presumes that the length of the buffer is always constant for the type. + */ + static ByteSource signedFixedLengthNumber(ValueAccessor accessor, V data) + { + return new SignedFixedLengthNumber<>(accessor, data); + } + + /** + * Produce a source for a signed fixed-length floating-point number, also translating empty to null. + * If sign bit is on, returns negated bytes. If not, add the sign bit value. + * (Sign of IEEE floats is the highest bit, the rest can be compared in magnitude by byte comparison.) + * Presumes that the length of the buffer is always either 0 or constant for the type, which permits decoding and + * ensures the representation is prefix-free. + */ + static ByteSource optionalSignedFixedLengthFloat(ValueAccessor accessor, V data) + { + return !accessor.isEmpty(data) ? signedFixedLengthFloat(accessor, data) : null; + } + + /** + * Produce a source for a signed fixed-length floating-point number. + * If sign bit is on, returns negated bytes. If not, add the sign bit value. + * (Sign of IEEE floats is the highest bit, the rest can be compared in magnitude by byte comparison.) + * Presumes that the length of the buffer is always constant for the type. + */ + static ByteSource signedFixedLengthFloat(ValueAccessor accessor, V data) + { + return new SignedFixedLengthFloat<>(accessor, data); + } + + /** + * Produce a source for a signed integer, stored using variable length encoding. + * The representation uses between 1 and 9 bytes, is prefix-free and compares + * correctly. + */ + static ByteSource variableLengthInteger(long value) + { + return new VariableLengthInteger(value); + } + + /** + * Returns a separator for two byte sources, i.e. something that is definitely > prevMax, and <= currMin, assuming + * prevMax < currMin. + * This returns the shortest prefix of currMin that is greater than prevMax. + */ + public static ByteSource separatorPrefix(ByteSource prevMax, ByteSource currMin) + { + return new Separator(prevMax, currMin, true); + } + + /** + * Returns a separator for two byte sources, i.e. something that is definitely > prevMax, and <= currMin, assuming + * prevMax < currMin. + * This is a source of length 1 longer than the common prefix of the two sources, with last byte one higher than the + * prevMax source. 
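+     *
+     * A worked example, derived from the rule above (the byte values are assumed for illustration): for
+     * prevMax = 41 42 43 and currMin = 41 44, the common prefix is the single byte 41, so the result
+     * is 41 43, one byte longer than the common prefix, with the last byte being prevMax's 42 plus one.
+     * 41 43 is indeed greater than 41 42 43 and not greater than 41 44.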
+ */ + public static ByteSource separatorGt(ByteSource prevMax, ByteSource currMin) + { + return new Separator(prevMax, currMin, false); + } + + public static ByteSource oneByte(int i) + { + assert i >= 0 && i <= 0xFF : "Argument must be a valid unsigned byte."; + return new ByteSource() + { + boolean consumed = false; + + @Override + public int next() + { + if (consumed) + return END_OF_STREAM; + consumed = true; + return i; + } + }; + } + + public static ByteSource cut(ByteSource src, int cutoff) + { + return new ByteSource() + { + int pos = 0; + + @Override + public int next() + { + return pos++ < cutoff ? src.next() : END_OF_STREAM; + } + }; + } + + /** + * Wrap a ByteSource in a length-fixing facade. + * + * If the length of {@code src} is less than {@code cutoff}, then pad it on the right with {@code padding} until + * the overall length equals {@code cutoff}. If the length of {@code src} is greater than {@code cutoff}, then + * truncate {@code src} to that size. Effectively a noop if {@code src} happens to have length {@code cutoff}. + * + * @param src the input source to wrap + * @param cutoff the size of the source returned + * @param padding a padding byte (an int subject to a 0xFF mask) + */ + public static ByteSource cutOrRightPad(ByteSource src, int cutoff, int padding) + { + return new ByteSource() + { + int pos = 0; + + @Override + public int next() + { + if (pos++ >= cutoff) + { + return END_OF_STREAM; + } + int next = src.next(); + return next == END_OF_STREAM ? padding : next; + } + }; + } + + + /** + * Variable-length encoding. Escapes 0s as ESCAPE + zero or more ESCAPED_0_CONT + ESCAPED_0_DONE. + * If the source ends in 0, we use ESCAPED_0_CONT to make sure that the encoding remains smaller than that source + * with a further 0 at the end. + * Finishes in an escaped state (either with ESCAPE or ESCAPED_0_CONT), which in {@link Multi} is followed by + * a component separator between 0x10 and 0xFE. + * + * E.g. "A\0\0B" translates to 4100FEFF4200 + * "A\0B\0" 4100FF4200FE (+00 for {@link Version#LEGACY}) + * "A\0" 4100FE (+00 for {@link Version#LEGACY}) + * "AB" 414200 + * + * If in a single byte source, the bytes could be simply passed unchanged, but this would not allow us to + * combine components. This translation preserves order, and since the encoding for 0 is higher than the separator + * also makes sure shorter components are treated as smaller. + * + * The encoding is not prefix-free, since e.g. the encoding of "A" (4100) is a prefix of the encoding of "A\0" + * (4100FE), but the byte following the prefix is guaranteed to be FE or FF, which makes the encoding weakly + * prefix-free. Additionally, any such prefix sequence will compare smaller than the value to which it is a prefix, + * because any permitted separator byte will be smaller than the byte following the prefix. 
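+     *
+     * By the same rules (a worked example added here, not part of the original list above), a value with
+     * two trailing zeros such as "A\0\0" translates to 4100FEFE: each additional trailing zero
+     * contributes one more ESCAPED_0_CONT, and the source again finishes in an escaped state.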
+ */ + abstract static class AbstractEscaper implements ByteSource + { + private final Version version; + private int bufpos; + private boolean escaped; + + AbstractEscaper(int position, Version version) + { + this.bufpos = position; + this.version = version; + } + + @Override + public final int next() + { + if (bufpos >= limit()) + { + if (bufpos > limit()) + return END_OF_STREAM; + + ++bufpos; + if (escaped) + { + escaped = false; + if (version == Version.LEGACY) + --bufpos; // place an ESCAPE at the end of sequence ending in ESCAPE + return ESCAPED_0_CONT; + } + return ESCAPE; + } + + int index = bufpos++; + int b = get(index) & 0xFF; + if (!escaped) + { + if (b == ESCAPE) + escaped = true; + return b; + } + else + { + if (b == ESCAPE) + return ESCAPED_0_CONT; + --bufpos; + escaped = false; + return ESCAPED_0_DONE; + } + } + + protected abstract byte get(int index); + + protected abstract int limit(); + } + + static class AccessorEscaper extends AbstractEscaper + { + private final V data; + private final ValueAccessor accessor; + + private AccessorEscaper(ValueAccessor accessor, V data, Version version) + { + super(0, version); + this.accessor = accessor; + this.data = data; + } + + protected int limit() + { + return accessor.size(data); + } + + protected byte get(int index) + { + return accessor.getByte(data, index); + } + } + + static class BufferEscaper extends AbstractEscaper + { + private final ByteBuffer buf; + + private BufferEscaper(ByteBuffer buf, Version version) + { + super(buf.position(), version); + this.buf = buf; + } + + protected int limit() + { + return buf.limit(); + } + + protected byte get(int index) + { + return buf.get(index); + } + } + + static class ArrayEscaper extends AbstractEscaper + { + private final byte[] buf; + + private ArrayEscaper(byte[] buf, Version version) + { + super(0, version); + this.buf = buf; + } + + @Override + protected byte get(int index) + { + return buf[index]; + } + + @Override + protected int limit() + { + return buf.length; + } + } + + static class MemoryEscaper extends AbstractEscaper + { + private final long address; + private final int length; + + MemoryEscaper(long address, int length, ByteComparable.Version version) + { + super(0, version); + this.address = address; + this.length = length; + } + + protected byte get(int index) + { + return MemoryUtil.getByte(address + index); + } + + protected int limit() + { + return length; + } + } + + /** + * Fixed length signed number encoding. Inverts first bit (so that neg < pos), then just posts all bytes from the + * buffer. Assumes buffer is of correct length. + */ + static class SignedFixedLengthNumber implements ByteSource + { + private final ValueAccessor accessor; + private final V data; + private int bufpos; + + public SignedFixedLengthNumber(ValueAccessor accessor, V data) + { + this.accessor = accessor; + this.data = data; + this.bufpos = 0; + } + + @Override + public int next() + { + if (bufpos >= accessor.size(data)) + return END_OF_STREAM; + int v = accessor.getByte(data, bufpos) & 0xFF; + if (bufpos == 0) + v ^= 0x80; + ++bufpos; + return v; + } + } + + /** + * Variable-length encoding for unsigned integers. + * The encoding is similar to UTF-8 encoding. + * Numbers between 0 and 127 are encoded in one byte, using 0 in the most significant bit. + * Larger values have 1s in as many of the most significant bits as the number of additional bytes + * in the representation, followed by a 0. This ensures that longer numbers compare larger than shorter + * ones. 
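+     * For instance (an added example following the rule above): 127 fits in 7 bits and encodes as the
+     * single byte 7F, while 128 needs two bytes and encodes as 8080, which compares higher byte-by-byte.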
Since we never use a longer representation than necessary, this implies numbers compare correctly.
+     * As the number of bytes is specified in the bits of the first, no value is a prefix of another.
+     */
+    static class VariableLengthUnsignedInteger implements ByteSource
+    {
+        private final long value;
+        private int pos = -1;
+
+        public VariableLengthUnsignedInteger(long value)
+        {
+            this.value = value;
+        }
+
+        @Override
+        public int next()
+        {
+            if (pos == -1)
+            {
+                int bitsMinusOne = 63 - (Long.numberOfLeadingZeros(value | 1)); // 0 to 63 (the | 1 is to make sure 0 maps to 0 (1 bit))
+                int bytesMinusOne = bitsMinusOne / 7;
+                int mask = -256 >> bytesMinusOne; // sequence of bytesMinusOne 1s in the most-significant bits
+                pos = bytesMinusOne * 8;
+                return (int) ((value >>> pos) | mask) & 0xFF;
+            }
+            pos -= 8;
+            if (pos < 0)
+                return END_OF_STREAM;
+            return (int) (value >>> pos) & 0xFF;
+        }
+    }
+
+    /**
+     * Variable-length encoding for signed integers.
+     * The encoding is based on the unsigned encoding above, where the first bit stored is the inverted sign,
+     * followed by as many matching bits as there are additional bytes in the encoding, followed by the two's
+     * complement of the number.
+     * Because of the inverted sign bit, negative numbers compare smaller than positives, and because the length
+     * bits match the sign, longer positive numbers compare greater and longer negative ones compare smaller.
+     *
+     * Examples:
+     *      0 encodes as 80
+     *      1 encodes as 81
+     *     -1 encodes as 7F
+     *     63 encodes as BF
+     *     64 encodes as C040
+     *    -64 encodes as 40
+     *    -65 encodes as 3FBF
+     *     2^20-1 encodes as EFFFFF
+     *     2^20 encodes as F0100000
+     *    -2^20 encodes as 100000
+     *     2^63-1 encodes as FFFFFFFFFFFFFFFFFF
+     *    -2^63 encodes as 000000000000000000
+     *
+     * As the number of bytes is specified in bits 2-9, no value is a prefix of another.
+     */
+    static class VariableLengthInteger implements ByteSource
+    {
+        private final long value;
+        private int pos;
+
+        public VariableLengthInteger(long value)
+        {
+            long negativeMask = value >> 63;    // -1 for negative, 0 for positive
+            value ^= negativeMask;
+
+            int bits = 64 - Long.numberOfLeadingZeros(value | 1);  // 1 to 63 (can't be 64 because we flip negative numbers)
+            int bytes = bits / 7 + 1;   // 0-6 bits: 1 byte, 7-13 bits: 2 bytes, ... 56-63 bits: 9 bytes
+            if (bytes >= 9)
+            {
+                value |= 0x8000000000000000L;   // the 8th length bit, which doesn't fit in the first byte
+                pos = negativeMask < 0 ? 256 : -1;  // an integer outside the 0-64 range whose & 0xFF is 0x00 for negative and 0xFF for positive
+            }
+            else
+            {
+                long mask = (-0x100 >> bytes) & 0xFF;   // a one in the sign bit plus as many more as there are extra bytes
+                pos = bytes * 8;
+                value = value | (mask << (pos - 8));
+            }
+
+            value ^= negativeMask;
+            this.value = value;
+        }
+
+        @Override
+        public int next()
+        {
+            if (pos <= 0 || pos > 64)
+            {
+                if (pos == 0)
+                    return END_OF_STREAM;
+                else
+                {
+                    // 8-byte value, returning first byte
+                    int result = pos & 0xFF;    // 0x00 for negative numbers, 0xFF for positive
+                    pos = 64;
+                    return result;
+                }
+            }
+            pos -= 8;
+            return (int) (value >>> pos) & 0xFF;
+        }
+    }
+
+    static class Number implements ByteSource
+    {
+        private final long value;
+        private int pos;
+
+        public Number(long value, int length)
+        {
+            this.value = value;
+            this.pos = length;
+        }
+
+        @Override
+        public int next()
+        {
+            if (pos == 0)
+                return END_OF_STREAM;
+            return (int) ((value >> (--pos * 8)) & 0xFF);
+        }
+    }
+
+    /**
+     * Fixed length signed floating point number encoding. First bit is sign.
If positive, add sign bit value to make + * greater than all negatives. If not, invert all content to make negatives with bigger magnitude smaller. + */ + static class SignedFixedLengthFloat implements ByteSource + { + private final ValueAccessor accessor; + private final V data; + private int bufpos; + private boolean invert; + + public SignedFixedLengthFloat(ValueAccessor accessor, V data) + { + this.accessor = accessor; + this.data = data; + this.bufpos = 0; + } + + @Override + public int next() + { + if (bufpos >= accessor.size(data)) + return END_OF_STREAM; + int v = accessor.getByte(data, bufpos) & 0xFF; + if (bufpos == 0) + { + invert = v >= 0x80; + v |= 0x80; + } + if (invert) + v = v ^ 0xFF; + ++bufpos; + return v; + } + } + + /** + * Combination of multiple byte sources. Adds {@link NEXT_COMPONENT} before sources, or {@link NEXT_COMPONENT_NULL} if next is null. + */ + static class Multi implements ByteSource + { + private final ByteSource[] srcs; + private int srcnum = -1; + private final int sequenceTerminator; + + Multi(ByteSource[] srcs, int sequenceTerminator) + { + this.srcs = srcs; + this.sequenceTerminator = sequenceTerminator; + } + + @Override + public int next() + { + if (srcnum == srcs.length) + return END_OF_STREAM; + + int b = END_OF_STREAM; + if (srcnum >= 0 && srcs[srcnum] != null) + b = srcs[srcnum].next(); + if (b > END_OF_STREAM) + return b; + + ++srcnum; + if (srcnum == srcs.length) + return sequenceTerminator; + if (srcs[srcnum] == null) + return NEXT_COMPONENT_NULL; + return NEXT_COMPONENT; + } + } + + /** + * Construct the shortest common prefix of prevMax and currMin that separates those two byte streams. + * If {@code useCurr == true} the last byte of the returned stream comes from {@code currMin} and is the first + * byte which is greater than byte on the corresponding position of {@code prevMax}. + * Otherwise, the last byte of the returned stream comes from {@code prevMax} and is incremented by one, still + * guaranteeing that it is <= than the byte on the corresponding position of {@code currMin}. + */ + static class Separator implements ByteSource + { + private final ByteSource prev; + private final ByteSource curr; + private boolean done = false; + private final boolean useCurr; + + Separator(ByteSource prevMax, ByteSource currMin, boolean useCurr) + { + this.prev = prevMax; + this.curr = currMin; + this.useCurr = useCurr; + } + + @Override + public int next() + { + if (done) + return END_OF_STREAM; + int p = prev.next(); + int c = curr.next(); + assert p <= c : prev + " not less than " + curr; + if (p == c) + return c; + done = true; + return useCurr ? c : p + 1; + } + } + + static ByteSource optionalFixedLength(ValueAccessor accessor, V data) + { + return !accessor.isEmpty(data) ? fixedLength(accessor, data) : null; + } + + /** + * A byte source of the given bytes without any encoding. + * The resulting source is only guaranteed to give correct comparison results and be prefix-free if the + * underlying type has a fixed length. + * In tests, this method is also used to generate non-escaped test cases. + */ + public static ByteSource fixedLength(ValueAccessor accessor, V data) + { + return new ByteSource() + { + int pos = -1; + + @Override + public int next() + { + return ++pos < accessor.size(data) ? accessor.getByte(data, pos) & 0xFF : END_OF_STREAM; + } + }; + } + + /** + * A byte source of the given bytes without any encoding. 
+ * The resulting source is only guaranteed to give correct comparison results and be prefix-free if the + * underlying type has a fixed length. + * In tests, this method is also used to generate non-escaped test cases. + */ + public static ByteSource fixedLength(ByteBuffer b) + { + return new ByteSource() + { + int pos = b.position() - 1; + + @Override + public int next() + { + return ++pos < b.limit() ? b.get(pos) & 0xFF : END_OF_STREAM; + } + }; + } + + /** + * A byte source of the given bytes without any encoding. + * If used in a sequence, the resulting source is only guaranteed to give correct comparison results if the + * underlying type has a fixed length. + * In tests, this method is also used to generate non-escaped test cases. + */ + public static ByteSource fixedLength(byte[] b) + { + return fixedLength(b, 0, b.length); + } + + public static ByteSource fixedLength(byte[] b, int offset, int length) + { + checkArgument(offset >= 0 && offset <= b.length); + checkArgument(length >= 0 && offset + length <= b.length); + + return new ByteSource() + { + int pos = offset - 1; + + @Override + public int next() + { + return ++pos < offset + length ? b[pos] & 0xFF : END_OF_STREAM; + } + }; + } + + public class Peekable implements ByteSource + { + private static final int NONE = Integer.MIN_VALUE; + + private final ByteSource wrapped; + private int peeked = NONE; + + public Peekable(ByteSource wrapped) + { + this.wrapped = wrapped; + } + + @Override + public int next() + { + if (peeked != NONE) + { + int val = peeked; + peeked = NONE; + return val; + } + else + return wrapped.next(); + } + + public int peek() + { + if (peeked == NONE) + peeked = wrapped.next(); + return peeked; + } + } + + public static Peekable peekable(ByteSource p) + { + // When given a null source, we're better off not wrapping it and just returning null. This way existing + // code that doesn't know about ByteSource.Peekable, but handles correctly null ByteSources won't be thrown + // off by a non-null instance that semantically should have been null. + if (p == null) + return null; + return (p instanceof Peekable) + ? (Peekable) p + : new Peekable(p); + } +} diff --git a/src/java/org/apache/cassandra/utils/bytecomparable/ByteSourceInverse.java b/src/java/org/apache/cassandra/utils/bytecomparable/ByteSourceInverse.java new file mode 100644 index 000000000000..16b66798c234 --- /dev/null +++ b/src/java/org/apache/cassandra/utils/bytecomparable/ByteSourceInverse.java @@ -0,0 +1,471 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.cassandra.utils.bytecomparable; + +import java.nio.charset.StandardCharsets; +import java.util.Arrays; + +import com.google.common.base.Preconditions; + +import org.apache.cassandra.db.marshal.ValueAccessor; + +/** + * Contains inverse transformation utilities for {@link ByteSource}s. + * + * See ByteComparable.md for details about the encoding scheme. + */ +public final class ByteSourceInverse +{ + private static final int INITIAL_BUFFER_CAPACITY = 32; + private static final int BYTE_ALL_BITS = 0xFF; + private static final int BYTE_NO_BITS = 0x00; + private static final int BYTE_SIGN_BIT = 1 << 7; + private static final int SHORT_SIGN_BIT = 1 << 15; + private static final int INT_SIGN_BIT = 1 << 31; + private static final long LONG_SIGN_BIT = 1L << 63; + + /** + * Consume the given number of bytes and produce a long from them, effectively treating the bytes as a big-endian + * unsigned encoding of the number. + */ + public static long getUnsignedFixedLengthAsLong(ByteSource byteSource, int length) + { + Preconditions.checkNotNull(byteSource); + Preconditions.checkArgument(length >= 1 && length <= 8, "Between 1 and 8 bytes can be read at a time"); + + long result = 0; + for (int i = 0; i < length; ++i) + result = (result << 8) | getAndCheckByte(byteSource, i, length); // note: this must use the unsigned byte value + + return result; + } + + /** + * Produce the bytes for an encoded signed fixed-length number. + * The first byte has its sign bit inverted, and the rest are passed unchanged. + */ + public static V getSignedFixedLength(ValueAccessor accessor, ByteSource byteSource, int length) + { + Preconditions.checkNotNull(byteSource); + Preconditions.checkArgument(length >= 1, "At least 1 byte should be read"); + + V result = accessor.allocate(length); + // The first byte needs to have its sign flipped + accessor.putByte(result, 0, (byte) (getAndCheckByte(byteSource, 0, length) ^ BYTE_SIGN_BIT)); + // and the rest can be retrieved unchanged. + for (int i = 1; i < length; ++i) + accessor.putByte(result, i, (byte) getAndCheckByte(byteSource, i, length)); + return result; + } + + /** + * Produce the bytes for an encoded signed fixed-length number, also translating null to empty buffer. + * The first byte has its sign bit inverted, and the rest are passed unchanged. + */ + public static V getOptionalSignedFixedLength(ValueAccessor accessor, ByteSource byteSource, int length) + { + return byteSource == null ? accessor.empty() : getSignedFixedLength(accessor, byteSource, length); + } + + /** + * Produce the bytes for an encoded signed fixed-length floating-point number. + * If sign bit is on, returns negated bytes. If not, clears the sign bit and passes the rest of the bytes unchanged. + */ + public static V getSignedFixedLengthFloat(ValueAccessor accessor, ByteSource byteSource, int length) + { + Preconditions.checkNotNull(byteSource); + Preconditions.checkArgument(length >= 1, "At least 1 byte should be read"); + + V result = accessor.allocate(length); + + int xor; + int first = getAndCheckByte(byteSource, 0, length); + if (first < 0x80) + { + // Negative number. Invert all bits. + xor = BYTE_ALL_BITS; + first ^= xor; + } + else + { + // Positive number. Invert only the sign bit. + xor = BYTE_NO_BITS; + first ^= BYTE_SIGN_BIT; + } + accessor.putByte(result, 0, (byte) first); + + // xor is now applied to the rest of the bytes to flip their bits if necessary. 
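+        // For example, -2.0f has raw bits C0000000 and was encoded as 3FFFFFFF; xor == 0xFF flips the
+        // remaining bytes back to 00, while for positive values xor == 0x00 leaves them unchanged.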
+        for (int i = 1; i < length; ++i)
+            accessor.putByte(result, i, (byte) (getAndCheckByte(byteSource, i, length) ^ xor));
+
+        return result;
+    }
+
+    /**
+     * Produce the bytes for an encoded signed fixed-length floating-point number, also translating null to an empty
+     * buffer.
+     * If sign bit is on, returns negated bytes. If not, clears the sign bit and passes the rest of the bytes unchanged.
+     */
+    public static <V> V getOptionalSignedFixedLengthFloat(ValueAccessor<V> accessor, ByteSource byteSource, int length)
+    {
+        return byteSource == null ? accessor.empty() : getSignedFixedLengthFloat(accessor, byteSource, length);
+    }
+
+    /**
+     * Consume the next length bytes from the source unchanged.
+     */
+    public static <V> V getFixedLength(ValueAccessor<V> accessor, ByteSource byteSource, int length)
+    {
+        Preconditions.checkNotNull(byteSource);
+        Preconditions.checkArgument(length >= 1, "At least 1 byte should be read");
+
+        V result = accessor.allocate(length);
+        for (int i = 0; i < length; ++i)
+            accessor.putByte(result, i, (byte) getAndCheckByte(byteSource, i, length));
+        return result;
+    }
+
+    /**
+     * Consume the next length bytes from the source unchanged, also translating null to an empty buffer.
+     */
+    public static <V> V getOptionalFixedLength(ValueAccessor<V> accessor, ByteSource byteSource, int length)
+    {
+        return byteSource == null ? accessor.empty() : getFixedLength(accessor, byteSource, length);
+    }
+
+    /**
+     * Consume the next {@code int} from the current position of the given {@link ByteSource}. The source position is
+     * modified accordingly (moved 4 bytes forward).
+     * <p>
+     * The source is not strictly required to represent just the encoding of an {@code int} value, so theoretically
+     * this API could be used for reading data in 4-byte strides. Nevertheless its usage is fairly limited because:
+     * <ol>
+     * <li>...it presupposes signed fixed-length encoding for the encoding of the original value</li>
+     * <li>...it decodes the data returned on each stride as an {@code int} (i.e. it inverts its leading bit)</li>
+     * <li>...it doesn't provide any meaningful guarantees (with regard to throwing) in case there are not enough
+     * bytes to read, in case a special escape value was not interpreted as such, etc.</li>
+     * </ol>
+     *
+     * @param byteSource A non-null byte source, containing at least 4 bytes.
+     */
+    public static int getSignedInt(ByteSource byteSource)
+    {
+        return (int) getUnsignedFixedLengthAsLong(byteSource, 4) ^ INT_SIGN_BIT;
+    }
+
+    /**
+     * Consume the next {@code long} from the current position of the given {@link ByteSource}. The source position is
+     * modified accordingly (moved 8 bytes forward).
+     * <p>
+     * The source is not strictly required to represent just the encoding of a {@code long} value, so theoretically
+     * this API could be used for reading data in 8-byte strides. Nevertheless its usage is fairly limited because:
+     * <ol>
+     * <li>...it presupposes signed fixed-length encoding for the encoding of the original value</li>
+     * <li>...it decodes the data returned on each stride as a {@code long} (i.e. it inverts its leading bit)</li>
+     * <li>...it doesn't provide any meaningful guarantees (with regard to throwing) in case there are not enough
+     * bytes to read, in case a special escape value was not interpreted as such, etc.</li>
+     * </ol>
    + * + * @param byteSource A non-null byte source, containing at least 8 bytes. + */ + public static long getSignedLong(ByteSource byteSource) + { + return getUnsignedFixedLengthAsLong(byteSource, 8) ^ LONG_SIGN_BIT; + } + + /** + * Converts the given {@link ByteSource} to a {@code byte}. + * + * @param byteSource A non-null byte source, containing at least 1 byte. + */ + public static byte getSignedByte(ByteSource byteSource) + { + return (byte) (getAndCheckByte(Preconditions.checkNotNull(byteSource), 0, 1) ^ BYTE_SIGN_BIT); + } + + /** + * Converts the given {@link ByteSource} to a {@code short}. All terms and conditions valid for + * {@link #getSignedInt(ByteSource)} and {@link #getSignedLong(ByteSource)} translate to this as well. + * + * @param byteSource A non-null byte source, containing at least 2 bytes. + * + * @see #getSignedInt(ByteSource) + * @see #getSignedLong(ByteSource) + */ + public static short getSignedShort(ByteSource byteSource) + { + return (short) (getUnsignedFixedLengthAsLong(byteSource, 2) ^ SHORT_SIGN_BIT); + } + + /** + * Decode a variable-length signed integer. + */ + public static long getVariableLengthInteger(ByteSource byteSource) + { + int signAndMask = getAndCheckByte(byteSource); + + long sum = 0; + int bytes; + // For every bit after the sign that matches the sign, read one more byte. + for (bytes = 0; bytes < 7 && sameByteSign(signAndMask << (bytes + 1), signAndMask); ++bytes) + sum = (sum << 8) | getAndCheckByte(byteSource); + + // The eighth length bit is stored in the second byte. + if (bytes == 7 && sameByteSign((int) (sum >> 48), signAndMask)) + return ((sum << 8) | getAndCheckByte(byteSource)) ^ LONG_SIGN_BIT; // 9-byte encoding, use bytes 2-9 with inverted sign + else + { + sum |= (((long) signAndMask) << bytes * 8); // add the rest of the bits + long signMask = -0x40L << bytes * 7; // mask of the bits that should be replaced by the sign + long sign = (byte) (signAndMask ^ 0x80) >> 7; // -1 if negative (0 leading bit), 0 otherwise + return sum & ~signMask | sign & signMask; + } + } + + /** + * Decode a variable-length unsigned integer, passing all bytes read through XOR with the given xorWith parameter. + * + * Used in BigInteger encoding to read number length, where negative numbers have their length negated + * (i.e. xorWith = 0xFF) to ensure correct ordering. + */ + public static long getVariableLengthUnsignedIntegerXoring(ByteSource byteSource, int xorWith) + { + int signAndMask = getAndCheckByte(byteSource) ^ xorWith; + + long sum = 0; + int bytes; + // Read an extra byte while the next most significant bit is 1. + for (bytes = 0; bytes <= 7 && ((signAndMask << bytes) & 0x80) != 0; ++bytes) + sum = (sum << 8) | getAndCheckByte(byteSource) ^ xorWith; + + // Strip the length bits from the leading byte. + signAndMask &= ~(-256 >> bytes); + return sum | (((long) signAndMask) << bytes * 8); // Add the rest of the bits of the leading byte. + } + + /** Returns true if the two parameters treated as bytes have the same sign. */ + private static boolean sameByteSign(int a, int b) + { + return ((a ^ b) & 0x80) == 0; + } + + + private static int getAndCheckByte(ByteSource byteSource) + { + return getAndCheckByte(byteSource, -1, -1); + } + + private static int getAndCheckByte(ByteSource byteSource, int pos, int length) + { + int data = byteSource.next(); + if (data == ByteSource.END_OF_STREAM) + throw new IllegalArgumentException( + length > 0 ? 
String.format("Unexpected end of stream reached after %d bytes (expected >= %d)", pos, length)
+                               : "Unexpected end of stream");
+        assert data >= BYTE_NO_BITS && data <= BYTE_ALL_BITS
+            : "A ByteSource must produce unsigned bytes and end in END_OF_STREAM";
+        return data;
+    }
+
+    /**
+     * Reads a single variable-length byte sequence (blob, string, ...) encoded according to the scheme described
+     * in ByteComparable.md, decoding it back to its original, unescaped form.
+     *
+     * @param byteSource The source of the variable-length bytes sequence.
+     * @return A byte array containing the original, unescaped bytes of the given source. Unescaped here means
+     * not including any of the escape sequences of the encoding scheme used for variable-length byte sequences.
+     */
+    public static byte[] getUnescapedBytes(ByteSource.Peekable byteSource)
+    {
+        return byteSource == null ? null : readBytes(unescape(byteSource));
+    }
+
+    /**
+     * As above, but converts the result to a ByteSource.
+     */
+    public static ByteSource unescape(ByteSource.Peekable byteSource)
+    {
+        return new ByteSource() {
+            boolean escaped = false;
+
+            @Override
+            public int next()
+            {
+                if (!escaped)
+                {
+                    int data = byteSource.next(); // we consume this byte no matter what it is
+                    if (data > ByteSource.ESCAPE)
+                        return data; // most used path leads here
+
+                    assert data != ByteSource.END_OF_STREAM : "Invalid escaped byte sequence";
+                    escaped = true;
+                }
+
+                int next = byteSource.peek();
+                switch (next)
+                {
+                    case END_OF_STREAM:
+                        // The end of a byte-comparable outside of a multi-component sequence. No matter what we have
+                        // seen or peeked before, we should stop now.
+                        byteSource.next();
+                        return END_OF_STREAM;
+                    case ESCAPED_0_DONE:
+                        // The end of 1 or more consecutive 0x00 value bytes.
+                        escaped = false;
+                        byteSource.next();
+                        return ESCAPE;
+                    case ESCAPED_0_CONT:
+                        // The escaped sequence continues.
+                        byteSource.next();
+                        return ESCAPE;
+                    default:
+                        // An ESCAPE or ESCAPED_0_CONT won't be followed by either another ESCAPED_0_CONT, an
+                        // ESCAPED_0_DONE, or an END_OF_STREAM only when the byte-comparable is part of a multi-component
+                        // sequence and we have reached the end of the encoded byte-comparable. In this case, the byte
+                        // we have just peeked is the separator or terminator byte between or at the end of components
+                        // (which by contract must be between MIN_SEPARATOR and MAX_SEPARATOR, i.e. 0x10 - 0xEF, so it
+                        // cannot conflict with our special bytes).
+                        assert next >= ByteSource.MIN_SEPARATOR && next <= ByteSource.MAX_SEPARATOR : next;
+                        // Unlike above, we don't consume this byte (the sequence decoding needs it).
+                        return END_OF_STREAM;
+                }
+            }
+        };
+    }
+
+    /**
+     * Reads the bytes of the given source into a byte array. Doesn't do any transformation on the bytes, just reads
+     * them until it reads an {@link ByteSource#END_OF_STREAM} byte, after which it returns an array of all the read
+     * bytes, excluding the {@link ByteSource#END_OF_STREAM}.
+     * <p>
+     * This method sizes a tentative internal buffer array at {@code initialBufferCapacity}. However, if
+     * {@code byteSource} exceeds this size, the buffer array is recreated with doubled capacity as many times as
+     * necessary. If, after {@code byteSource} is fully exhausted, the number of bytes read from it does not exactly
+     * match the current size of the tentative buffer array, then it is copied into another array sized to fit the
+     * number of bytes read; otherwise, it is returned without that final copy step.
+     *
+     * @param byteSource The source whose bytes we're interested in.
+     * @param initialBufferCapacity The initial size of the internal buffer.
+     * @return A byte array containing exactly all the read bytes. In case of a {@code null} source, the returned byte
+     * array will be empty.
+     */
+    public static byte[] readBytes(ByteSource byteSource, final int initialBufferCapacity)
+    {
+        Preconditions.checkNotNull(byteSource);
+
+        int readBytes = 0;
+        byte[] buf = new byte[initialBufferCapacity];
+        int data;
+        while ((data = byteSource.next()) != ByteSource.END_OF_STREAM)
+        {
+            buf = ensureCapacity(buf, readBytes);
+            buf[readBytes++] = (byte) data;
+        }
+
+        if (readBytes != buf.length)
+        {
+            buf = Arrays.copyOf(buf, readBytes);
+        }
+        return buf;
+    }
+
+    /**
+     * Reads the bytes of the given source into a byte array. Doesn't do any transformation on the bytes, just reads
+     * them until it reads an {@link ByteSource#END_OF_STREAM} byte, after which it returns an array of all the read
+     * bytes, excluding the {@link ByteSource#END_OF_STREAM}.
+     * <p>
    + * This is equivalent to {@link #readBytes(ByteSource, int)} where the second actual parameter is + * {@linkplain #INITIAL_BUFFER_CAPACITY} ({@value INITIAL_BUFFER_CAPACITY}). + * + * @param byteSource The source which bytes we're interested in. + * @return A byte array containing exactly all the read bytes. In case of a {@code null} source, the returned byte + * array will be empty. + */ + public static byte[] readBytes(ByteSource byteSource) + { + return readBytes(byteSource, INITIAL_BUFFER_CAPACITY); + } + + /** + * Ensures the given buffer has capacity for taking data with the given length - if it doesn't, it returns a copy + * of the buffer, but with double the capacity. + */ + private static byte[] ensureCapacity(byte[] buf, int dataLengthInBytes) + { + if (dataLengthInBytes == buf.length) + // We won't gain much with guarding against overflow. We'll overflow when dataLengthInBytes >= 1 << 30, + // and if we do guard, we'll be able to extend the capacity to Integer.MAX_VALUE (which is 1 << 31 - 1). + // Controlling the exception that will be thrown shouldn't matter that much, and in practice, we almost + // surely won't be reading gigabytes of ByteSource data at once. + return Arrays.copyOf(buf, dataLengthInBytes * 2); + else + return buf; + } + + /** + * Converts the given {@link ByteSource} to a UTF-8 {@link String}. + * + * @param byteSource The source we're interested in. + * @return A UTF-8 string corresponding to the given source. + */ + public static String getString(ByteSource.Peekable byteSource) + { + if (byteSource == null) + return null; + + byte[] data = getUnescapedBytes(byteSource); + + return new String(data, StandardCharsets.UTF_8); + } + + /* + * Multi-component sequence utilities. + */ + + /** + * A utility for consuming components from a peekable multi-component sequence. + * It uses the component separators, so the given sequence needs to have its last component fully consumed, in + * order for the next consumable byte to be a separator. Identifying the end of the component that will then be + * consumed is the responsibility of the consumer (the user of this method). + * @param source A peekable multi-component sequence, which next byte is a component separator. + * @return the given multi-component sequence if its next component is not null, or {@code null} if it is. + */ + public static ByteSource.Peekable nextComponentSource(ByteSource.Peekable source) + { + return nextComponentSource(source, source.next()); + } + + /** + * A utility for consuming components from a peekable multi-component sequence, very similar to + * {@link #nextComponentSource(ByteSource.Peekable)} - the difference being that here the separator can be passed + * in case it had to be consumed beforehand. + */ + public static ByteSource.Peekable nextComponentSource(ByteSource.Peekable source, int separator) + { + return nextComponentNull(separator) + ? 
null + : source; + } + + public static boolean nextComponentNull(int separator) + { + return separator == ByteSource.NEXT_COMPONENT_NULL || separator == ByteSource.NEXT_COMPONENT_EMPTY + || separator == ByteSource.NEXT_COMPONENT_EMPTY_REVERSED; + } +} diff --git a/test/microbench/org/apache/cassandra/test/microbench/AbstractTypeByteSourceDecodingBench.java b/test/microbench/org/apache/cassandra/test/microbench/AbstractTypeByteSourceDecodingBench.java new file mode 100644 index 000000000000..427265ec9eab --- /dev/null +++ b/test/microbench/org/apache/cassandra/test/microbench/AbstractTypeByteSourceDecodingBench.java @@ -0,0 +1,140 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.test.microbench; + +import java.math.BigDecimal; +import java.math.BigInteger; +import java.nio.ByteBuffer; +import java.nio.charset.StandardCharsets; +import java.util.HashMap; +import java.util.Map; +import java.util.Random; +import java.util.concurrent.TimeUnit; +import java.util.function.BiFunction; + +import net.nicoulaj.compilecommand.annotations.Inline; +import org.apache.cassandra.db.marshal.AbstractType; +import org.apache.cassandra.db.marshal.BytesType; +import org.apache.cassandra.db.marshal.DecimalType; +import org.apache.cassandra.db.marshal.IntegerType; +import org.apache.cassandra.db.marshal.TypeParser; +import org.apache.cassandra.db.marshal.UTF8Type; +import org.apache.cassandra.utils.bytecomparable.ByteComparable; +import org.apache.cassandra.utils.bytecomparable.ByteSource; +import org.openjdk.jmh.annotations.Benchmark; +import org.openjdk.jmh.annotations.BenchmarkMode; +import org.openjdk.jmh.annotations.Fork; +import org.openjdk.jmh.annotations.Level; +import org.openjdk.jmh.annotations.Measurement; +import org.openjdk.jmh.annotations.Mode; +import org.openjdk.jmh.annotations.OutputTimeUnit; +import org.openjdk.jmh.annotations.Param; +import org.openjdk.jmh.annotations.Scope; +import org.openjdk.jmh.annotations.Setup; +import org.openjdk.jmh.annotations.State; +import org.openjdk.jmh.annotations.Threads; +import org.openjdk.jmh.annotations.Warmup; + +@BenchmarkMode(Mode.AverageTime) +@OutputTimeUnit(TimeUnit.MICROSECONDS) +@Warmup(iterations = 5, time = 1) +@Measurement(iterations = 5, time = 2) +@Fork(value = 1,jvmArgsAppend = { "-Xmx4G", "-Xms4G", "-Djmh.executor=CUSTOM", "-Djmh.executor.class=org.apache.cassandra.test.microbench.FastThreadExecutor"}) +@Threads(1) +@State(Scope.Benchmark) +public class AbstractTypeByteSourceDecodingBench +{ + + private static final ByteComparable.Version LATEST = ByteComparable.Version.OSS42; + + private static final Map> PEEKABLE_GENERATOR_BY_TYPE = new HashMap<>(); + static + { + PEEKABLE_GENERATOR_BY_TYPE.put(UTF8Type.instance, (prng, length) -> + { + byte[] randomBytes 
= new byte[length]; + prng.nextBytes(randomBytes); + return ByteSource.peekable(ByteSource.of(new String(randomBytes, StandardCharsets.UTF_8), LATEST)); + }); + PEEKABLE_GENERATOR_BY_TYPE.put(BytesType.instance, (prng, length) -> + { + byte[] randomBytes = new byte[length]; + prng.nextBytes(randomBytes); + return ByteSource.peekable(ByteSource.of(randomBytes, LATEST)); + }); + PEEKABLE_GENERATOR_BY_TYPE.put(IntegerType.instance, (prng, length) -> + { + BigInteger randomVarint = BigInteger.valueOf(prng.nextLong()); + for (int i = 1; i < length / 8; ++i) + randomVarint = randomVarint.multiply(BigInteger.valueOf(prng.nextLong())); + return ByteSource.peekable(IntegerType.instance.asComparableBytes(IntegerType.instance.decompose(randomVarint), LATEST)); + }); + PEEKABLE_GENERATOR_BY_TYPE.put(DecimalType.instance, (prng, length) -> + { + BigInteger randomMantissa = BigInteger.valueOf(prng.nextLong()); + for (int i = 1; i < length / 8; ++i) + randomMantissa = randomMantissa.multiply(BigInteger.valueOf(prng.nextLong())); + int randomScale = prng.nextInt(Integer.MAX_VALUE >> 1) + Integer.MAX_VALUE >> 1; + BigDecimal randomDecimal = new BigDecimal(randomMantissa, randomScale); + return ByteSource.peekable(DecimalType.instance.asComparableBytes(DecimalType.instance.decompose(randomDecimal), LATEST)); + }); + } + + private Random prng = new Random(); + + @Param({"32", "128", "512"}) + private int length; + + @Param({"UTF8Type", "BytesType", "IntegerType", "DecimalType"}) + private String abstractTypeName; + + private AbstractType abstractType; + private BiFunction peekableGenerator; + + @Setup(Level.Trial) + public void setup() + { + abstractType = TypeParser.parse(abstractTypeName); + peekableGenerator = PEEKABLE_GENERATOR_BY_TYPE.get(abstractType); + } + + @Inline + private ByteSource.Peekable randomPeekableBytes() + { + return peekableGenerator.apply(prng, length); + } + + @Benchmark + public int baseline() + { + // Getting the source is not enough as its content is produced on next() calls. + ByteSource.Peekable source = randomPeekableBytes(); + int count = 0; + while (source.next() != ByteSource.END_OF_STREAM) + ++count; + return count; + } + + @Benchmark + public ByteBuffer fromComparableBytes() + { + ByteSource.Peekable peekableBytes = randomPeekableBytes(); + return abstractType.fromComparableBytes(peekableBytes, ByteComparable.Version.OSS42); + } +} diff --git a/test/unit/org/apache/cassandra/Util.java b/test/unit/org/apache/cassandra/Util.java index 0459cb3b3bed..b0b59641174e 100644 --- a/test/unit/org/apache/cassandra/Util.java +++ b/test/unit/org/apache/cassandra/Util.java @@ -23,6 +23,7 @@ import java.io.EOFException; import java.io.IOError; import java.io.IOException; +import java.math.BigInteger; import java.net.UnknownHostException; import java.nio.ByteBuffer; import java.nio.channels.FileChannel; @@ -859,6 +860,199 @@ public static void assertOnDiskState(ColumnFamilyStore cfs, int expectedSSTableC assertEquals(expectedSSTableCount, fileCount); } + public static ByteBuffer generateMurmurCollision(ByteBuffer original, byte... 
bytesToAdd) + { + // Round size up to 16, and add another 16 bytes + ByteBuffer collision = ByteBuffer.allocate((original.remaining() + bytesToAdd.length + 31) & -16); + collision.put(original); // we can use this as a copy of original with 0s appended at the end + + original.flip(); + + long c1 = 0x87c37b91114253d5L; + long c2 = 0x4cf5ad432745937fL; + + long h1 = 0; + long h2 = 0; + + // Get hash of original + int index = 0; + final int length = original.limit(); + while (index <= length - 16) + { + long k1 = Long.reverseBytes(collision.getLong(index + 0)); + long k2 = Long.reverseBytes(collision.getLong(index + 8)); + + // 16 bytes + k1 *= c1; + k1 = rotl64(k1, 31); + k1 *= c2; + h1 ^= k1; + h1 = rotl64(h1, 27); + h1 += h2; + h1 = h1 * 5 + 0x52dce729; + k2 *= c2; + k2 = rotl64(k2, 33); + k2 *= c1; + h2 ^= k2; + h2 = rotl64(h2, 31); + h2 += h1; + h2 = h2 * 5 + 0x38495ab5; + + index += 16; + } + + long oh1 = h1; + long oh2 = h2; + + // Process final unfilled chunk, but only adjust the original hash value + if (index < length) + { + long k1 = Long.reverseBytes(collision.getLong(index + 0)); + long k2 = Long.reverseBytes(collision.getLong(index + 8)); + + // 16 bytes + k1 *= c1; + k1 = rotl64(k1, 31); + k1 *= c2; + oh1 ^= k1; + + k2 *= c2; + k2 = rotl64(k2, 33); + k2 *= c1; + oh2 ^= k2; + } + + // These are the hashes the original would provide, before final mixing + oh1 ^= original.capacity(); + oh2 ^= original.capacity(); + + // Fill in the remaining bytes before the last 16 and get their hash + collision.put(bytesToAdd); + while ((collision.position() & 0x0f) != 0) + collision.put((byte) 0); + + while (index < collision.position()) + { + long k1 = Long.reverseBytes(collision.getLong(index + 0)); + long k2 = Long.reverseBytes(collision.getLong(index + 8)); + + // 16 bytes + k1 *= c1; + k1 = rotl64(k1, 31); + k1 *= c2; + h1 ^= k1; + h1 = rotl64(h1, 27); + h1 += h2; + h1 = h1 * 5 + 0x52dce729; + k2 *= c2; + k2 = rotl64(k2, 33); + k2 *= c1; + h2 ^= k2; + h2 = rotl64(h2, 31); + h2 += h1; + h2 = h2 * 5 + 0x38495ab5; + + index += 16; + } + + // Working backwards, we must get this hash pair + long th1 = h1; + long th2 = h2; + + // adjust ohx with length + h1 = oh1 ^ collision.capacity(); + h2 = oh2 ^ collision.capacity(); + + // Get modulo-long inverses of the multipliers used in the computation + long i5i = inverse(5L); + long c1i = inverse(c1); + long c2i = inverse(c2); + + // revert one step + h2 -= 0x38495ab5; + h2 *= i5i; + h2 -= h1; + h2 = rotl64(h2, 33); + + h1 -= 0x52dce729; + h1 *= i5i; + h1 -= th2; // use h2 before it's adjusted with k2 + h1 = rotl64(h1, 37); + + // extract the required modifiers and applies the inverse of their transformation + long k1 = h1 ^ th1; + k1 = c2i * k1; + k1 = rotl64(k1, 33); + k1 = c1i * k1; + + long k2 = h2 ^ th2; + k2 = c1i * k2; + k2 = rotl64(k2, 31); + k2 = c2i * k2; + + collision.putLong(Long.reverseBytes(k1)); + collision.putLong(Long.reverseBytes(k2)); + collision.flip(); + + return collision; + } + + // Assumes a and b are positive + private static BigInteger[] xgcd(BigInteger a, BigInteger b) { + BigInteger x = a, y = b; + BigInteger[] qrem; + BigInteger[] result = new BigInteger[3]; + BigInteger x0 = BigInteger.ONE, x1 = BigInteger.ZERO; + BigInteger y0 = BigInteger.ZERO, y1 = BigInteger.ONE; + while (true) + { + qrem = x.divideAndRemainder(y); + x = qrem[1]; + x0 = x0.subtract(y0.multiply(qrem[0])); + x1 = x1.subtract(y1.multiply(qrem[0])); + if (x.equals(BigInteger.ZERO)) + { + result[0] = y; + result[1] = y0; + result[2] = y1; + return result; 
+ } + + qrem = y.divideAndRemainder(x); + y = qrem[1]; + y0 = y0.subtract(x0.multiply(qrem[0])); + y1 = y1.subtract(x1.multiply(qrem[0])); + if (y.equals(BigInteger.ZERO)) + { + result[0] = x; + result[1] = x0; + result[2] = x1; + return result; + } + } + } + + /** + * Find a mupltiplicative inverse for the given multiplier for long, i.e. + * such that x * inverse(x) = 1 where * is long multiplication. + * In other words, such an integer that x * inverse(x) == 1 (mod 2^64). + */ + public static long inverse(long multiplier) + { + final BigInteger modulus = BigInteger.ONE.shiftLeft(64); + // Add the modulus to the multiplier to avoid problems with negatives (a + m == a (mod m)) + BigInteger[] gcds = xgcd(BigInteger.valueOf(multiplier).add(modulus), modulus); + // xgcd gives g, a and b, such that ax + bm = g + // ie, ax = g (mod m). Return a + assert gcds[0].equals(BigInteger.ONE) : "Even number " + multiplier + " has no long inverse"; + return gcds[1].longValueExact(); + } + + public static long rotl64(long v, int n) + { + return ((v << n) | (v >>> (64 - n))); + } + /** * Disable bloom filter on all sstables of given table */ diff --git a/test/unit/org/apache/cassandra/cql3/validation/entities/TupleTypeTest.java b/test/unit/org/apache/cassandra/cql3/validation/entities/TupleTypeTest.java index 9f53db4966c3..f32565554239 100644 --- a/test/unit/org/apache/cassandra/cql3/validation/entities/TupleTypeTest.java +++ b/test/unit/org/apache/cassandra/cql3/validation/entities/TupleTypeTest.java @@ -33,6 +33,8 @@ import org.apache.cassandra.cql3.CQLTester; import org.apache.cassandra.cql3.UntypedResultSet; +import org.apache.cassandra.db.SchemaCQLHelper; +import org.apache.cassandra.db.marshal.ByteBufferAccessor; import org.apache.cassandra.db.marshal.TupleType; import org.apache.cassandra.utils.AbstractTypeGenerators.TypeSupport; import org.quicktheories.core.Gen; @@ -266,7 +268,7 @@ public void tuplePartitionReadWrite() for (ByteBuffer value : testcase.uniqueRows) { map.put(value, count); - ByteBuffer[] tupleBuffers = tupleType.split(value); + ByteBuffer[] tupleBuffers = tupleType.split(ByteBufferAccessor.instance, value); // use cast to avoid warning execute("INSERT INTO %s (id, value) VALUES (?, ?)", tuple((Object[]) tupleBuffers), count); @@ -304,7 +306,7 @@ private void tupleCkReadWrite(Order order) for (ByteBuffer value : testcase.uniqueRows) { map.put(value, count); - ByteBuffer[] tupleBuffers = tupleType.split(value); + ByteBuffer[] tupleBuffers = tupleType.split(ByteBufferAccessor.instance, value); // use cast to avoid warning execute("INSERT INTO %s (pk, ck, value) VALUES (?, ?, ?)", 1, tuple((Object[]) tupleBuffers), count); diff --git a/test/unit/org/apache/cassandra/cql3/validation/entities/UserTypesTest.java b/test/unit/org/apache/cassandra/cql3/validation/entities/UserTypesTest.java index 0b05e8f01343..1520b4cab98f 100644 --- a/test/unit/org/apache/cassandra/cql3/validation/entities/UserTypesTest.java +++ b/test/unit/org/apache/cassandra/cql3/validation/entities/UserTypesTest.java @@ -199,6 +199,56 @@ public void testAlterUDT() throws Throwable ); } + @Test + public void testNullsInIntUDT() throws Throwable + { + String myType = KEYSPACE + '.' 
+ createType("CREATE TYPE %s (a int)"); + createTable("CREATE TABLE %s (a int PRIMARY KEY, b frozen<" + myType + ">)"); + execute("INSERT INTO %s (a, b) VALUES (1, ?)", userType("a", 1)); + + assertRows(execute("SELECT b.a FROM %s"), row(1)); + + flush(); + + schemaChange("ALTER TYPE " + myType + " ADD b int"); + execute("INSERT INTO %s (a, b) VALUES (2, {a: 2, b: 2})"); + execute("INSERT INTO %s (a, b) VALUES (3, {b: 3})"); + execute("INSERT INTO %s (a, b) VALUES (4, {a: null, b: 4})"); + + beforeAndAfterFlush(() -> + assertRows(execute("SELECT b.a, b.b FROM %s"), + row(1, null), + row(2, 2), + row(null, 3), + row(null, 4)) + ); + } + + @Test + public void testNullsInTextUDT() throws Throwable + { + String myType = KEYSPACE + '.' + createType("CREATE TYPE %s (a text)"); + createTable("CREATE TABLE %s (a int PRIMARY KEY, b frozen<" + myType + ">)"); + execute("INSERT INTO %s (a, b) VALUES (1, {a: ''})"); + + assertRows(execute("SELECT b.a FROM %s"), row("")); + + flush(); + + schemaChange("ALTER TYPE " + myType + " ADD b text"); + execute("INSERT INTO %s (a, b) VALUES (2, {a: '', b: ''})"); + execute("INSERT INTO %s (a, b) VALUES (3, {b: ''})"); + execute("INSERT INTO %s (a, b) VALUES (4, {a: null, b: ''})"); + + beforeAndAfterFlush(() -> + assertRows(execute("SELECT b.a, b.b FROM %s"), + row("", null), + row("", ""), + row(null, ""), + row(null, "")) + ); + } + @Test public void testAlterNonFrozenUDT() throws Throwable { diff --git a/test/unit/org/apache/cassandra/db/marshal/DynamicCompositeTypeTest.java b/test/unit/org/apache/cassandra/db/marshal/DynamicCompositeTypeTest.java index d22a8ac933f9..69c1eb5cd808 100644 --- a/test/unit/org/apache/cassandra/db/marshal/DynamicCompositeTypeTest.java +++ b/test/unit/org/apache/cassandra/db/marshal/DynamicCompositeTypeTest.java @@ -47,7 +47,7 @@ public class DynamicCompositeTypeTest { private static final String KEYSPACE1 = "DynamicCompositeType"; private static final String CF_STANDARDDYNCOMPOSITE = "StandardDynamicComposite"; - private static Map> aliases = new HashMap<>(); + public static Map> aliases = new HashMap<>(); private static final DynamicCompositeType comparator; static @@ -60,7 +60,7 @@ public class DynamicCompositeTypeTest } private static final int UUID_COUNT = 3; - private static final UUID[] uuids = new UUID[UUID_COUNT]; + public static final UUID[] uuids = new UUID[UUID_COUNT]; static { for (int i = 0; i < UUID_COUNT; ++i) @@ -320,13 +320,12 @@ public void testCompatibility() throws Exception assert !TypeParser.parse("DynamicCompositeType(a => BytesType)").isCompatibleWith(TypeParser.parse("DynamicCompositeType(a => BytesType, b => AsciiType)")); } - private ByteBuffer createDynamicCompositeKey(String s, UUID uuid, int i, boolean lastIsOne) + private static ByteBuffer createDynamicCompositeKey(String s, UUID uuid, int i, boolean lastIsOne) { return createDynamicCompositeKey(s, uuid, i, lastIsOne, false); } - private ByteBuffer createDynamicCompositeKey(String s, UUID uuid, int i, boolean lastIsOne, - final boolean reversed) + public static ByteBuffer createDynamicCompositeKey(String s, UUID uuid, int i, boolean lastIsOne, boolean reversed) { String intType = (reversed ? 
"ReversedType(IntegerType)" : "IntegerType"); ByteBuffer bytes = ByteBufferUtil.bytes(s); diff --git a/test/unit/org/apache/cassandra/db/marshal/TypeValidationTest.java b/test/unit/org/apache/cassandra/db/marshal/TypeValidationTest.java index 4d25a1f62bbf..474b867007f1 100644 --- a/test/unit/org/apache/cassandra/db/marshal/TypeValidationTest.java +++ b/test/unit/org/apache/cassandra/db/marshal/TypeValidationTest.java @@ -204,7 +204,7 @@ private static void buildAndSplit(Gen baseGen) qt().forAll(tupleWithValueGen(baseGen)).checkAssert(pair -> { TupleType tuple = pair.left; ByteBuffer value = pair.right; - Assertions.assertThat(TupleType.buildValue(tuple.split(value))) + Assertions.assertThat(TupleType.buildValue(tuple.split(ByteBufferAccessor.instance, value))) .as("TupleType.buildValue(split(value)) == value") .isEqualTo(value); }); diff --git a/test/unit/org/apache/cassandra/dht/KeyCollisionTest.java b/test/unit/org/apache/cassandra/dht/KeyCollisionTest.java index 5b5365da099b..c24690b8bf7a 100644 --- a/test/unit/org/apache/cassandra/dht/KeyCollisionTest.java +++ b/test/unit/org/apache/cassandra/dht/KeyCollisionTest.java @@ -27,6 +27,7 @@ import org.apache.cassandra.SchemaLoader; import org.apache.cassandra.Util; import org.apache.cassandra.config.DatabaseDescriptor; +import org.apache.cassandra.db.marshal.IntegerType; import org.apache.cassandra.schema.Schema; import org.apache.cassandra.db.ColumnFamilyStore; import org.apache.cassandra.db.Keyspace; @@ -36,6 +37,8 @@ import org.apache.cassandra.schema.KeyspaceParams; import org.apache.cassandra.service.StorageService; import org.apache.cassandra.utils.ByteBufferUtil; +import org.apache.cassandra.utils.bytecomparable.ByteComparable; +import org.apache.cassandra.utils.bytecomparable.ByteSource; import org.apache.cassandra.utils.FBUtilities; /** @@ -124,5 +127,11 @@ public long getHeapSize() { return 0; } + + @Override + public ByteSource asComparableBytes(ByteComparable.Version version) + { + return IntegerType.instance.asComparableBytes(IntegerType.instance.decompose(token), version); + } } } diff --git a/test/unit/org/apache/cassandra/dht/LengthPartitioner.java b/test/unit/org/apache/cassandra/dht/LengthPartitioner.java index c4e5db82849f..ca6504ced85c 100644 --- a/test/unit/org/apache/cassandra/dht/LengthPartitioner.java +++ b/test/unit/org/apache/cassandra/dht/LengthPartitioner.java @@ -34,6 +34,8 @@ import org.apache.cassandra.utils.ByteBufferUtil; import org.apache.cassandra.utils.FBUtilities; import org.apache.cassandra.utils.Pair; +import org.apache.cassandra.utils.bytecomparable.ByteComparable; +import org.apache.cassandra.utils.bytecomparable.ByteSource; public class LengthPartitioner implements IPartitioner { @@ -95,6 +97,11 @@ public Token fromByteArray(ByteBuffer bytes) return new BigIntegerToken(new BigInteger(ByteBufferUtil.getArray(bytes))); } + public Token fromComparableBytes(ByteSource.Peekable comparableBytes, ByteComparable.Version version) + { + return fromByteArray(IntegerType.instance.fromComparableBytes(comparableBytes, version)); + } + public String toString(Token token) { BigIntegerToken bigIntegerToken = (BigIntegerToken) token; diff --git a/test/unit/org/apache/cassandra/transport/SerDeserTest.java b/test/unit/org/apache/cassandra/transport/SerDeserTest.java index da76070b6f0a..75523e1587cd 100644 --- a/test/unit/org/apache/cassandra/transport/SerDeserTest.java +++ b/test/unit/org/apache/cassandra/transport/SerDeserTest.java @@ -238,7 +238,7 @@ public void udtSerDeserTest(ProtocolVersion version) throws 
Exception ByteBuffer serialized = t.bindAndGet(options); - ByteBuffer[] fields = udt.split(serialized); + ByteBuffer[] fields = udt.split(ByteBufferAccessor.instance, serialized); assertEquals(4, fields.length); diff --git a/test/unit/org/apache/cassandra/utils/bytecomparable/AbstractTypeByteSourceTest.java b/test/unit/org/apache/cassandra/utils/bytecomparable/AbstractTypeByteSourceTest.java new file mode 100644 index 000000000000..d5e2f1eea904 --- /dev/null +++ b/test/unit/org/apache/cassandra/utils/bytecomparable/AbstractTypeByteSourceTest.java @@ -0,0 +1,1015 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.cassandra.utils.bytecomparable; + +import java.math.BigDecimal; +import java.math.BigInteger; +import java.net.InetAddress; +import java.net.UnknownHostException; +import java.nio.ByteBuffer; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collection; +import java.util.Date; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Random; +import java.util.Set; +import java.util.UUID; +import java.util.function.BiFunction; +import java.util.function.Consumer; +import java.util.function.Supplier; +import java.util.stream.IntStream; +import java.util.stream.Stream; + +import com.google.common.collect.ImmutableList; + +import org.apache.cassandra.cql3.Term; +import org.apache.cassandra.db.marshal.*; +import org.junit.Assert; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; + +import org.apache.cassandra.cql3.Duration; +import org.apache.cassandra.dht.IPartitioner; +import org.apache.cassandra.dht.LengthPartitioner; +import org.apache.cassandra.dht.Murmur3Partitioner; +import org.apache.cassandra.dht.RandomPartitioner; +import org.apache.cassandra.serializers.MarshalException; +import org.apache.cassandra.serializers.SimpleDateSerializer; +import org.apache.cassandra.serializers.TypeSerializer; +import org.apache.cassandra.utils.ByteBufferUtil; +import org.apache.cassandra.utils.TimeUUID; +import org.apache.cassandra.utils.UUIDGen; + +@RunWith(Parameterized.class) +public class AbstractTypeByteSourceTest +{ + private static final String ALPHABET = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ1234567890!@#$%^&*()"; + + @Parameterized.Parameters(name = "version={0}") + public static Iterable versions() + { + return ImmutableList.of(ByteComparable.Version.OSS42); + } + + private final ByteComparable.Version version; + + public AbstractTypeByteSourceTest(ByteComparable.Version version) + { + this.version = version; + } + + private void testValuesForType(AbstractType type, T... 
values) + { + testValuesForType(type, Arrays.asList(values)); + } + + private void testValuesForType(AbstractType type, List values) + { + for (T initial : values) + decodeAndAssertEquals(type, initial); + if (IntegerType.instance.equals(type)) + // IntegerType tests go through A LOT of values, so short of randomly picking up to, let's say 1000 + // values to combine with, we'd rather skip the comparison tests for them. + return; + for (int i = 0; i < values.size(); ++i) + { + for (int j = i + 1; j < values.size(); ++j) + { + ByteBuffer left = type.decompose(values.get(i)); + ByteBuffer right = type.decompose(values.get(j)); + int compareBuffers = Integer.signum(type.compare(left, right)); + ByteSource leftSource = type.asComparableBytes(left.duplicate(), version); + ByteSource rightSource = type.asComparableBytes(right.duplicate(), version); + int compareBytes = Integer.signum(ByteComparable.compare(v -> leftSource, v -> rightSource, version)); + Assert.assertEquals(compareBuffers, compareBytes); + } + } + } + + private void testValuesForType(AbstractType type, Stream values) + { + values.forEach(initial -> decodeAndAssertEquals(type, initial)); + } + + private void decodeAndAssertEquals(AbstractType type, T initial) + { + ByteBuffer initialBuffer = type.decompose(initial); + // Assert that fromComparableBytes decodes correctly. + ByteSource.Peekable peekableBytes = ByteSource.peekable(type.asComparableBytes(initialBuffer, version)); + ByteBuffer decodedBuffer = type.fromComparableBytes(peekableBytes, version); + Assert.assertEquals("For " + ByteSourceComparisonTest.safeStr(initial), + ByteBufferUtil.bytesToHex(initialBuffer), + ByteBufferUtil.bytesToHex(decodedBuffer)); + // Assert that the value composed from fromComparableBytes is the correct one. 
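+ // (Both passes rest on the same assumption: asComparableBytes and fromComparableBytes must be
+ // exact inverses. The first pass above compared the raw hex of the initial and decoded buffers;
+ // this second pass re-reads the same byte-comparable stream and checks equality at the
+ // composed-object level as well.)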
+ peekableBytes = ByteSource.peekable(type.asComparableBytes(initialBuffer, version)); + T decoded = type.compose(type.fromComparableBytes(peekableBytes, version)); + Assert.assertEquals(initial, decoded); + } + + private static String newRandomAlphanumeric(Random prng, int length) + { + StringBuilder random = new StringBuilder(length); + for (int i = 0; i < length; ++i) + random.append(ALPHABET.charAt(prng.nextInt(ALPHABET.length()))); + return random.toString(); + } + + @Test + public void testAsciiType() + { + String[] asciiStrings = new String[] + { + "", + "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz1234567890", + "!@#$%^&*()", + }; + testValuesForType(AsciiType.instance, asciiStrings); + + Random prng = new Random(); + Stream asciiStream = Stream.generate(() -> newRandomAlphanumeric(prng, 10)).limit(1000); + testValuesForType(AsciiType.instance, asciiStream); + } + + @Test + public void testBooleanType() + { + testValuesForType(BooleanType.instance, Boolean.TRUE, Boolean.FALSE, null); + } + + @Test + public void testBytesType() + { + List byteBuffers = new ArrayList<>(); + Random prng = new Random(); + byte[] byteArray; + int[] arrayLengths = new int[] {1, 10, 100, 1000}; + for (int length : arrayLengths) + { + byteArray = new byte[length]; + for (int i = 0; i < 1000; ++i) + { + prng.nextBytes(byteArray); + byteBuffers.add(ByteBuffer.wrap(byteArray)); + } + } + testValuesForType(BytesType.instance, byteBuffers.toArray(new ByteBuffer[0])); + } + + @Test + public void testByteType() + { + testValuesForType(ByteType.instance, new Byte[] { null }); + + Stream allBytes = IntStream.range(Byte.MIN_VALUE, Byte.MAX_VALUE + 1) + .mapToObj(value -> (byte) value); + testValuesForType(ByteType.instance, allBytes); + } + + @Test + public void testCompositeType() + { + CompositeType compType = CompositeType.getInstance(UTF8Type.instance, TimeUUIDType.instance, IntegerType.instance); + List byteBuffers = new ArrayList<>(); + Random prng = new Random(); + // Test with complete CompositeType rows + for (int i = 0; i < 1000; ++i) + { + String randomString = newRandomAlphanumeric(prng, 10); + TimeUUID randomUuid = TimeUUID.Generator.nextTimeUUID(); + BigInteger randomVarint = BigInteger.probablePrime(80, prng); + byteBuffers.add(compType.decompose(randomString, randomUuid, randomVarint)); + } + // Test with incomplete CompositeType rows, where only the first element is present + ByteBuffer[] incompleteComposite = new ByteBuffer[1]; + incompleteComposite[0] = UTF8Type.instance.decompose(newRandomAlphanumeric(prng, 10)); + byteBuffers.add(CompositeType.build(ByteBufferAccessor.instance, true, incompleteComposite)); + byteBuffers.add(CompositeType.build(ByteBufferAccessor.instance, false, incompleteComposite)); + // ...and the last end-of-component byte is not 0. 
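+ // (The extra trailing argument in the builds below is the end-of-component byte. Roughly: 0 marks
+ // a fully specified component, while 1 and -1 mark range bounds that sort after, respectively
+ // before, all keys sharing this prefix -- see CompositeType for the authoritative semantics.)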
+ byteBuffers.add(CompositeType.build(ByteBufferAccessor.instance, true, incompleteComposite, (byte) 1));
+ byteBuffers.add(CompositeType.build(ByteBufferAccessor.instance, false, incompleteComposite, (byte) 1));
+ byteBuffers.add(CompositeType.build(ByteBufferAccessor.instance, true, incompleteComposite, (byte) -1));
+ byteBuffers.add(CompositeType.build(ByteBufferAccessor.instance, false, incompleteComposite, (byte) -1));
+ // Test with incomplete CompositeType rows, where only the last element is not present
+ incompleteComposite = new ByteBuffer[2];
+ incompleteComposite[0] = UTF8Type.instance.decompose(newRandomAlphanumeric(prng, 10));
+ incompleteComposite[1] = TimeUUIDType.instance.decompose(TimeUUID.Generator.nextTimeUUID());
+ byteBuffers.add(CompositeType.build(ByteBufferAccessor.instance, true, incompleteComposite));
+ byteBuffers.add(CompositeType.build(ByteBufferAccessor.instance, false, incompleteComposite));
+ // ...and the last end-of-component byte is not 0.
+ byteBuffers.add(CompositeType.build(ByteBufferAccessor.instance, true, incompleteComposite, (byte) 1));
+ byteBuffers.add(CompositeType.build(ByteBufferAccessor.instance, false, incompleteComposite, (byte) 1));
+ byteBuffers.add(CompositeType.build(ByteBufferAccessor.instance, true, incompleteComposite, (byte) -1));
+ byteBuffers.add(CompositeType.build(ByteBufferAccessor.instance, false, incompleteComposite, (byte) -1));
+
+ testValuesForType(compType, byteBuffers.toArray(new ByteBuffer[0]));
+ }
+
+ @Test
+ public void testDateType()
+ {
+ Stream<Date> dates = Stream.of(null,
+ new Date(Long.MIN_VALUE),
+ new Date(Long.MAX_VALUE),
+ new Date());
+ testValuesForType(DateType.instance, dates);
+
+ dates = new Random().longs(1000).mapToObj(Date::new);
+ testValuesForType(DateType.instance, dates);
+ }
+
+ @Test
+ public void testDecimalType()
+ {
+ // We won't be using testValuesForType for DecimalType, i.e. we won't also be comparing the initial and decoded
+ // ByteBuffer values. That's because the same BigDecimal value can be represented by multiple different, even if
+ // equivalent, (unscaled value, scale) pairs (e.g. 0.1 is 1 x 10^-1, as well as 10 x 10^-2, and so on).
+ // In practice it's easier to convert both sides to BigDecimal and compare those directly, instead of manually
+ // decoding and converting to canonical representations that could then be compared byte for byte. For an example
+ // of generating canonical decimals in the first place, see testReversedType().
+ Consumer<BigDecimal> bigDecimalConsumer = initial ->
+ {
+ ByteSource byteSource = DecimalType.instance.asComparableBytes(DecimalType.instance.decompose(initial), version);
+ BigDecimal decoded = DecimalType.instance.compose(DecimalType.instance.fromComparableBytes(ByteSource.peekable(byteSource), version));
+ if (initial == null)
+ Assert.assertNull(decoded);
+ else
+ Assert.assertEquals(0, initial.compareTo(decoded));
+ };
+ // Test some interesting predefined BigDecimal values.
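+ // (An illustrative aside relying only on JDK BigDecimal semantics: 0.1 written as 1 x 10^-1 and
+ // as 10 x 10^-2 yields two values that are compareTo()-equal but not equals()-equal, which is why
+ // the consumer above compares with compareTo() rather than comparing re-encoded bytes.)
+ Assert.assertEquals(0, new BigDecimal(BigInteger.ONE, 1).compareTo(new BigDecimal(BigInteger.TEN, 2)));
+ Assert.assertNotEquals(new BigDecimal(BigInteger.ONE, 1), new BigDecimal(BigInteger.TEN, 2));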
+ Stream.of(null, + BigDecimal.ZERO, + BigDecimal.ONE, + BigDecimal.ONE.add(BigDecimal.ONE), + BigDecimal.TEN, + BigDecimal.valueOf(0.0000000000000000000000000000000001), + BigDecimal.valueOf(-0.0000000000000000000000000000000001), + BigDecimal.valueOf(0.0000000000000001234567891011121314), + BigDecimal.valueOf(-0.0000000000000001234567891011121314), + BigDecimal.valueOf(12345678910111213.141516171819202122), + BigDecimal.valueOf(-12345678910111213.141516171819202122), + new BigDecimal(BigInteger.TEN, Integer.MIN_VALUE), + new BigDecimal(BigInteger.TEN.negate(), Integer.MIN_VALUE), + new BigDecimal(BigInteger.TEN, Integer.MAX_VALUE), + new BigDecimal(BigInteger.TEN.negate(), Integer.MAX_VALUE), + new BigDecimal(BigInteger.TEN.pow(1000), Integer.MIN_VALUE), + new BigDecimal(BigInteger.TEN.pow(1000).negate(), Integer.MIN_VALUE), + new BigDecimal(BigInteger.TEN.pow(1000), Integer.MAX_VALUE), + new BigDecimal(BigInteger.TEN.pow(1000).negate(), Integer.MAX_VALUE)) + .forEach(bigDecimalConsumer); + // Test BigDecimals created from random double values with predefined range modifiers. + double[] bounds = { + Double.MIN_VALUE, + -1_000_000_000.0, + -100_000.0, + -1.0, + 1.0, + 100_000.0, + 1_000_000_000.0, + Double.MAX_VALUE}; + for (double bound : bounds) + { + new Random().doubles(1000) + .mapToObj(initial -> BigDecimal.valueOf(initial * bound)) + .forEach(bigDecimalConsumer); + } + } + + @Test + public void testDoubleType() + { + Stream doubles = Stream.of(null, + Double.NaN, + Double.POSITIVE_INFINITY, + Double.NEGATIVE_INFINITY, + Double.MAX_VALUE, + Double.MIN_VALUE, + +0.0, + -0.0, + +1.0, + -1.0, + +12345678910.111213141516, + -12345678910.111213141516); + testValuesForType(DoubleType.instance, doubles); + + doubles = new Random().doubles(1000).boxed(); + testValuesForType(DoubleType.instance, doubles); + } + + @Test + public void testDurationType() + { + Random prng = new Random(); + Stream posDurations = Stream.generate(() -> + { + int months = prng.nextInt(12) + 1; + int days = prng.nextInt(28) + 1; + long nanos = (Math.abs(prng.nextLong() % 86_400_000_000_000L)) + 1; + return Duration.newInstance(months, days, nanos); + }) + .limit(1000); + testValuesForType(DurationType.instance, posDurations); + Stream negDurations = Stream.generate(() -> + { + int months = prng.nextInt(12) + 1; + int days = prng.nextInt(28) + 1; + long nanos = (Math.abs(prng.nextLong() % 86_400_000_000_000L)) + 1; + return Duration.newInstance(-months, -days, -nanos); + }) + .limit(1000); + testValuesForType(DurationType.instance, negDurations); + } + + @Test + public void testDynamicCompositeType() + { + DynamicCompositeType dynamicCompType = DynamicCompositeType.getInstance(new HashMap<>()); + ImmutableList allTypes = ImmutableList.of("org.apache.cassandra.db.marshal.BytesType", + "org.apache.cassandra.db.marshal.TimeUUIDType", + "org.apache.cassandra.db.marshal.IntegerType"); + List allValues = new ArrayList<>(); + List byteBuffers = new ArrayList<>(); + Random prng = new Random(); + for (int i = 0; i < 10; ++i) + { + String randomString = newRandomAlphanumeric(prng, 10); + allValues.add(ByteBufferUtil.bytes(randomString)); + UUID randomUuid = TimeUUID.Generator.nextTimeAsUUID(); + allValues.add(ByteBuffer.wrap(UUIDGen.decompose(randomUuid))); + byte randomByte = (byte) prng.nextInt(); + allValues.add(ByteBuffer.allocate(1).put(randomByte)); + + // Three-component key with aliased and non-aliased types and end-of-component byte varying (0, 1, -1). 
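+ // (For orientation, this is the wire layout that createStringUuidVarintDynamicCompositeKey below
+ // reproduces by hand for each non-aliased component:
+ //   2-byte type name length | type name | 2-byte value length | value | end-of-component byte
+ // which matches the size computation inside that helper.)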
+ byteBuffers.add(DynamicCompositeType.build(allTypes, allValues)); + byteBuffers.add(createStringUuidVarintDynamicCompositeKey(randomString, randomUuid, randomByte, (byte) 1)); + byteBuffers.add(createStringUuidVarintDynamicCompositeKey(randomString, randomUuid, randomByte, (byte) -1)); + + // Two-component key with aliased and non-aliased types and end-of-component byte varying (0, 1, -1). + byteBuffers.add(DynamicCompositeType.build(allTypes.subList(0, 2), allValues.subList(0, 2))); + byteBuffers.add(createStringUuidVarintDynamicCompositeKey(randomString, randomUuid, -1, (byte) 1)); + byteBuffers.add(createStringUuidVarintDynamicCompositeKey(randomString, randomUuid, -1, (byte) -1)); + + // One-component key with aliased and non-aliased type and end-of-component byte varying (0, 1, -1). + byteBuffers.add(DynamicCompositeType.build(allTypes.subList(0, 1), allValues.subList(0, 1))); + byteBuffers.add(createStringUuidVarintDynamicCompositeKey(randomString, null, -1, (byte) 1)); + byteBuffers.add(createStringUuidVarintDynamicCompositeKey(randomString, null, -1, (byte) -1)); + + allValues.clear(); + } + testValuesForType(dynamicCompType, byteBuffers.toArray(new ByteBuffer[0])); + } + + // Similar to DynamicCompositeTypeTest.createDynamicCompositeKey(string, uuid, i, true, false), but not using any + // aliased types, in order to do an exact comparison of the unmarshalled DynamicCompositeType payload with the + // input one. If aliased types are used, due to DynamicCompositeType.build(List, List) + // always including the full type info in the newly constructed payload, an exact comparison won't work. + private static ByteBuffer createStringUuidVarintDynamicCompositeKey(String string, UUID uuid, int i, byte lastEocByte) + { + // 1. Calculate how many bytes do we need for a key of this DynamicCompositeType + String bytesType = "org.apache.cassandra.db.marshal.BytesType"; + String timeUuidType = "org.apache.cassandra.db.marshal.TimeUUIDType"; + String varintType = "org.apache.cassandra.db.marshal.IntegerType"; + ByteBuffer bytes = ByteBufferUtil.bytes(string); + int totalSize = 0; + // Take into account the string component data (BytesType is aliased) + totalSize += 2 + bytesType.length() + 2 + bytes.remaining() + 1; + if (uuid != null) + { + // Take into account the UUID component data (TimeUUIDType is aliased) + totalSize += 2 + timeUuidType.length() + 2 + 16 + 1; + if (i != -1) + { + // Take into account the varint component data (IntegerType is _not_ aliased). + // Notice that we account for a single byte of varint data, so we'll downcast the int payload + // to byte and use only that as the actual varint payload. + totalSize += 2 + varintType.length() + 2 + 1 + 1; + } + } + + // 2. Allocate a buffer with that many bytes + ByteBuffer bb = ByteBuffer.allocate(totalSize); + + // 3. Write the key data for each component in the allocated buffer + bb.putShort((short) bytesType.length()); + bb.put(ByteBufferUtil.bytes(bytesType)); + bb.putShort((short) bytes.remaining()); + bb.put(bytes); + // Make the end-of-component byte 1 if requested and the time-UUID component is null. + bb.put(uuid == null ? lastEocByte : (byte) 0); + if (uuid != null) + { + bb.putShort((short) timeUuidType.length()); + bb.put(ByteBufferUtil.bytes(timeUuidType)); + bb.putShort((short) 16); + bb.put(UUIDGen.decompose(uuid)); + // Set the end-of-component byte if requested and the varint component is null. + bb.put(i == -1 ? 
lastEocByte : (byte) 0); + if (i != -1) + { + bb.putShort((short) varintType.length()); + bb.put(ByteBufferUtil.bytes(varintType)); + bb.putShort((short) 1); + bb.put((byte) i); + bb.put(lastEocByte); + } + } + bb.rewind(); + return bb; + } + + @Test + public void testFloatType() + { + Stream floats = Stream.of(null, + Float.NaN, + Float.POSITIVE_INFINITY, + Float.NEGATIVE_INFINITY, + Float.MAX_VALUE, + Float.MIN_VALUE, + +0.0F, + -0.0F, + +1.0F, + -1.0F, + +123456.7891011F, + -123456.7891011F); + testValuesForType(FloatType.instance, floats); + + floats = new Random().ints(1000).mapToObj(Float::intBitsToFloat); + testValuesForType(FloatType.instance, floats); + } + + @Test + public void testInetAddressType() throws UnknownHostException + { + Stream inetAddresses = Stream.of(null, + InetAddress.getLocalHost(), + InetAddress.getLoopbackAddress(), + InetAddress.getByName("0.0.0.0"), + InetAddress.getByName("10.0.0.1"), + InetAddress.getByName("172.16.1.1"), + InetAddress.getByName("192.168.2.2"), + InetAddress.getByName("224.3.3.3"), + InetAddress.getByName("255.255.255.255"), + InetAddress.getByName("0000:0000:0000:0000:0000:0000:0000:0000"), + InetAddress.getByName("ffff:ffff:ffff:ffff:ffff:ffff:ffff:ffff"), + InetAddress.getByName("fe80:1:23:456:7890:1:23:456")); + testValuesForType(InetAddressType.instance, inetAddresses); + + Random prng = new Random(); + byte[] ipv4Bytes = new byte[4]; + byte[] ipv6Bytes = new byte[16]; + InetAddress[] addresses = new InetAddress[2000]; + for (int i = 0; i < addresses.length / 2; ++i) + { + prng.nextBytes(ipv4Bytes); + addresses[2 * i] = InetAddress.getByAddress(ipv4Bytes); + addresses[2 * i + 1] = InetAddress.getByAddress(ipv6Bytes); + } + testValuesForType(InetAddressType.instance, addresses); + + } + + @Test + public void testInt32Type() + { + Stream ints = Stream.of(null, + Integer.MIN_VALUE, + Integer.MIN_VALUE + 1, + -256, -255, -128, -127, -1, + 0, + 1, 127, 128, 255, 256, + Integer.MAX_VALUE - 1, + Integer.MAX_VALUE); + testValuesForType(Int32Type.instance, ints); + + ints = new Random().ints(1000).boxed(); + testValuesForType(Int32Type.instance, ints); + } + + @Test + public void testIntegerType() + { + Stream varints = IntStream.range(-1000000, 1000000).mapToObj(BigInteger::valueOf); + testValuesForType(IntegerType.instance, varints); + + varints = Stream.of(null, + BigInteger.valueOf(12345678910111213L), + BigInteger.valueOf(12345678910111213L).negate(), + BigInteger.valueOf(Long.MAX_VALUE), + BigInteger.valueOf(Long.MAX_VALUE).negate(), + BigInteger.valueOf(Long.MAX_VALUE - 1).multiply(BigInteger.valueOf(Long.MAX_VALUE - 1)), + BigInteger.valueOf(Long.MAX_VALUE - 1).multiply(BigInteger.valueOf(Long.MAX_VALUE - 1)).negate()); + testValuesForType(IntegerType.instance, varints); + + List varintList = new ArrayList<>(); + for (int i = 0; i < 10000; ++i) + { + BigInteger initial = BigInteger.ONE.shiftLeft(i); + varintList.add(initial); + BigInteger plusOne = initial.add(BigInteger.ONE); + varintList.add(plusOne); + varintList.add(plusOne.negate()); + BigInteger minusOne = initial.subtract(BigInteger.ONE); + varintList.add(minusOne); + varintList.add(minusOne.negate()); + } + testValuesForType(IntegerType.instance, varintList.toArray(new BigInteger[0])); + } + + @Test + public void testUuidTypes() + { + Random prng = new Random(); + UUID[] testUuids = new UUID[3001]; + for (int i = 0; i < testUuids.length / 3; ++i) + { + testUuids[3 * i] = UUID.randomUUID(); + testUuids[3 * i + 1] = TimeUUID.Generator.nextTimeAsUUID(); + testUuids[3 * i + 2] = 
TimeUUID.atUnixMicrosWithLsbAsUUID(prng.nextLong(), prng.nextLong()); + } + testUuids[testUuids.length - 1] = null; + testValuesForType(UUIDType.instance, testUuids); + testValuesForType(LexicalUUIDType.instance, testUuids); + testValuesForType(TimeUUIDType.instance, Arrays.stream(testUuids) + .filter(u -> u == null || u.version() == 1) + .map(u -> u != null ? TimeUUID.fromUuid(u) : null)); + } + + private static > List newRandomElementCollections(Supplier collectionProducer, + Supplier elementProducer, + int numCollections, + int numElementsInCollection) + { + List result = new ArrayList<>(); + for (int i = 0; i < numCollections; ++i) + { + C coll = collectionProducer.get(); + for (int j = 0; j < numElementsInCollection; ++j) + { + coll.add(elementProducer.get()); + } + result.add(coll); + } + return result; + } + + @Test + public void testListType() + { + // Test lists with element components not having known/computable length (e.g. strings). + Random prng = new Random(); + List> stringLists = newRandomElementCollections(ArrayList::new, + () -> newRandomAlphanumeric(prng, 10), + 100, + 100); + testValuesForType(ListType.getInstance(UTF8Type.instance, false), stringLists); + testValuesForType(ListType.getInstance(UTF8Type.instance, true), stringLists); + // Test lists with element components with known/computable length (e.g. 128-bit UUIDs). + List> uuidLists = newRandomElementCollections(ArrayList::new, + UUID::randomUUID, + 100, + 100); + testValuesForType(ListType.getInstance(UUIDType.instance, false), uuidLists); + testValuesForType(ListType.getInstance(UUIDType.instance, true), uuidLists); + } + + @Test + public void testLongType() + { + Stream longs = Stream.of(null, + Long.MIN_VALUE, + Long.MIN_VALUE + 1, + (long) Integer.MIN_VALUE - 1, + -256L, -255L, -128L, -127L, -1L, + 0L, + 1L, 127L, 128L, 255L, 256L, + (long) Integer.MAX_VALUE + 1, + Long.MAX_VALUE - 1, + Long.MAX_VALUE); + testValuesForType(LongType.instance, longs); + + longs = new Random().longs(1000).boxed(); + testValuesForType(LongType.instance, longs); + } + + private static List> newRandomEntryMaps(Supplier keyProducer, + Supplier valueProducer, + int numMaps, + int numEntries) + { + List> result = new ArrayList<>(); + for (int i = 0; i < numMaps; ++i) + { + Map map = new HashMap<>(); + for (int j = 0; j < numEntries; ++j) + { + K key = keyProducer.get(); + V value = valueProducer.get(); + map.put(key, value); + } + result.add(map); + } + return result; + } + + @Test + public void testMapType() + { + Random prng = new Random(); + List> stringToUuidMaps = newRandomEntryMaps(() -> newRandomAlphanumeric(prng, 10), + UUID::randomUUID, + 100, + 100); + testValuesForType(MapType.getInstance(UTF8Type.instance, UUIDType.instance, false), stringToUuidMaps); + testValuesForType(MapType.getInstance(UTF8Type.instance, UUIDType.instance, true), stringToUuidMaps); + + List> uuidToStringMaps = newRandomEntryMaps(UUID::randomUUID, + () -> newRandomAlphanumeric(prng, 10), + 100, + 100); + testValuesForType(MapType.getInstance(UUIDType.instance, UTF8Type.instance, false), uuidToStringMaps); + testValuesForType(MapType.getInstance(UUIDType.instance, UTF8Type.instance, true), uuidToStringMaps); + } + + @Test + public void testPartitionerDefinedOrder() + { + Random prng = new Random(); + List byteBuffers = new ArrayList<>(); + byteBuffers.add(ByteBufferUtil.EMPTY_BYTE_BUFFER); + for (int i = 0; i < 1000; ++i) + { + String randomString = newRandomAlphanumeric(prng, 10); + byteBuffers.add(UTF8Type.instance.decompose(randomString)); + int 
randomInt = prng.nextInt(); + byteBuffers.add(Int32Type.instance.decompose(randomInt)); + double randomDouble = prng.nextDouble(); + byteBuffers.add(DoubleType.instance.decompose(randomDouble)); + BigInteger randomishVarint = BigInteger.probablePrime(100, prng); + byteBuffers.add(IntegerType.instance.decompose(randomishVarint)); + BigDecimal randomishDecimal = BigDecimal.valueOf(prng.nextLong(), prng.nextInt(100) - 50); + byteBuffers.add(DecimalType.instance.decompose(randomishDecimal)); + } + + byte[] bytes = new byte[100]; + prng.nextBytes(bytes); + ByteBuffer exhausted = ByteBuffer.wrap(bytes); + ByteBufferUtil.readBytes(exhausted, 100); + + List partitioners = Arrays.asList( + Murmur3Partitioner.instance, + RandomPartitioner.instance, + LengthPartitioner.instance + // NOTE LocalPartitioner, OrderPreservingPartitioner, and ByteOrderedPartitioner don't need a dedicated + // PartitionerDefinedOrder. + // 1) LocalPartitioner uses its inner AbstractType + // 2) OrderPreservingPartitioner uses UTF8Type + // 3) ByteOrderedPartitioner uses BytesType + ); + for (IPartitioner partitioner : partitioners) + { + AbstractType partitionOrdering = partitioner.partitionOrdering(); + Assert.assertTrue(partitionOrdering instanceof PartitionerDefinedOrder); + for (ByteBuffer input : byteBuffers) + { + ByteSource byteSource = partitionOrdering.asComparableBytes(input, version); + ByteBuffer output = partitionOrdering.fromComparableBytes(ByteSource.peekable(byteSource), version); + Assert.assertEquals("For partitioner " + partitioner.getClass().getSimpleName(), + ByteBufferUtil.bytesToHex(input), + ByteBufferUtil.bytesToHex(output)); + } + ByteSource byteSource = partitionOrdering.asComparableBytes(exhausted, version); + ByteBuffer output = partitionOrdering.fromComparableBytes(ByteSource.peekable(byteSource), version); + Assert.assertEquals(ByteBufferUtil.EMPTY_BYTE_BUFFER, output); + } + } + + @Test + public void testReversedType() + { + // Test how ReversedType handles null ByteSource.Peekable - here the choice of base type is important, as + // the base type should also be able to handle null ByteSource.Peekable. + ReversedType reversedVarintType = ReversedType.getInstance(IntegerType.instance); + ByteBuffer decodedNull = reversedVarintType.fromComparableBytes(null, ByteComparable.Version.OSS42); + Assert.assertEquals(ByteBufferUtil.EMPTY_BYTE_BUFFER, decodedNull); + + // Test how ReversedType handles random data with some common and important base types. 
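+ // (ReversedType defines no encoding of its own: roughly speaking, it inverts the base type's
+ // byte-comparable representation so that the comparison order flips, and decoding through the
+ // reversed type is therefore expected to reproduce the base type's buffer exactly -- that is the
+ // round trip checked below.)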
+ Map, BiFunction> bufferGeneratorByType = new HashMap<>(); + bufferGeneratorByType.put(UTF8Type.instance, (prng, length) -> UTF8Type.instance.decompose(newRandomAlphanumeric(prng, length))); + bufferGeneratorByType.put(BytesType.instance, (prng, length) -> + { + byte[] randomBytes = new byte[length]; + prng.nextBytes(randomBytes); + return ByteBuffer.wrap(randomBytes); + }); + bufferGeneratorByType.put(IntegerType.instance, (prng, length) -> + { + BigInteger randomVarint = BigInteger.valueOf(prng.nextLong()); + for (int i = 1; i < length / 8; ++i) + randomVarint = randomVarint.multiply(BigInteger.valueOf(prng.nextLong())); + return IntegerType.instance.decompose(randomVarint); + }); + bufferGeneratorByType.put(DecimalType.instance, (prng, length) -> + { + BigInteger randomMantissa = BigInteger.valueOf(prng.nextLong()); + for (int i = 1; i < length / 8; ++i) + randomMantissa = randomMantissa.multiply(BigInteger.valueOf(prng.nextLong())); + // Remove all trailing zeros from the mantissa and use an even scale, in order to have a "canonically + // represented" (in the context of DecimalType's encoding) decimal, i.e. one which wouldn't be re-scaled to + // conform with the "compacted mantissa between 0 and 1, scale as a power of 100" rule. + while (randomMantissa.remainder(BigInteger.TEN).equals(BigInteger.ZERO)) + randomMantissa = randomMantissa.divide(BigInteger.TEN); + int randomScale = prng.nextInt() & -2; + BigDecimal randomDecimal = new BigDecimal(randomMantissa, randomScale); + return DecimalType.instance.decompose(randomDecimal); + }); + Random prng = new Random(); + for (Map.Entry, BiFunction> entry : bufferGeneratorByType.entrySet()) + { + ReversedType reversedType = ReversedType.getInstance(entry.getKey()); + for (int length = 32; length <= 512; length *= 4) + { + for (int i = 0; i < 100; ++i) + { + ByteBuffer initial = entry.getValue().apply(prng, length); + ByteSource.Peekable reversedPeekable = ByteSource.peekable(reversedType.asComparableBytes(initial, ByteComparable.Version.OSS42)); + ByteBuffer decoded = reversedType.fromComparableBytes(reversedPeekable, ByteComparable.Version.OSS42); + Assert.assertEquals(initial, decoded); + } + } + } + } + + @Test + public void testSetType() + { + // Test sets with element components not having known/computable length (e.g. strings). + Random prng = new Random(); + List> stringSets = newRandomElementCollections(HashSet::new, + () -> newRandomAlphanumeric(prng, 10), + 100, + 100); + testValuesForType(SetType.getInstance(UTF8Type.instance, false), stringSets); + testValuesForType(SetType.getInstance(UTF8Type.instance, true), stringSets); + // Test sets with element components with known/computable length (e.g. 128-bit UUIDs). + List> uuidSets = newRandomElementCollections(HashSet::new, + UUID::randomUUID, + 100, + 100); + testValuesForType(SetType.getInstance(UUIDType.instance, false), uuidSets); + testValuesForType(SetType.getInstance(UUIDType.instance, true), uuidSets); + } + + @Test + public void testShortType() + { + testValuesForType(ShortType.instance, new Short[] { null }); + + Stream allShorts = IntStream.range(Short.MIN_VALUE, Short.MAX_VALUE + 1) + .mapToObj(value -> (short) value); + testValuesForType(ShortType.instance, allShorts); + } + + @Test + public void testSimpleDateType() + { + testValuesForType(SimpleDateType.instance, new Integer[] { null }); + + testValuesForType(SimpleDateType.instance, new Random().ints(1000).boxed()); + + // Test by manually creating and manually interpreting simple dates from random millis. 
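+ // ("Manually" here means bypassing testValuesForType: each millisecond value is first narrowed to
+ // a day count via SimpleDateSerializer.timeInMillisToDay, and only that day count -- not the
+ // original millis -- is expected to survive the encode/decode round trip.)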
+ new Random().ints(1000).forEach(initialMillis -> + { + initialMillis = Math.abs(initialMillis); + Integer initialDays = SimpleDateSerializer.timeInMillisToDay(initialMillis); + ByteBuffer simpleDateBuffer = SimpleDateType.instance.fromTimeInMillis(initialMillis); + ByteSource byteSource = SimpleDateType.instance.asComparableBytes(simpleDateBuffer, version); + Integer decodedDays = SimpleDateType.instance.compose(SimpleDateType.instance.fromComparableBytes(ByteSource.peekable(byteSource), version)); + Assert.assertEquals(initialDays, decodedDays); + }); + + // Test by manually creating and manually interpreting simple dates from strings. + String[] simpleDateStrings = new String[] + { + "1970-01-01", + "1970-01-02", + "1969-12-31", + "-0001-01-02", + "-5877521-01-02", + "2014-01-01", + "+5881580-01-10", + "1920-12-01", + "1582-10-19" + }; + for (String simpleDate : simpleDateStrings) + { + ByteBuffer simpleDataBuffer = SimpleDateType.instance.fromString(simpleDate); + ByteSource byteSource = SimpleDateType.instance.asComparableBytes(simpleDataBuffer, version); + Integer decodedDays = SimpleDateType.instance.compose(SimpleDateType.instance.fromComparableBytes(ByteSource.peekable(byteSource), version)); + String decodedDate = SimpleDateSerializer.instance.toString(decodedDays); + Assert.assertEquals(simpleDate, decodedDate); + } + } + + @Test + public void testTimestampType() + { + Date[] dates = new Date[] + { + null, + new Date(), + new Date(0L), + new Date(-1L), + new Date(Long.MAX_VALUE), + new Date(Long.MIN_VALUE) + }; + testValuesForType(TimestampType.instance, dates); + testValuesForType(TimestampType.instance, new Random().longs(1000).mapToObj(Date::new)); + } + + @Test + public void testTimeType() + { + testValuesForType(TimeType.instance, new Long[] { null }); + + testValuesForType(TimeType.instance, new Random().longs(1000).boxed()); + } + + @Test + public void testTupleType() + { + TupleType tt = new TupleType(Arrays.asList(UTF8Type.instance, + DecimalType.instance, + IntegerType.instance, + BytesType.instance)); + Random prng = new Random(); + List tuplesData = new ArrayList<>(); + String[] utf8Values = new String[] + { + "a", + "©", + newRandomAlphanumeric(prng, 10), + newRandomAlphanumeric(prng, 100) + }; + BigDecimal[] decimalValues = new BigDecimal[] + { + null, + BigDecimal.ZERO, + BigDecimal.ONE, + BigDecimal.valueOf(1234567891011121314L, 50), + BigDecimal.valueOf(1234567891011121314L, 50).negate() + }; + BigInteger[] varintValues = new BigInteger[] + { + null, + BigInteger.ZERO, + BigInteger.TEN.pow(1000), + BigInteger.TEN.pow(1000).negate() + }; + byte[] oneByte = new byte[1]; + byte[] tenBytes = new byte[10]; + byte[] hundredBytes = new byte[100]; + byte[] thousandBytes = new byte[1000]; + prng.nextBytes(oneByte); + prng.nextBytes(tenBytes); + prng.nextBytes(hundredBytes); + prng.nextBytes(thousandBytes); + byte[][] bytesValues = new byte[][] + { + new byte[0], + oneByte, + tenBytes, + hundredBytes, + thousandBytes + }; + for (String utf8 : utf8Values) + { + for (BigDecimal decimal : decimalValues) + { + for (BigInteger varint : varintValues) + { + for (byte[] bytes : bytesValues) + { + ByteBuffer tupleData = TupleType.buildValue(UTF8Type.instance.decompose(utf8), + decimal != null ? DecimalType.instance.decompose(decimal) : null, + varint != null ? 
IntegerType.instance.decompose(varint) : null, + // We could also use the wrapped bytes directly + BytesType.instance.decompose(ByteBuffer.wrap(bytes))); + tuplesData.add(tupleData); + } + } + } + } + testValuesForType(tt, tuplesData.toArray(new ByteBuffer[0])); + } + + @Test + public void testUtf8Type() + { + Random prng = new Random(); + testValuesForType(UTF8Type.instance, Stream.generate(() -> newRandomAlphanumeric(prng, 100)).limit(1000)); + } + + @Test + public void testTypeWithByteOrderedComparison() + { + Random prng = new Random(); + byte[] singleByte = new byte[] { (byte) prng.nextInt() }; + byte[] tenBytes = new byte[10]; + prng.nextBytes(tenBytes); + byte[] hundredBytes = new byte[100]; + prng.nextBytes(hundredBytes); + byte[] thousandBytes = new byte[1000]; + prng.nextBytes(thousandBytes); + // No null here, as the default asComparableBytes(ByteBuffer, Version) implementation (and more specifically + // the ByteSource.of(ByteBuffer, Version) encoding) would throw then. + testValuesForType(ByteOrderedType.instance, Stream.of(ByteBufferUtil.EMPTY_BYTE_BUFFER, + ByteBuffer.wrap(singleByte), + ByteBuffer.wrap(tenBytes), + ByteBuffer.wrap(hundredBytes), + ByteBuffer.wrap(thousandBytes))); + } + + private static class ByteOrderedType extends AbstractType + { + public static final ByteOrderedType instance = new ByteOrderedType(); + + private ByteOrderedType() + { + super(ComparisonType.BYTE_ORDER); + } + + @Override + public ByteBuffer fromString(String source) throws MarshalException + { + return null; + } + + @Override + public Term fromJSONObject(Object parsed) throws MarshalException + { + return null; + } + + @Override + public TypeSerializer getSerializer() + { + return ByteOrderedSerializer.instance; + } + + static class ByteOrderedSerializer extends TypeSerializer + { + + static final ByteOrderedSerializer instance = new ByteOrderedSerializer(); + + @Override + public ByteBuffer serialize(ByteBuffer value) + { + return value != null ? value.duplicate() : null; + } + + @Override + public ByteBuffer deserialize(V bytes, ValueAccessor accessor) + { + return accessor.toBuffer(bytes); + } + + @Override + public void validate(V bytes, ValueAccessor accessor) throws MarshalException + { + + } + + @Override + public String toString(ByteBuffer value) + { + return ByteBufferUtil.bytesToHex(value); + } + + @Override + public Class getType() + { + return ByteBuffer.class; + } + } + } +} diff --git a/test/unit/org/apache/cassandra/utils/bytecomparable/ByteSourceComparisonTest.java b/test/unit/org/apache/cassandra/utils/bytecomparable/ByteSourceComparisonTest.java new file mode 100644 index 000000000000..f5cf2b639a17 --- /dev/null +++ b/test/unit/org/apache/cassandra/utils/bytecomparable/ByteSourceComparisonTest.java @@ -0,0 +1,1178 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.cassandra.utils.bytecomparable; + +import java.math.BigDecimal; +import java.net.UnknownHostException; +import java.nio.ByteBuffer; +import java.nio.charset.StandardCharsets; +import java.util.*; +import java.util.concurrent.ThreadLocalRandom; +import java.util.function.BiFunction; +import java.util.function.Function; +import java.util.function.Supplier; + +import com.google.common.collect.ImmutableList; +import com.google.common.collect.Iterables; +import com.google.common.collect.Lists; +import com.google.common.collect.Ordering; +import org.junit.Assert; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.ExpectedException; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import org.apache.cassandra.Util; +import org.apache.cassandra.db.Clustering; +import org.apache.cassandra.db.ClusteringComparator; +import org.apache.cassandra.db.ClusteringPrefix; +import org.apache.cassandra.db.DecoratedKey; +import org.apache.cassandra.db.marshal.*; +import org.apache.cassandra.dht.ByteOrderedPartitioner; +import org.apache.cassandra.dht.IPartitioner; +import org.apache.cassandra.dht.LocalPartitioner; +import org.apache.cassandra.dht.Murmur3Partitioner; +import org.apache.cassandra.dht.RandomPartitioner; +import org.apache.cassandra.utils.ByteBufferUtil; +import org.apache.cassandra.utils.MurmurHash; +import org.apache.cassandra.utils.TimeUUID; +import org.apache.cassandra.utils.bytecomparable.ByteComparable.Version; + +import static org.junit.Assert.assertEquals; + +/** + * Tests forward conversion to ByteSource/ByteComparable and that the result compares correctly. + */ +public class ByteSourceComparisonTest extends ByteSourceTestBase +{ + private final static Logger logger = LoggerFactory.getLogger(ByteSourceComparisonTest.class); + + @Rule + public final ExpectedException expectedException = ExpectedException.none(); + + @Test + public void testStringsAscii() + { + testType(AsciiType.instance, testStrings); + } + + @Test + public void testStringsUTF8() + { + testType(UTF8Type.instance, testStrings); + testDirect(x -> ByteSource.of(x, Version.OSS42), Ordering.natural()::compare, testStrings); + } + + @Test + public void testBooleans() + { + testType(BooleanType.instance, testBools); + } + + @Test + public void testInts() + { + testType(Int32Type.instance, testInts); + testDirect(x -> ByteSource.of(x), Integer::compare, testInts); + } + + @Test + public void randomTestInts() + { + Random rand = new Random(); + for (int i=0; i<10000; ++i) + { + int i1 = rand.nextInt(); + int i2 = rand.nextInt(); + assertComparesSame(Int32Type.instance, i1, i2); + } + + } + + @Test + public void testLongs() + { + testType(LongType.instance, testLongs); + testDirect(x -> ByteSource.of(x), Long::compare, testLongs); + } + + @Test + public void testShorts() + { + testType(ShortType.instance, testShorts); + } + + @Test + public void testBytes() + { + testType(ByteType.instance, testBytes); + } + + @Test + public void testDoubles() + { + testType(DoubleType.instance, testDoubles); + } + + @Test + public void testFloats() + { + testType(FloatType.instance, testFloats); + } + + @Test + public void testBigInts() + { + testType(IntegerType.instance, testBigInts); + } + + @Test + public void testBigDecimals() + { + testType(DecimalType.instance, testBigDecimals); + } + + @Test + public void testBigDecimalInCombination() + { + BigDecimal b1 = new 
BigDecimal("123456.78901201"); + BigDecimal b2 = new BigDecimal("123456.789012"); + Boolean b = false; + + assertClusteringPairComparesSame(DecimalType.instance, BooleanType.instance, b1, b, b2, b); + assertClusteringPairComparesSame(BooleanType.instance, DecimalType.instance, b, b1, b, b2); + + b1 = b1.negate(); + b2 = b2.negate(); + + assertClusteringPairComparesSame(DecimalType.instance, BooleanType.instance, b1, b, b2, b); + assertClusteringPairComparesSame(BooleanType.instance, DecimalType.instance, b, b1, b, b2); + + b1 = new BigDecimal("-123456.78901289"); + b2 = new BigDecimal("-123456.789012"); + + assertClusteringPairComparesSame(DecimalType.instance, BooleanType.instance, b1, b, b2, b); + assertClusteringPairComparesSame(BooleanType.instance, DecimalType.instance, b, b1, b, b2); + + b1 = new BigDecimal("1"); + b2 = new BigDecimal("1.1"); + + assertClusteringPairComparesSame(DecimalType.instance, BooleanType.instance, b1, b, b2, b); + assertClusteringPairComparesSame(BooleanType.instance, DecimalType.instance, b, b1, b, b2); + + b1 = b1.negate(); + b2 = b2.negate(); + + assertClusteringPairComparesSame(DecimalType.instance, BooleanType.instance, b1, b, b2, b); + assertClusteringPairComparesSame(BooleanType.instance, DecimalType.instance, b, b1, b, b2); + } + + @Test + public void testUUIDs() + { + testType(UUIDType.instance, testUUIDs); + } + + @Test + public void testTimeUUIDs() + { + testType(TimeUUIDType.instance, Arrays.stream(testUUIDs) + .filter(x -> x == null || x.version() == 1) + .map(x -> x != null ? TimeUUID.fromUuid(x) : null) + .toArray()); + } + + @Test + public void testLexicalUUIDs() + { + testType(LexicalUUIDType.instance, testUUIDs); + } + + @Test + public void testSimpleDate() + { + testType(SimpleDateType.instance, Arrays.stream(testInts).filter(x -> x != null).toArray()); + } + + @Test + public void testTimeType() + { + testType(TimeType.instance, Arrays.stream(testLongs).filter(x -> x != null && x >= 0 && x <= 24L * 60 * 60 * 1000 * 1000 * 1000).toArray()); + } + + @SuppressWarnings("deprecation") + @Test + public void testDateType() + { + testType(DateType.instance, testDates); + } + + @Test + public void testTimestampType() + { + testType(TimestampType.instance, testDates); + } + + @Test + public void testBytesType() + { + List values = new ArrayList<>(); + for (int i = 0; i < testValues.length; ++i) + for (Object o : testValues[i]) + values.add(testTypes[i].decompose(o)); + + testType(BytesType.instance, values.toArray()); + } + + @Test + public void testInetAddressType() throws UnknownHostException + { + testType(InetAddressType.instance, testInets); + } + + @Test + public void testEmptyType() + { + testType(EmptyType.instance, new Void[] { null }); + } + + @Test + public void testPatitionerDefinedOrder() + { + List values = new ArrayList<>(); + for (int i = 0; i < testValues.length; ++i) + for (Object o : testValues[i]) + values.add(testTypes[i].decompose(o)); + + testBuffers(new PartitionerDefinedOrder(Murmur3Partitioner.instance), values); + testBuffers(new PartitionerDefinedOrder(RandomPartitioner.instance), values); + testBuffers(new PartitionerDefinedOrder(ByteOrderedPartitioner.instance), values); + } + + @Test + public void testPatitionerOrder() + { + List values = new ArrayList<>(); + for (int i = 0; i < testValues.length; ++i) + for (Object o : testValues[i]) + values.add(testTypes[i].decompose(o)); + + testDecoratedKeys(Murmur3Partitioner.instance, values); + testDecoratedKeys(RandomPartitioner.instance, values); + 
testDecoratedKeys(ByteOrderedPartitioner.instance, values); + } + + @Test + public void testLocalPatitionerOrder() + { + for (int i = 0; i < testValues.length; ++i) + { + final AbstractType testType = testTypes[i]; + testDecoratedKeys(new LocalPartitioner(testType), Lists.transform(Arrays.asList(testValues[i]), + v -> testType.decompose(v))); + } + } + + interface PairTester + { + void test(AbstractType t1, AbstractType t2, Object o1, Object o2, Object o3, Object o4); + } + + void testCombinationSampling(Random rand, PairTester tester) + { + for (int i=0;i (ByteBuffer) v, + input1[0] != null && input1[1] != null && input2[0] != null && input2[1] != null); + } + } + + @Test + public void testNullsInClusteringLegacy() + { + // verify the legacy encoding treats null clustering the same as null value + ClusteringPrefix aNull = makeBound(ClusteringPrefix.Kind.CLUSTERING, + decomposeAndRandomPad(UTF8Type.instance, "a"), + decomposeAndRandomPad(Int32Type.instance, null)); + ClusteringPrefix aEmpty = makeBound(ClusteringPrefix.Kind.CLUSTERING, + decomposeAndRandomPad(UTF8Type.instance, "a"), + null); + ClusteringComparator comp = new ClusteringComparator(UTF8Type.instance, Int32Type.instance); + assertEquals(0, ByteComparable.compare(comp.asByteComparable(aNull), comp.asByteComparable(aEmpty), Version.LEGACY)); + ClusteringComparator compReversed = new ClusteringComparator(UTF8Type.instance, ReversedType.getInstance(Int32Type.instance)); + assertEquals(0, ByteComparable.compare(compReversed.asByteComparable(aNull), compReversed.asByteComparable(aEmpty), Version.LEGACY)); + } + + @Test + public void testEmptyClustering() + { + assertEmptyComparedToStatic(1, ClusteringPrefix.Kind.CLUSTERING, Version.OSS42); + assertEmptyComparedToStatic(0, ClusteringPrefix.Kind.STATIC_CLUSTERING, Version.OSS42); + assertEmptyComparedToStatic(1, ClusteringPrefix.Kind.INCL_START_BOUND, Version.OSS42); + assertEmptyComparedToStatic(1, ClusteringPrefix.Kind.INCL_END_BOUND, Version.OSS42); + + assertEmptyComparedToStatic(1, ClusteringPrefix.Kind.CLUSTERING, Version.LEGACY); + assertEmptyComparedToStatic(0, ClusteringPrefix.Kind.STATIC_CLUSTERING, Version.LEGACY); + assertEmptyComparedToStatic(-1, ClusteringPrefix.Kind.INCL_START_BOUND, Version.LEGACY); + assertEmptyComparedToStatic(1, ClusteringPrefix.Kind.INCL_END_BOUND, Version.LEGACY); + } + + private void assertEmptyComparedToStatic(int expected, ClusteringPrefix.Kind kind, Version version) + { + ClusteringPrefix empty = makeBound(kind); + ClusteringComparator compEmpty = new ClusteringComparator(); + assertEquals(expected, Integer.signum(ByteComparable.compare(compEmpty.asByteComparable(empty), + compEmpty.asByteComparable(Clustering.STATIC_CLUSTERING), + version))); + } + + void assertClusteringPairComparesSame(AbstractType t1, AbstractType t2, Object o1, Object o2, Object o3, Object o4) + { + assertClusteringPairComparesSame(t1, t2, o1, o2, o3, o4, AbstractType::decompose, true); + } + + void assertClusteringPairComparesSame(AbstractType t1, AbstractType t2, + Object o1, Object o2, Object o3, Object o4, + BiFunction decompose, + boolean testLegacy) + { + for (Version v : Version.values()) + for (ClusteringPrefix.Kind k1 : ClusteringPrefix.Kind.values()) + for (ClusteringPrefix.Kind k2 : ClusteringPrefix.Kind.values()) + { + if (!testLegacy && v == Version.LEGACY) + continue; + + ClusteringComparator comp = new ClusteringComparator(t1, t2); + ByteBuffer[] b = new ByteBuffer[2]; + ByteBuffer[] d = new ByteBuffer[2]; + b[0] = decompose.apply(t1, o1); + b[1] = 
decompose.apply(t2, o2); + d[0] = decompose.apply(t1, o3); + d[1] = decompose.apply(t2, o4); + ClusteringPrefix c = makeBound(k1, b); + ClusteringPrefix e = makeBound(k2, d); + final ByteComparable bsc = comp.asByteComparable(c); + final ByteComparable bse = comp.asByteComparable(e); + int expected = Integer.signum(comp.compare(c, e)); + assertEquals(String.format("Failed comparing %s and %s, %s vs %s version %s", + safeStr(c.clusteringString(comp.subtypes())), + safeStr(e.clusteringString(comp.subtypes())), bsc, bse, v), + expected, Integer.signum(ByteComparable.compare(bsc, bse, v))); + maybeCheck41Properties(expected, bsc, bse, v); + maybeAssertNotPrefix(bsc, bse, v); + + ClusteringComparator compR = new ClusteringComparator(ReversedType.getInstance(t1), ReversedType.getInstance(t2)); + final ByteComparable bsrc = compR.asByteComparable(c); + final ByteComparable bsre = compR.asByteComparable(e); + int expectedR = Integer.signum(compR.compare(c, e)); + assertEquals(String.format("Failed comparing reversed %s and %s, %s vs %s version %s", + safeStr(c.clusteringString(comp.subtypes())), + safeStr(e.clusteringString(comp.subtypes())), bsrc, bsre, v), + expectedR, Integer.signum(ByteComparable.compare(bsrc, bsre, v))); + maybeCheck41Properties(expectedR, bsrc, bsre, v); + maybeAssertNotPrefix(bsrc, bsre, v); + } + } + + static ClusteringPrefix makeBound(ClusteringPrefix.Kind k1, ByteBuffer... b) + { + return makeBound(ByteBufferAccessor.instance.factory(), k1, b); + } + + static ClusteringPrefix makeBound(ValueAccessor.ObjectFactory factory, ClusteringPrefix.Kind k1, T[] b) + { + switch (k1) + { + case INCL_END_EXCL_START_BOUNDARY: + case EXCL_END_INCL_START_BOUNDARY: + return factory.boundary(k1, b); + + case INCL_END_BOUND: + case EXCL_END_BOUND: + case INCL_START_BOUND: + case EXCL_START_BOUND: + return factory.bound(k1, b); + + case CLUSTERING: + return factory.clustering(b); + + case STATIC_CLUSTERING: + return factory.staticClustering(); + + default: + throw new AssertionError(); + } + } + + @Test + public void testTupleType() + { + Random rand = ThreadLocalRandom.current(); + testCombinationSampling(rand, this::assertTupleComparesSame); + } + + @Test + public void testTupleTypeNonFull() + { + TupleType tt = new TupleType(ImmutableList.of(UTF8Type.instance, Int32Type.instance)); + List tests = ImmutableList.of + ( + TupleType.buildValue(ByteBufferAccessor.instance, + decomposeAndRandomPad(UTF8Type.instance, ""), + decomposeAndRandomPad(Int32Type.instance, 0)), + // Note: a decomposed null (e.g. 
decomposeAndRandomPad(Int32Type.instance, null)) should not reach a tuple + TupleType.buildValue(ByteBufferAccessor.instance, + decomposeAndRandomPad(UTF8Type.instance, ""), + null), + TupleType.buildValue(ByteBufferAccessor.instance, + null, + decomposeAndRandomPad(Int32Type.instance, 0)), + TupleType.buildValue(ByteBufferAccessor.instance, + decomposeAndRandomPad(UTF8Type.instance, "")), + TupleType.buildValue(ByteBufferAccessor.instance, (ByteBuffer) null), + TupleType.buildValue(ByteBufferAccessor.instance) + ); + testBuffers(tt, tests); + } + + @Test + public void testTupleNewField() + { + TupleType t1 = new TupleType(ImmutableList.of(UTF8Type.instance)); + TupleType t2 = new TupleType(ImmutableList.of(UTF8Type.instance, Int32Type.instance)); + + ByteBuffer vOne = TupleType.buildValue(ByteBufferAccessor.instance, + decomposeAndRandomPad(UTF8Type.instance, "str")); + ByteBuffer vOneAndNull = TupleType.buildValue(ByteBufferAccessor.instance, + decomposeAndRandomPad(UTF8Type.instance, "str"), + null); + + ByteComparable bOne1 = typeToComparable(t1, vOne); + ByteComparable bOne2 = typeToComparable(t2, vOne); + ByteComparable bOneAndNull2 = typeToComparable(t2, vOneAndNull); + + assertEquals("The byte-comparable version of a one-field tuple must be the same as a two-field tuple with non-present second component.", + bOne1.byteComparableAsString(Version.OSS42), + bOne2.byteComparableAsString(Version.OSS42)); + assertEquals("The byte-comparable version of a one-field tuple must be the same as a two-field tuple with null as second component.", + bOne1.byteComparableAsString(Version.OSS42), + bOneAndNull2.byteComparableAsString(Version.OSS42)); + } + + + void assertTupleComparesSame(AbstractType t1, AbstractType t2, Object o1, Object o2, Object o3, Object o4) + { + TupleType tt = new TupleType(ImmutableList.of(t1, t2)); + ByteBuffer b1 = TupleType.buildValue(ByteBufferAccessor.instance, + decomposeForTuple(t1, o1), + decomposeForTuple(t2, o2)); + ByteBuffer b2 = TupleType.buildValue(ByteBufferAccessor.instance, + decomposeForTuple(t1, o3), + decomposeForTuple(t2, o4)); + assertComparesSameBuffers(tt, b1, b2); + } + + static ByteBuffer decomposeForTuple(AbstractType t, T o) + { + return o != null ? 
t.decompose(o) : null; + } + + @Test + public void testCompositeType() + { + Random rand = new Random(0); + testCombinationSampling(rand, this::assertCompositeComparesSame); + } + + @Test + public void testCompositeTypeNonFull() + { + CompositeType tt = CompositeType.getInstance(UTF8Type.instance, Int32Type.instance); + List tests = ImmutableList.of + ( + CompositeType.build(ByteBufferAccessor.instance, decomposeAndRandomPad(UTF8Type.instance, ""), decomposeAndRandomPad(Int32Type.instance, 0)), + CompositeType.build(ByteBufferAccessor.instance, decomposeAndRandomPad(UTF8Type.instance, ""), decomposeAndRandomPad(Int32Type.instance, null)), + CompositeType.build(ByteBufferAccessor.instance, decomposeAndRandomPad(UTF8Type.instance, "")), + CompositeType.build(ByteBufferAccessor.instance), + CompositeType.build(ByteBufferAccessor.instance, true, decomposeAndRandomPad(UTF8Type.instance, "")), + CompositeType.build(ByteBufferAccessor.instance,true) + ); + for (ByteBuffer b : tests) + tt.validate(b); + testBuffers(tt, tests); + } + + void assertCompositeComparesSame(AbstractType t1, AbstractType t2, Object o1, Object o2, Object o3, Object o4) + { + CompositeType tt = CompositeType.getInstance(t1, t2); + ByteBuffer b1 = CompositeType.build(ByteBufferAccessor.instance, decomposeAndRandomPad(t1, o1), decomposeAndRandomPad(t2, o2)); + ByteBuffer b2 = CompositeType.build(ByteBufferAccessor.instance, decomposeAndRandomPad(t1, o3), decomposeAndRandomPad(t2, o4)); + assertComparesSameBuffers(tt, b1, b2); + } + + @Test + public void testDynamicComposite() + { + DynamicCompositeType tt = DynamicCompositeType.getInstance(DynamicCompositeTypeTest.aliases); + UUID[] uuids = DynamicCompositeTypeTest.uuids; + List tests = ImmutableList.of + ( + DynamicCompositeTypeTest.createDynamicCompositeKey("test1", null, -1, false, true), + DynamicCompositeTypeTest.createDynamicCompositeKey("test1", uuids[0], 24, false, true), + DynamicCompositeTypeTest.createDynamicCompositeKey("test1", uuids[0], 42, false, true), + DynamicCompositeTypeTest.createDynamicCompositeKey("test2", uuids[0], -1, false, true), + DynamicCompositeTypeTest.createDynamicCompositeKey("test2", uuids[1], 42, false, true) + ); + for (ByteBuffer b : tests) + tt.validate(b); + testBuffers(tt, tests); + } + + @Test + public void testListTypeString() + { + testCollection(ListType.getInstance(UTF8Type.instance, true), testStrings, () -> new ArrayList<>(), new Random()); + } + + @Test + public void testListTypeLong() + { + testCollection(ListType.getInstance(LongType.instance, true), testLongs, () -> new ArrayList<>(), new Random()); + } + + @Test + public void testSetTypeString() + { + testCollection(SetType.getInstance(UTF8Type.instance, true), testStrings, () -> new HashSet<>(), new Random()); + } + + @Test + public void testSetTypeLong() + { + testCollection(SetType.getInstance(LongType.instance, true), testLongs, () -> new HashSet<>(), new Random()); + } + + > void testCollection(CollectionType tt, T[] values, Supplier gen, Random rand) + { + int cnt = 0; + List tests = new ArrayList<>(); + tests.add(gen.get()); + for (int c = 1; c <= 3; ++c) + for (int j = 0; j < 5; ++j) + { + CT l = gen.get(); + for (int i = 0; i < c; ++i) + l.add(values[cnt++ % values.length]); + + tests.add(l); + } + testType(tt, tests); + } + + @Test + public void testMapTypeStringLong() + { + testMap(MapType.getInstance(UTF8Type.instance, LongType.instance, true), testStrings, testLongs, () -> new HashMap<>(), new Random()); + } + + @Test + public void testMapTypeStringLongTree() + 
{
+ testMap(MapType.getInstance(UTF8Type.instance, LongType.instance, true), testStrings, testLongs, () -> new TreeMap<>(), new Random());
+ }
+
+ @Test
+ public void testDecoratedKeyPrefixesVOSS42()
+ {
+ // This should pass with the OSS42 encoding
+ testDecoratedKeyPrefixes(Version.OSS42);
+ }
+
+ @Test
+ public void testDecoratedKeyPrefixesVLegacy()
+ {
+ // ... and fail with the legacy encoding
+ try
+ {
+ testDecoratedKeyPrefixes(Version.LEGACY);
+ }
+ catch (AssertionError e)
+ {
+ // Correct path: the test is expected to fail.
+ return;
+ }
+ Assert.fail("Test expected to fail.");
+ }
+
+ @Test
+ public void testFixedLengthWithOffset()
+ {
+ byte[] bytes = new byte[]{ 1, 2, 3, 4, 5, 6, 7, 8, 9 };
+
+ ByteSource source = ByteSource.fixedLength(bytes, 0, 1);
+ assertEquals(1, source.next());
+ assertEquals(ByteSource.END_OF_STREAM, source.next());
+
+ source = ByteSource.fixedLength(bytes, 4, 5);
+ assertEquals(5, source.next());
+ assertEquals(6, source.next());
+ assertEquals(7, source.next());
+ assertEquals(8, source.next());
+ assertEquals(9, source.next());
+ assertEquals(ByteSource.END_OF_STREAM, source.next());
+
+ // A zero-length source at the end of the array must be immediately exhausted.
+ source = ByteSource.fixedLength(bytes, 9, 0);
+ assertEquals(ByteSource.END_OF_STREAM, source.next());
+ }
+
+ @Test
+ public void testFixedLengthNegativeLength()
+ {
+ byte[] bytes = new byte[]{ 1, 2, 3 };
+
+ expectedException.expect(IllegalArgumentException.class);
+ ByteSource.fixedLength(bytes, 0, -1);
+ }
+
+ @Test
+ public void testFixedLengthNegativeOffset()
+ {
+ byte[] bytes = new byte[]{ 1, 2, 3 };
+
+ expectedException.expect(IllegalArgumentException.class);
+ ByteSource.fixedLength(bytes, -1, 1);
+ }
+
+ @Test
+ public void testFixedLengthOutOfBounds()
+ {
+ byte[] bytes = new byte[]{ 1, 2, 3 };
+
+ expectedException.expect(IllegalArgumentException.class);
+ ByteSource.fixedLength(bytes, 0, 4);
+ }
+
+ @Test
+ public void testFixedOffsetOutOfBounds()
+ {
+ byte[] bytes = new byte[]{ 1, 2, 3 };
+
+ expectedException.expect(IllegalArgumentException.class);
+ ByteSource.fixedLength(bytes, 4, 1);
+ }
+
+ @Test
+ public void testSeparatorGT()
+ {
+ testSeparator(ByteComparable::separatorGt, testLongs, LongType.instance);
+ }
+
+ @Test
+ public void testSeparatorPrefix()
+ {
+ testSeparator(ByteComparable::separatorPrefix, testLongs, LongType.instance);
+ }
+
+ @Test
+ public void testSeparatorPrefixViaDiffPoint()
+ {
+ testSeparator((x, y) -> version -> ByteSource.cut(y.asComparableBytes(version),
+ ByteComparable.diffPoint(x, y, version)),
+ testLongs,
+ LongType.instance);
+ }
+
+ @Test
+ public void testSeparatorNext()
+ {
+ // Appending a 00 byte at the end gives the immediate next possible value after x.
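+ // (Sketch of why, on example bytes: if x encodes as 0x40 0x05, then 0x40 0x05 0x00 sorts strictly
+ // after x and before every other encoding greater than x. cutOrRightPad below produces exactly
+ // that stream by right-padding x's encoding with a single 0 byte past its length.)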
+ testSeparator((x, y) -> version -> ByteSource.cutOrRightPad(x.asComparableBytes(version), + ByteComparable.length(x, version) + 1, + 0), + testLongs, + LongType.instance); + } + + private void testSeparator(BiFunction separatorMethod, T[] testValues, AbstractType type) + { + for (T v1 : testValues) + for (T v2 : testValues) + { + if (v1 == null || v2 == null) + continue; + if (type.compare(type.decompose(v1), type.decompose(v2)) >= 0) + continue; + ByteComparable bc1 = getByteComparable(type, v1); + ByteComparable bc2 = getByteComparable(type, v2); + ByteComparable separator = separatorMethod.apply(bc1, bc2); + + for (Version version : Version.values()) + { + Assert.assertTrue("Sanity check failed", ByteComparable.compare(bc1, bc2, version) < 0); + Assert.assertTrue(String.format("Separator %s must be greater than left %s (for %s) (version %s)", + separator.byteComparableAsString(version), + bc1.byteComparableAsString(version), + v1, + version), + ByteComparable.compare(bc1, separator, version) < 0); + Assert.assertTrue(String.format("Separator %s must be less than or equal to right %s (for %s) (version %s)", + separator.byteComparableAsString(version), + bc2.byteComparableAsString(version), + v2, + version), + ByteComparable.compare(separator, bc2, version) <= 0); + } + } + } + + private ByteComparable getByteComparable(AbstractType type, T v1) + { + return version -> type.asComparableBytes(type.decompose(v1), version); + } + + public void testDecoratedKeyPrefixes(Version version) + { + testDecoratedKeyPrefixes("012345678BCDE\0", "", version); + testDecoratedKeyPrefixes("012345678ABCDE\0", "ABC", version); + testDecoratedKeyPrefixes("0123456789ABCDE\0", "\0AB", version); + testDecoratedKeyPrefixes("0123456789ABCDEF\0", "\0", version); + + testDecoratedKeyPrefixes("0123456789ABCDEF0", "ABC", version); + testDecoratedKeyPrefixes("0123456789ABCDEF", "", version); + testDecoratedKeyPrefixes("0123456789ABCDE", "", version); + testDecoratedKeyPrefixes("0123456789ABCD", "\0AB", version); + testDecoratedKeyPrefixes("0123456789ABC", "\0", version); + + } + + public void testDecoratedKeyPrefixes(String key, String append, Version version) + { + logger.info("Testing {} + {}", safeStr(key), safeStr(append)); + IPartitioner partitioner = Murmur3Partitioner.instance; + ByteBuffer original = ByteBufferUtil.bytes(key); + ByteBuffer collision = Util.generateMurmurCollision(original, append.getBytes(StandardCharsets.UTF_8)); + + long[] hash = new long[2]; + MurmurHash.hash3_x64_128(original, 0, original.limit(), 0, hash); + logger.info(String.format("Original hash %016x,%016x", hash[0], hash[1])); + MurmurHash.hash3_x64_128(collision, 0, collision.limit(), 0, hash); + logger.info(String.format("Collision hash %016x,%016x", hash[0], hash[1])); + + DecoratedKey kk1 = partitioner.decorateKey(original); + DecoratedKey kk2 = partitioner.decorateKey(collision); + logger.info("{}\n{}\n{}\n{}", kk1, kk2, kk1.byteComparableAsString(version), kk2.byteComparableAsString(version)); + + final ByteSource s1 = kk1.asComparableBytes(version); + final ByteSource s2 = kk2.asComparableBytes(version); + logger.info("{}\n{}", s1, s2); + + // Check that the representations compare correctly + Assert.assertEquals(Long.signum(kk1.compareTo(kk2)), ByteComparable.compare(kk1, kk2, version)); + // s1 must not be a prefix of s2 + assertNotPrefix(s1, s2); + } + + private void assertNotPrefix(ByteSource s1, ByteSource s2) + { + int c1, c2; + do + { + c1 = s1.next(); + c2 = s2.next(); + } + while (c1 == c2 && c1 != 
ByteSource.END_OF_STREAM); + + // Equal is ok + if (c1 == c2) + return; + + Assert.assertNotEquals("ByteComparable is a prefix of other", ByteSource.END_OF_STREAM, c1); + Assert.assertNotEquals("ByteComparable is a prefix of other", ByteSource.END_OF_STREAM, c2); + } + + private int compare(ByteSource s1, ByteSource s2) + { + int c1, c2; + do + { + c1 = s1.next(); + c2 = s2.next(); + } + while (c1 == c2 && c1 != ByteSource.END_OF_STREAM); + + return Integer.compare(c1, c2); + } + + private void maybeAssertNotPrefix(ByteComparable s1, ByteComparable s2, Version version) + { + if (version == Version.OSS42) + assertNotPrefix(s1.asComparableBytes(version), s2.asComparableBytes(version)); + } + + private void maybeCheck41Properties(int expectedComparison, ByteComparable s1, ByteComparable s2, Version version) + { + if (version != Version.OSS42) + return; + + if (s1 == null || s2 == null || 0 == expectedComparison) + return; + int b1 = randomTerminator(); + int b2 = randomTerminator(); + assertEquals(String.format("Comparison failed for %s(%s + %02x) and %s(%s + %02x)", s1, s1.byteComparableAsString(version), b1, s2, s2.byteComparableAsString(version), b2), + expectedComparison, Integer.signum(compare(ByteSource.withTerminator(b1, s1.asComparableBytes(version)), ByteSource.withTerminator(b2, s2.asComparableBytes(version))))); + assertNotPrefix(ByteSource.withTerminator(b1, s1.asComparableBytes(version)), ByteSource.withTerminator(b2, s2.asComparableBytes(version))); + } + + private int randomTerminator() + { + int term; + do + { + term = ThreadLocalRandom.current().nextInt(ByteSource.MIN_SEPARATOR, ByteSource.MAX_SEPARATOR + 1); + } + while (term >= ByteSource.MIN_NEXT_COMPONENT && term <= ByteSource.MAX_NEXT_COMPONENT); + return term; + } + + > void testMap(MapType tt, K[] keys, V[] values, Supplier gen, Random rand) + { + List tests = new ArrayList<>(); + tests.add(gen.get()); + for (int c = 1; c <= 3; ++c) + for (int j = 0; j < 5; ++j) + { + M l = gen.get(); + for (int i = 0; i < c; ++i) + l.put(keys[rand.nextInt(keys.length)], values[rand.nextInt(values.length)]); + + tests.add(l); + } + testType(tt, tests); + } + + /* + * Convert type to a comparable. 
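+     * The returned ByteComparable delegates to type.asComparableBytes for the encoding and
+     * uses type.getString(value) as its toString, keeping assertion messages readable.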
+ */ + private ByteComparable typeToComparable(AbstractType type, ByteBuffer value) + { + return new ByteComparable() + { + @Override + public ByteSource asComparableBytes(Version v) + { + return type.asComparableBytes(value, v); + } + + @Override + public String toString() + { + return type.getString(value); + } + }; + } + + public void testType(AbstractType type, Object[] values) + { + testType(type, Iterables.transform(Arrays.asList(values), x -> (T) x)); + } + + public void testType(AbstractType type, Iterable values) + { + for (T i : values) { + ByteBuffer b = decomposeAndRandomPad(type, i); + logger.info("Value {} ({}) bytes {} ByteSource {}", + safeStr(i), + safeStr(type.getSerializer().toCQLLiteral(b)), + safeStr(ByteBufferUtil.bytesToHex(b)), + typeToComparable(type, b).byteComparableAsString(Version.OSS42)); + } + for (T i : values) + for (T j : values) + assertComparesSame(type, i, j); + if (!type.isReversed()) + testType(ReversedType.getInstance(type), values); + } + + public void testBuffers(AbstractType type, List values) + { + try + { + for (ByteBuffer b : values) { + logger.info("Value {} bytes {} ByteSource {}", + safeStr(type.getSerializer().toCQLLiteral(b)), + safeStr(ByteBufferUtil.bytesToHex(b)), + typeToComparable(type, b).byteComparableAsString(Version.OSS42)); + } + } + catch (UnsupportedOperationException e) + { + // Continue without listing values. + } + + for (ByteBuffer i : values) + for (ByteBuffer j : values) + assertComparesSameBuffers(type, i, j); + } + + void assertComparesSameBuffers(AbstractType type, ByteBuffer b1, ByteBuffer b2) + { + int expected = Integer.signum(type.compare(b1, b2)); + final ByteComparable bs1 = typeToComparable(type, b1); + final ByteComparable bs2 = typeToComparable(type, b2); + + for (Version version : Version.values()) + { + int actual = Integer.signum(ByteComparable.compare(bs1, bs2, version)); + assertEquals(String.format("Failed comparing %s(%s) and %s(%s)", ByteBufferUtil.bytesToHex(b1), bs1.byteComparableAsString(version), ByteBufferUtil.bytesToHex(b2), bs2.byteComparableAsString(version)), + expected, + actual); + maybeCheck41Properties(expected, bs1, bs2, version); + } + } + + public void testDecoratedKeys(IPartitioner type, List values) + { + for (ByteBuffer i : values) + for (ByteBuffer j : values) + assertComparesSameDecoratedKeys(type, i, j); + for (ByteBuffer i : values) + assertDecoratedKeyBounds(type, i); + } + + void assertComparesSameDecoratedKeys(IPartitioner type, ByteBuffer b1, ByteBuffer b2) + { + DecoratedKey k1 = type.decorateKey(b1); + DecoratedKey k2 = type.decorateKey(b2); + int expected = Integer.signum(k1.compareTo(k2)); + + for (Version version : Version.values()) + { + int actual = Integer.signum(ByteComparable.compare(k1, k2, version)); + assertEquals(String.format("Failed comparing %s[%s](%s) and %s[%s](%s)\npartitioner %s version %s", + ByteBufferUtil.bytesToHex(b1), + k1, + k1.byteComparableAsString(version), + ByteBufferUtil.bytesToHex(b2), + k2, + k2.byteComparableAsString(version), + type, + version), + expected, + actual); + maybeAssertNotPrefix(k1, k2, version); + } + } + + void assertDecoratedKeyBounds(IPartitioner type, ByteBuffer b) + { + Version version = Version.OSS42; + DecoratedKey k = type.decorateKey(b); + final ByteComparable after = k.asComparableBound(false); + final ByteComparable before = k.asComparableBound(true); + + int actual = Integer.signum(ByteComparable.compare(k, before, version)); + assertEquals(String.format("Failed comparing bound before (%s) for 
%s[%s](%s)\npartitioner %s version %s", + before.byteComparableAsString(version), + ByteBufferUtil.bytesToHex(b), + k, + k.byteComparableAsString(version), + type, + version), + 1, + actual); + maybeAssertNotPrefix(k, before, version); + + actual = Integer.signum(ByteComparable.compare(k, after, version)); + assertEquals(String.format("Failed comparing bound after (%s) for %s[%s](%s)\npartitioner %s version %s", + after.byteComparableAsString(version), + ByteBufferUtil.bytesToHex(b), + k, + k.byteComparableAsString(version), + type, + version), + -1, + actual); + maybeAssertNotPrefix(k, after, version); + + actual = Integer.signum(ByteComparable.compare(before, after, version)); + assertEquals(String.format("Failed comparing bound before (%s) to after (%s) for %s[%s](%s)\npartitioner %s version %s", + before.byteComparableAsString(version), + after.byteComparableAsString(version), + ByteBufferUtil.bytesToHex(b), + k, + k.byteComparableAsString(version), + type, + version), + -1, + actual); + maybeAssertNotPrefix(after, before, version); + } + + static Object safeStr(Object i) + { + if (i == null) + return null; + String s = i.toString(); + if (s.length() > 100) + s = s.substring(0, 100) + "..."; + return s.replaceAll("\0", "<0>"); + } + + public void testDirect(Function convertor, BiFunction comparator, T[] values) + { + for (T i : values) { + if (i == null) + continue; + + logger.info("Value {} ByteSource {}\n", + safeStr(i), + convertor.apply(i)); + } + for (T i : values) + if (i != null) + for (T j : values) + if (j != null) + assertComparesSame(convertor, comparator, i, j); + } + + void assertComparesSame(Function convertor, BiFunction comparator, T v1, T v2) + { + ByteComparable b1 = v -> convertor.apply(v1); + ByteComparable b2 = v -> convertor.apply(v2); + int expected = Integer.signum(comparator.apply(v1, v2)); + int actual = Integer.signum(ByteComparable.compare(b1, b2, null)); // version ignored above + assertEquals(String.format("Failed comparing %s and %s", v1, v2), expected, actual); + } + + void assertComparesSame(AbstractType type, T v1, T v2) + { + ByteBuffer b1 = decomposeAndRandomPad(type, v1); + ByteBuffer b2 = decomposeAndRandomPad(type, v2); + int expected = Integer.signum(type.compare(b1, b2)); + final ByteComparable bc1 = typeToComparable(type, b1); + final ByteComparable bc2 = typeToComparable(type, b2); + + for (Version version : Version.values()) + { + int actual = Integer.signum(ByteComparable.compare(bc1, bc2, version)); + if (expected != actual) + { + if (type.isReversed()) + { + // This can happen for reverse of nulls and prefixes. 
Check that it's ok within multi-component + ClusteringComparator cc = new ClusteringComparator(type); + ByteComparable c1 = cc.asByteComparable(Clustering.make(b1)); + ByteComparable c2 = cc.asByteComparable(Clustering.make(b2)); + int actualcc = Integer.signum(ByteComparable.compare(c1, c2, version)); + if (actualcc == expected) + return; + assertEquals(String.format("Failed comparing reversed %s(%s, %s) and %s(%s, %s) direct (%d) and as clustering", safeStr(v1), ByteBufferUtil.bytesToHex(b1), c1, safeStr(v2), ByteBufferUtil.bytesToHex(b2), c2, actual), expected, actualcc); + } + else + assertEquals(String.format("Failed comparing %s(%s BC %s) and %s(%s BC %s) version %s", + safeStr(v1), + ByteBufferUtil.bytesToHex(b1), + bc1.byteComparableAsString(version), + safeStr(v2), + ByteBufferUtil.bytesToHex(b2), + bc2.byteComparableAsString(version), + version), + expected, + actual); + } + maybeCheck41Properties(expected, bc1, bc2, version); + } + } + + ByteBuffer decomposeAndRandomPad(AbstractType type, T v) + { + ByteBuffer b = type.decompose(v); + Random rand = new Random(0); + int padBefore = rand.nextInt(16); + int padAfter = rand.nextInt(16); + int paddedCapacity = b.remaining() + padBefore + padAfter; + ByteBuffer padded = allocateBuffer(paddedCapacity); + rand.ints(padBefore).forEach(x -> padded.put((byte) x)); + padded.put(b.duplicate()); + rand.ints(padAfter).forEach(x -> padded.put((byte) x)); + padded.clear().limit(padded.capacity() - padAfter).position(padBefore); + return padded; + } + + protected ByteBuffer allocateBuffer(int paddedCapacity) + { + return ByteBuffer.allocate(paddedCapacity); + } +} diff --git a/test/unit/org/apache/cassandra/utils/bytecomparable/ByteSourceConversionTest.java b/test/unit/org/apache/cassandra/utils/bytecomparable/ByteSourceConversionTest.java new file mode 100644 index 000000000000..5a59ddfe1fc6 --- /dev/null +++ b/test/unit/org/apache/cassandra/utils/bytecomparable/ByteSourceConversionTest.java @@ -0,0 +1,784 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.cassandra.utils.bytecomparable; + +import java.net.UnknownHostException; +import java.nio.ByteBuffer; +import java.nio.charset.StandardCharsets; +import java.util.*; +import java.util.concurrent.ThreadLocalRandom; +import java.util.function.BiFunction; +import java.util.function.Function; +import java.util.function.Supplier; + +import com.google.common.collect.ImmutableList; +import com.google.common.collect.Iterables; +import com.google.common.collect.Lists; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.ExpectedException; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import org.apache.cassandra.db.BufferDecoratedKey; +import org.apache.cassandra.db.ClusteringComparator; +import org.apache.cassandra.db.ClusteringPrefix; +import org.apache.cassandra.db.DecoratedKey; +import org.apache.cassandra.db.marshal.*; +import org.apache.cassandra.dht.ByteOrderedPartitioner; +import org.apache.cassandra.dht.IPartitioner; +import org.apache.cassandra.dht.LocalPartitioner; +import org.apache.cassandra.dht.Murmur3Partitioner; +import org.apache.cassandra.dht.RandomPartitioner; +import org.apache.cassandra.utils.ByteBufferUtil; +import org.apache.cassandra.utils.TimeUUID; +import org.apache.cassandra.utils.bytecomparable.ByteComparable.Version; + +import static org.apache.cassandra.utils.bytecomparable.ByteSourceComparisonTest.decomposeForTuple; +import static org.junit.Assert.assertEquals; + +/** + * Tests that the result of forward + backward ByteSource translation is the same as the original. + */ +public class ByteSourceConversionTest extends ByteSourceTestBase +{ + private final static Logger logger = LoggerFactory.getLogger(ByteSourceConversionTest.class); + public static final Version VERSION = Version.OSS42; + + @Rule + public final ExpectedException expectedException = ExpectedException.none(); + + @Test + public void testStringsAscii() + { + testType(AsciiType.instance, Arrays.stream(testStrings) + .filter(s -> s.equals(new String(s.getBytes(StandardCharsets.US_ASCII), + StandardCharsets.US_ASCII))) + .toArray()); + } + + @Test + public void testStringsUTF8() + { + testType(UTF8Type.instance, testStrings); + testDirect(x -> ByteSource.of(x, VERSION), ByteSourceInverse::getString, testStrings); + } + + @Test + public void testBooleans() + { + testType(BooleanType.instance, testBools); + } + + @Test + public void testInts() + { + testType(Int32Type.instance, testInts); + testDirect(ByteSource::of, ByteSourceInverse::getSignedInt, testInts); + } + + @Test + public void randomTestInts() + { + Random rand = new Random(); + for (int i=0; i<10000; ++i) + { + int i1 = rand.nextInt(); + assertConvertsSame(Int32Type.instance, i1); + } + + } + + @Test + public void testLongs() + { + testType(LongType.instance, testLongs); + testDirect(ByteSource::of, ByteSourceInverse::getSignedLong, testLongs); + } + + @Test + public void testShorts() + { + testType(ShortType.instance, testShorts); + } + + @Test + public void testBytes() + { + testType(ByteType.instance, testBytes); + } + + @Test + public void testDoubles() + { + testType(DoubleType.instance, testDoubles); + } + + @Test + public void testFloats() + { + testType(FloatType.instance, testFloats); + } + + @Test + public void testBigInts() + { + testType(IntegerType.instance, testBigInts); + } + + @Test + public void testBigDecimals() + { + testTypeBuffers(DecimalType.instance, testBigDecimals); + } + + @Test + public void testUUIDs() + { + testType(UUIDType.instance, testUUIDs); + } + + @Test 
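+    // TimeUUIDType covers only time-based (version 1) UUIDs, hence the filter below.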
+ public void testTimeUUIDs() + { + testType(TimeUUIDType.instance, Arrays.stream(testUUIDs) + .filter(x -> x == null || x.version() == 1) + .map(x -> x != null ? TimeUUID.fromUuid(x) : null) + .toArray()); + } + + @Test + public void testLexicalUUIDs() + { + testType(LexicalUUIDType.instance, testUUIDs); + } + + @Test + public void testSimpleDate() + { + testType(SimpleDateType.instance, Arrays.stream(testInts).filter(x -> x != null).toArray()); + } + + @Test + public void testTimeType() + { + testType(TimeType.instance, Arrays.stream(testLongs).filter(x -> x != null && x >= 0 && x <= 24L * 60 * 60 * 1000 * 1000 * 1000).toArray()); + } + + @SuppressWarnings("deprecation") + @Test + public void testDateType() + { + testType(DateType.instance, testDates); + } + + @Test + public void testTimestampType() + { + testType(TimestampType.instance, testDates); + } + + @Test + public void testBytesType() + { + List values = new ArrayList<>(); + for (int i = 0; i < testValues.length; ++i) + for (Object o : testValues[i]) + values.add(testTypes[i].decompose(o)); + + testType(BytesType.instance, values); + } + + @Test + public void testInetAddressType() throws UnknownHostException + { + testType(InetAddressType.instance, testInets); + } + + @Test + public void testEmptyType() + { + testType(EmptyType.instance, new Void[] { null }); + } + + @Test + public void testPatitionerDefinedOrder() + { + List values = new ArrayList<>(); + for (int i = 0; i < testValues.length; ++i) + for (Object o : testValues[i]) + values.add(testTypes[i].decompose(o)); + + testBuffers(new PartitionerDefinedOrder(Murmur3Partitioner.instance), values); + testBuffers(new PartitionerDefinedOrder(RandomPartitioner.instance), values); + testBuffers(new PartitionerDefinedOrder(ByteOrderedPartitioner.instance), values); + } + + @Test + public void testPatitionerOrder() + { + List values = new ArrayList<>(); + for (int i = 0; i < testValues.length; ++i) + for (Object o : testValues[i]) + values.add(testTypes[i].decompose(o)); + + testDecoratedKeys(Murmur3Partitioner.instance, values); + testDecoratedKeys(RandomPartitioner.instance, values); + testDecoratedKeys(ByteOrderedPartitioner.instance, values); + } + + @Test + public void testLocalPatitionerOrder() + { + for (int i = 0; i < testValues.length; ++i) + { + final AbstractType testType = testTypes[i]; + testDecoratedKeys(new LocalPartitioner(testType), Lists.transform(Arrays.asList(testValues[i]), + v -> testType.decompose(v))); + } + } + + interface PairTester + { + void test(AbstractType t1, AbstractType t2, Object o1, Object o2); + } + + void testCombinationSampling(Random rand, PairTester tester) + { + for (int i=0;i (ByteBuffer) v); + } + } + + @Test + public void testEmptyClustering() + { + ValueAccessor accessor = ByteBufferAccessor.instance; + ClusteringComparator comp = new ClusteringComparator(); + for (ClusteringPrefix.Kind kind : ClusteringPrefix.Kind.values()) + { + if (kind.isBoundary()) + continue; + + ClusteringPrefix empty = ByteSourceComparisonTest.makeBound(kind); + ClusteringPrefix converted = getClusteringPrefix(accessor, kind, comp, comp.asByteComparable(empty)); + assertEquals(empty, converted); + } + } + + void assertClusteringPairConvertsSame(AbstractType t1, AbstractType t2, Object o1, Object o2) + { + for (ValueAccessor accessor : ValueAccessors.ACCESSORS) + assertClusteringPairConvertsSame(accessor, t1, t2, o1, o2, AbstractType::decompose); + } + + void assertClusteringPairConvertsSame(ValueAccessor accessor, + AbstractType t1, AbstractType t2, + Object o1, 
Object o2, + BiFunction decompose) + { + boolean checkEquals = t1 != DecimalType.instance && t2 != DecimalType.instance; + for (ClusteringPrefix.Kind k1 : ClusteringPrefix.Kind.values()) + { + ClusteringComparator comp = new ClusteringComparator(t1, t2); + V[] b = accessor.createArray(2); + b[0] = accessor.valueOf(decompose.apply(t1, o1)); + b[1] = accessor.valueOf(decompose.apply(t2, o2)); + ClusteringPrefix c = ByteSourceComparisonTest.makeBound(accessor.factory(), k1, b); + final ByteComparable bsc = comp.asByteComparable(c); + logger.info("Clustering {} bytesource {}", c.clusteringString(comp.subtypes()), bsc.byteComparableAsString(VERSION)); + ClusteringPrefix converted = getClusteringPrefix(accessor, k1, comp, bsc); + assertEquals(String.format("Failed compare(%s, converted %s ByteSource %s) == 0\ntype %s", + safeStr(c.clusteringString(comp.subtypes())), + safeStr(converted.clusteringString(comp.subtypes())), + bsc.byteComparableAsString(VERSION), + comp), + 0, comp.compare(c, converted)); + if (checkEquals) + assertEquals(String.format("Failed equals %s, got %s ByteSource %s\ntype %s", + safeStr(c.clusteringString(comp.subtypes())), + safeStr(converted.clusteringString(comp.subtypes())), + bsc.byteComparableAsString(VERSION), + comp), + c, converted); + + ClusteringComparator compR = new ClusteringComparator(ReversedType.getInstance(t1), ReversedType.getInstance(t2)); + final ByteComparable bsrc = compR.asByteComparable(c); + converted = getClusteringPrefix(accessor, k1, compR, bsrc); + assertEquals(String.format("Failed reverse compare(%s, converted %s ByteSource %s) == 0\ntype %s", + safeStr(c.clusteringString(compR.subtypes())), + safeStr(converted.clusteringString(compR.subtypes())), + bsrc.byteComparableAsString(VERSION), + compR), + 0, compR.compare(c, converted)); + if (checkEquals) + assertEquals(String.format("Failed reverse equals %s, got %s ByteSource %s\ntype %s", + safeStr(c.clusteringString(compR.subtypes())), + safeStr(converted.clusteringString(compR.subtypes())), + bsrc.byteComparableAsString(VERSION), + compR), + c, converted); + } + } + + private static ClusteringPrefix getClusteringPrefix(ValueAccessor accessor, + ClusteringPrefix.Kind k1, + ClusteringComparator comp, + ByteComparable bsc) + { + switch (k1) + { + case STATIC_CLUSTERING: + case CLUSTERING: + return comp.clusteringFromByteComparable(accessor, bsc); + case EXCL_END_BOUND: + case INCL_END_BOUND: + return comp.boundFromByteComparable(accessor, bsc, true); + case INCL_START_BOUND: + case EXCL_START_BOUND: + return comp.boundFromByteComparable(accessor, bsc, false); + case EXCL_END_INCL_START_BOUNDARY: + case INCL_END_EXCL_START_BOUNDARY: + return comp.boundaryFromByteComparable(accessor, bsc); + default: + throw new AssertionError(); + } + } + + private static ByteSource.Peekable source(ByteComparable bsc) + { + if (bsc == null) + return null; + return ByteSource.peekable(bsc.asComparableBytes(VERSION)); + } + + @Test + public void testTupleType() + { + Random rand = ThreadLocalRandom.current(); + testCombinationSampling(rand, this::assertTupleConvertsSame); + } + + @Test + public void testTupleTypeNonFull() + { + TupleType tt = new TupleType(ImmutableList.of(UTF8Type.instance, Int32Type.instance)); + List tests = ImmutableList.of + ( + TupleType.buildValue(ByteBufferAccessor.instance, + decomposeAndRandomPad(UTF8Type.instance, ""), + decomposeAndRandomPad(Int32Type.instance, 0)), + // Note: a decomposed null (e.g. 
decomposeAndRandomPad(Int32Type.instance, null)) should not reach a tuple + TupleType.buildValue(ByteBufferAccessor.instance, + decomposeAndRandomPad(UTF8Type.instance, ""), + null), + TupleType.buildValue(ByteBufferAccessor.instance, + null, + decomposeAndRandomPad(Int32Type.instance, 0)), + TupleType.buildValue(ByteBufferAccessor.instance, decomposeAndRandomPad(UTF8Type.instance, "")), + TupleType.buildValue(ByteBufferAccessor.instance, (ByteBuffer) null), + TupleType.buildValue(ByteBufferAccessor.instance) + ); + testBuffers(tt, tests); + } + + void assertTupleConvertsSame(AbstractType t1, AbstractType t2, Object o1, Object o2) + { + TupleType tt = new TupleType(ImmutableList.of(t1, t2)); + ByteBuffer b1 = TupleType.buildValue(ByteBufferAccessor.instance, + decomposeForTuple(t1, o1), + decomposeForTuple(t2, o2)); + assertConvertsSameBuffers(tt, b1); + } + + @Test + public void testCompositeType() + { + Random rand = new Random(0); + testCombinationSampling(rand, this::assertCompositeConvertsSame); + } + + @Test + public void testCompositeTypeNonFull() + { + CompositeType tt = CompositeType.getInstance(UTF8Type.instance, Int32Type.instance); + List tests = ImmutableList.of + ( + CompositeType.build(ByteBufferAccessor.instance, decomposeAndRandomPad(UTF8Type.instance, ""), decomposeAndRandomPad(Int32Type.instance, 0)), + CompositeType.build(ByteBufferAccessor.instance, decomposeAndRandomPad(UTF8Type.instance, ""), decomposeAndRandomPad(Int32Type.instance, null)), + CompositeType.build(ByteBufferAccessor.instance, decomposeAndRandomPad(UTF8Type.instance, "")), + CompositeType.build(ByteBufferAccessor.instance), + CompositeType.build(ByteBufferAccessor.instance, true, decomposeAndRandomPad(UTF8Type.instance, "")), + CompositeType.build(ByteBufferAccessor.instance,true) + ); + for (ByteBuffer b : tests) + tt.validate(b); + testBuffers(tt, tests); + } + + void assertCompositeConvertsSame(AbstractType t1, AbstractType t2, Object o1, Object o2) + { + CompositeType tt = CompositeType.getInstance(t1, t2); + ByteBuffer b1 = CompositeType.build(ByteBufferAccessor.instance, decomposeAndRandomPad(t1, o1), decomposeAndRandomPad(t2, o2)); + assertConvertsSameBuffers(tt, b1); + } + + @Test + public void testDynamicComposite() + { + DynamicCompositeType tt = DynamicCompositeType.getInstance(DynamicCompositeTypeTest.aliases); + UUID[] uuids = DynamicCompositeTypeTest.uuids; + List tests = ImmutableList.of + ( + DynamicCompositeTypeTest.createDynamicCompositeKey("test1", null, -1, false, true), + DynamicCompositeTypeTest.createDynamicCompositeKey("test1", uuids[0], 24, false, true), + DynamicCompositeTypeTest.createDynamicCompositeKey("test1", uuids[0], 42, false, true), + DynamicCompositeTypeTest.createDynamicCompositeKey("test2", uuids[0], -1, false, true), + DynamicCompositeTypeTest.createDynamicCompositeKey("test2", uuids[1], 42, false, true) + ); + for (ByteBuffer b : tests) + tt.validate(b); + testBuffers(tt, tests); + } + + @Test + public void testListTypeString() + { + testCollection(ListType.getInstance(UTF8Type.instance, true), testStrings, () -> new ArrayList<>(), new Random()); + } + + @Test + public void testListTypeLong() + { + testCollection(ListType.getInstance(LongType.instance, true), testLongs, () -> new ArrayList<>(), new Random()); + } + + @Test + public void testSetTypeString() + { + testCollection(SetType.getInstance(UTF8Type.instance, true), testStrings, () -> new HashSet<>(), new Random()); + } + + @Test + public void testSetTypeLong() + { + 
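// Note: testCollection in this conversion test skips null candidate values, presumably because a null element could not survive the decode-and-compare round trip. +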
testCollection(SetType.getInstance(LongType.instance, true), testLongs, () -> new HashSet<>(), new Random()); + } + + > void testCollection(CollectionType tt, T[] values, Supplier gen, Random rand) + { + int cnt = 0; + List tests = new ArrayList<>(); + tests.add(gen.get()); + for (int c = 1; c <= 3; ++c) + for (int j = 0; j < 5; ++j) + { + CT l = gen.get(); + for (int i = 0; i < c; ++i) + { + T value = values[cnt++ % values.length]; + if (value != null) + l.add(value); + } + + tests.add(l); + } + testType(tt, tests); + } + + @Test + public void testMapTypeStringLong() + { + testMap(MapType.getInstance(UTF8Type.instance, LongType.instance, true), testStrings, testLongs, () -> new HashMap<>(), new Random()); + } + + @Test + public void testMapTypeStringLongTree() + { + testMap(MapType.getInstance(UTF8Type.instance, LongType.instance, true), testStrings, testLongs, () -> new TreeMap<>(), new Random()); + } + + > void testMap(MapType tt, K[] keys, V[] values, Supplier gen, Random rand) + { + List tests = new ArrayList<>(); + tests.add(gen.get()); + for (int c = 1; c <= 3; ++c) + for (int j = 0; j < 5; ++j) + { + M l = gen.get(); + for (int i = 0; i < c; ++i) + { + V value = values[rand.nextInt(values.length)]; + if (value != null) + l.put(keys[rand.nextInt(keys.length)], value); + } + + tests.add(l); + } + testType(tt, tests); + } + + /* + * Convert type to a comparable. + */ + private ByteComparable typeToComparable(AbstractType type, ByteBuffer value) + { + return new ByteComparable() + { + @Override + public ByteSource asComparableBytes(Version v) + { + return type.asComparableBytes(value, v); + } + + @Override + public String toString() + { + return type.getString(value); + } + }; + } + + public void testType(AbstractType type, Object[] values) + { + testType(type, Iterables.transform(Arrays.asList(values), x -> (T) x)); + } + + public void testType(AbstractType type, Iterable values) + { + for (T i : values) { + ByteBuffer b = decomposeAndRandomPad(type, i); + logger.info("Value {} ({}) bytes {} ByteSource {}", + safeStr(i), + safeStr(type.getSerializer().toCQLLiteral(b)), + safeStr(ByteBufferUtil.bytesToHex(b)), + typeToComparable(type, b).byteComparableAsString(VERSION)); + assertConvertsSame(type, i); + } + if (!type.isReversed()) + testType(ReversedType.getInstance(type), values); + } + + public void testTypeBuffers(AbstractType type, Object[] values) + { + testTypeBuffers(type, Lists.transform(Arrays.asList(values), x -> (T) x)); + } + + public void testTypeBuffers(AbstractType type, List values) + { + // Main difference with above is that we use type.compare instead of checking equals + testBuffers(type, Lists.transform(values, value -> decomposeAndRandomPad(type, value))); + + } + public void testBuffers(AbstractType type, List values) + { + try + { + for (ByteBuffer b : values) { + logger.info("Value {} bytes {} ByteSource {}", + safeStr(type.getSerializer().toCQLLiteral(b)), + safeStr(ByteBufferUtil.bytesToHex(b)), + typeToComparable(type, b).byteComparableAsString(VERSION)); + } + } + catch (UnsupportedOperationException e) + { + // Continue without listing values. 
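+            // (toCQLLiteral may be unsupported for a given serializer; the listing above is
+            // best-effort only, and the conversion assertions below run regardless.)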
+ } + + for (ByteBuffer i : values) + assertConvertsSameBuffers(type, i); + } + + void assertConvertsSameBuffers(AbstractType type, ByteBuffer b1) + { + final ByteComparable bs1 = typeToComparable(type, b1); + + ByteBuffer actual = type.fromComparableBytes(source(bs1), VERSION); + assertEquals(String.format("Failed compare(%s, converted %s (bytesource %s))", + ByteBufferUtil.bytesToHex(b1), + ByteBufferUtil.bytesToHex(actual), + bs1.byteComparableAsString(VERSION)), + 0, + type.compare(b1, actual)); + } + + public void testDecoratedKeys(IPartitioner type, List values) + { + for (ByteBuffer i : values) + assertConvertsSameDecoratedKeys(type, i); + } + + void assertConvertsSameDecoratedKeys(IPartitioner type, ByteBuffer b1) + { + DecoratedKey k1 = type.decorateKey(b1); + DecoratedKey actual = BufferDecoratedKey.fromByteComparable(k1, VERSION, type); + + assertEquals(String.format("Failed compare(%s[%s bs %s], %s[%s bs %s])\npartitioner %s", + k1, + ByteBufferUtil.bytesToHex(b1), + k1.byteComparableAsString(VERSION), + actual, + ByteBufferUtil.bytesToHex(actual.getKey()), + actual.byteComparableAsString(VERSION), + type), + 0, + k1.compareTo(actual)); + assertEquals(String.format("Failed equals(%s[%s bs %s], %s[%s bs %s])\npartitioner %s", + k1, + ByteBufferUtil.bytesToHex(b1), + k1.byteComparableAsString(VERSION), + actual, + ByteBufferUtil.bytesToHex(actual.getKey()), + actual.byteComparableAsString(VERSION), + type), + k1, + actual); + } + + static Object safeStr(Object i) + { + if (i == null) + return null; + if (i instanceof ByteBuffer) + { + ByteBuffer buf = (ByteBuffer) i; + i = ByteBufferUtil.bytesToHex(buf); + } + String s = i.toString(); + if (s.length() > 100) + s = s.substring(0, 100) + "..."; + return s.replaceAll("\0", "<0>"); + } + + public void testDirect(Function convertor, Function inverse, T[] values) + { + for (T i : values) { + if (i == null) + continue; + + logger.info("Value {} ByteSource {}\n", + safeStr(i), + convertor.apply(i)); + + } + for (T i : values) + if (i != null) + assertConvertsSame(convertor, inverse, i); + } + + void assertConvertsSame(Function convertor, Function inverse, T v1) + { + ByteComparable b1 = v -> convertor.apply(v1); + T actual = inverse.apply(source(b1)); + assertEquals(String.format("ByteSource %s", b1.byteComparableAsString(VERSION)), v1, actual); + } + + void assertConvertsSame(AbstractType type, T v1) + { + ByteBuffer b1 = decomposeAndRandomPad(type, v1); + final ByteComparable bc1 = typeToComparable(type, b1); + ByteBuffer convertedBuffer = type.fromComparableBytes(source(bc1), VERSION); + T actual = type.compose(convertedBuffer); + + assertEquals(String.format("Failed equals %s(%s bs %s), got %s", + safeStr(v1), + ByteBufferUtil.bytesToHex(b1), + safeStr(bc1.byteComparableAsString(VERSION)), + safeStr(actual)), + v1, + actual); + } + + ByteBuffer decomposeAndRandomPad(AbstractType type, T v) + { + ByteBuffer b = type.decompose(v); + Random rand = new Random(0); + int padBefore = rand.nextInt(16); + int padAfter = rand.nextInt(16); + int paddedCapacity = b.remaining() + padBefore + padAfter; + ByteBuffer padded = allocateBuffer(paddedCapacity); + rand.ints(padBefore).forEach(x -> padded.put((byte) x)); + padded.put(b.duplicate()); + rand.ints(padAfter).forEach(x -> padded.put((byte) x)); + padded.clear().limit(padded.capacity() - padAfter).position(padBefore); + return padded; + } + + protected ByteBuffer allocateBuffer(int paddedCapacity) + { + return ByteBuffer.allocate(paddedCapacity); + } +} diff --git 
a/test/unit/org/apache/cassandra/utils/bytecomparable/ByteSourceInverseTest.java b/test/unit/org/apache/cassandra/utils/bytecomparable/ByteSourceInverseTest.java new file mode 100644 index 000000000000..391a8d383f31 --- /dev/null +++ b/test/unit/org/apache/cassandra/utils/bytecomparable/ByteSourceInverseTest.java @@ -0,0 +1,397 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.cassandra.utils.bytecomparable; + +import org.apache.cassandra.db.marshal.*; +import org.apache.cassandra.utils.*; +import org.apache.cassandra.utils.memory.MemoryUtil; + +import org.junit.Assert; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; + +import java.lang.reflect.Method; +import java.nio.ByteBuffer; +import java.nio.charset.StandardCharsets; +import java.util.*; +import java.util.function.Consumer; +import java.util.function.Function; +import java.util.function.IntConsumer; +import java.util.function.LongConsumer; +import java.util.stream.*; + +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; + +@RunWith(Parameterized.class) +public class ByteSourceInverseTest +{ + private static final String ALPHABET = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ1234567890!@#$%^&*()"; + + @Parameterized.Parameters(name = "version={0}") + public static Iterable versions() + { + return ImmutableList.of(ByteComparable.Version.OSS42); + } + + private final ByteComparable.Version version; + + public ByteSourceInverseTest(ByteComparable.Version version) + { + this.version = version; + } + + @Test + public void testGetSignedInt() + { + IntConsumer intConsumer = initial -> + { + ByteSource byteSource = ByteSource.of(initial); + int decoded = ByteSourceInverse.getSignedInt(byteSource); + Assert.assertEquals(initial, decoded); + }; + + IntStream.of(Integer.MIN_VALUE, Integer.MIN_VALUE + 1, + -256, -255, -128, -127, -1, 0, 1, 127, 128, 255, 256, + Integer.MAX_VALUE - 1, Integer.MAX_VALUE) + .forEach(intConsumer); + new Random().ints(1000) + .forEach(intConsumer); + } + + @Test + public void testNextInt() + { + // The high and low 32 bits of this long differ only in the first and last bit (in the high 32 bits they are + // both 0s instead of 1s). The first bit difference will be negated by the bit flipping when writing down a + // fixed length signed number, so the only remaining difference will be in the last bit. 
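+        // Concretely (assuming the fixed-length signed encoding inverts only the sign bit):
+        // hi = 0x12345678 and lo = 0x92345679, so l1 = 0x1234567892345679L encodes as bytes
+        // 92 34 56 78 92 34 56 79. Reading two 4-byte signed ints re-inverts the top bit of
+        // each group, yielding 0x12345678 and then 0x12345679, i.e. consecutive values.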
+ int hi = 0b0001_0010_0011_0100_0101_0110_0111_1000; + int lo = hi | 1 | 1 << 31; + long l1 = Integer.toUnsignedLong(hi) << 32 | Integer.toUnsignedLong(lo); + + ByteSource byteSource = ByteSource.of(l1); + int i1 = ByteSourceInverse.getSignedInt(byteSource); + int i2 = ByteSourceInverse.getSignedInt(byteSource); + Assert.assertEquals(i1 + 1, i2); + + try + { + ByteSourceInverse.getSignedInt(byteSource); + Assert.fail(); + } + catch (IllegalArgumentException e) + { + // Expected. + } + + byteSource = ByteSource.of(l1); + int iFirst = ByteSourceInverse.getSignedInt(byteSource); + Assert.assertEquals(i1, iFirst); + int iNext = ByteSourceInverse.getSignedInt(byteSource); + Assert.assertEquals(i2, iNext); + } + + @Test + public void testGetSignedLong() + { + LongConsumer longConsumer = initial -> + { + ByteSource byteSource = ByteSource.of(initial); + long decoded = ByteSourceInverse.getSignedLong(byteSource); + Assert.assertEquals(initial, decoded); + }; + + LongStream.of(Long.MIN_VALUE, Long.MIN_VALUE + 1, Integer.MIN_VALUE - 1L, + -256L, -255L, -128L, -127L, -1L, 0L, 1L, 127L, 128L, 255L, 256L, + Integer.MAX_VALUE + 1L, Long.MAX_VALUE - 1, Long.MAX_VALUE) + .forEach(longConsumer); + new Random().longs(1000) + .forEach(longConsumer); + } + + @Test + public void testGetSignedByte() + { + Consumer byteConsumer = boxedByte -> + { + byte initial = boxedByte; + ByteBuffer byteBuffer = ByteType.instance.decompose(initial); + ByteSource byteSource = ByteType.instance.asComparableBytes(byteBuffer, version); + byte decoded = ByteSourceInverse.getSignedByte(byteSource); + Assert.assertEquals(initial, decoded); + }; + + IntStream.range(Byte.MIN_VALUE, Byte.MAX_VALUE + 1) + .forEach(byteInteger -> byteConsumer.accept((byte) byteInteger)); + } + + @Test + public void testGetSignedShort() + { + Consumer shortConsumer = boxedShort -> + { + short initial = boxedShort; + ByteBuffer shortBuffer = ShortType.instance.decompose(initial); + ByteSource byteSource = ShortType.instance.asComparableBytes(shortBuffer, version); + short decoded = ByteSourceInverse.getSignedShort(byteSource); + Assert.assertEquals(initial, decoded); + }; + + IntStream.range(Short.MIN_VALUE, Short.MAX_VALUE + 1) + .forEach(shortInteger -> shortConsumer.accept((short) shortInteger)); + } + + @Test + public void testBadByteSourceForFixedLengthNumbers() + { + byte[] bytes = new byte[8]; + new Random().nextBytes(bytes); + for (Map.Entry entries : ImmutableMap.of("getSignedInt", 4, + "getSignedLong", 8, + "getSignedByte", 1, + "getSignedShort", 2).entrySet()) + { + String methodName = entries.getKey(); + int length = entries.getValue(); + try + { + Method fixedLengthNumberMethod = ByteSourceInverse.class.getMethod(methodName, ByteSource.class); + ArrayList sources = new ArrayList<>(); + sources.add(null); + sources.add(ByteSource.EMPTY); + for (int i = 0; i < length; ++i) + sources.add(ByteSource.fixedLength(bytes, 0, i)); + // Note: not testing invalid bytes (e.g. using the construction below) as they signify a programming + // error (throwing AssertionError) rather than something that could happen due to e.g. bad files. 
+ // ByteSource.withTerminatorLegacy(257, ByteSource.fixedLength(bytes, 0, length - 1)); + for (ByteSource badSource : sources) + { + try + { + fixedLengthNumberMethod.invoke(ByteSourceInverse.class, badSource); + Assert.fail("Expected exception not thrown"); + } + catch (Throwable maybe) + { + maybe = Throwables.unwrapped(maybe); + final String message = "Unexpected throwable " + maybe + " with cause " + maybe.getCause(); + if (badSource == null) + Assert.assertTrue(message, + maybe instanceof NullPointerException); + else + Assert.assertTrue(message, + maybe instanceof IllegalArgumentException); + } + } + } + catch (NoSuchMethodException e) + { + Assert.fail("Expected ByteSourceInverse to have method called " + methodName + + " with a single parameter of type ByteSource"); + } + } + } + + @Test + public void testBadByteSourceForVariableLengthNumbers() + { + for (long value : Arrays.asList(0L, 1L << 6, 1L << 13, 1L << 20, 1L << 27, 1L << 34, 1L << 41, 1L << 48, 1L << 55)) + { + Assert.assertEquals(value, ByteSourceInverse.getVariableLengthInteger(ByteSource.variableLengthInteger(value))); + + ArrayList sources = new ArrayList<>(); + sources.add(null); + sources.add(ByteSource.EMPTY); + int length = ByteComparable.length(version -> ByteSource.variableLengthInteger(value), ByteComparable.Version.OSS42); + for (int i = 0; i < length; ++i) + sources.add(ByteSource.cut(ByteSource.variableLengthInteger(value), i)); + + for (ByteSource badSource : sources) + { + try + { + ByteSourceInverse.getVariableLengthInteger(badSource); + Assert.fail("Expected exception not thrown"); + } + catch (Throwable maybe) + { + maybe = Throwables.unwrapped(maybe); + final String message = "Unexpected throwable " + maybe + " with cause " + maybe.getCause(); + if (badSource == null) + Assert.assertTrue(message, + maybe instanceof NullPointerException); + else + Assert.assertTrue(message, + maybe instanceof IllegalArgumentException); + } + } + } + } + + @Test + public void testGetString() + { + Consumer stringConsumer = initial -> + { + ByteSource.Peekable byteSource = initial == null ? 
null : ByteSource.peekable(ByteSource.of(initial, version)); + String decoded = ByteSourceInverse.getString(byteSource); + Assert.assertEquals(initial, decoded); + }; + + Stream.of(null, "© 2018 DataStax", "", "\n", "\0", "\0\0", "\001", "0", "0\0", "00", "1") + .forEach(stringConsumer); + + Random prng = new Random(); + int stringLength = 10; + String random; + for (int i = 0; i < 1000; ++i) + { + random = newRandomAlphanumeric(prng, stringLength); + stringConsumer.accept(random); + } + } + + private static String newRandomAlphanumeric(Random prng, int length) + { + StringBuilder random = new StringBuilder(length); + for (int i = 0; i < length; ++i) + random.append(ALPHABET.charAt(prng.nextInt(ALPHABET.length()))); + return random.toString(); + } + + @Test + public void testGetByteBuffer() + { + for (Consumer byteArrayConsumer : Arrays.>asList(initialBytes -> + { + ByteSource.Peekable byteSource = ByteSource.peekable(ByteSource.of(ByteBuffer.wrap(initialBytes), version)); + byte[] decodedBytes = ByteSourceInverse.getUnescapedBytes(byteSource); + Assert.assertArrayEquals(initialBytes, decodedBytes); + }, + initialBytes -> + { + ByteSource.Peekable byteSource = ByteSource.peekable(ByteSource.of(initialBytes, version)); + byte[] decodedBytes = ByteSourceInverse.getUnescapedBytes(byteSource); + Assert.assertArrayEquals(initialBytes, decodedBytes); + }, + initialBytes -> + { + long address = MemoryUtil.allocate(initialBytes.length); + try + { + MemoryUtil.setBytes(address, initialBytes, 0, initialBytes.length); + ByteSource.Peekable byteSource = ByteSource.peekable(ByteSource.ofMemory(address, initialBytes.length, version)); + byte[] decodedBytes = ByteSourceInverse.getUnescapedBytes(byteSource); + Assert.assertArrayEquals(initialBytes, decodedBytes); + } + finally + { + MemoryUtil.free(address); + } + } + )) + { + for (byte[] tricky : Arrays.asList( + // ESCAPE - leading, in the middle, trailing + new byte[]{ 0, 2, 3, 4, 5 }, new byte[]{ 1, 2, 0, 4, 5 }, new byte[]{ 1, 2, 3, 4, 0 }, + // END_OF_STREAM/ESCAPED_0_DONE - leading, in the middle, trailing + new byte[]{ -1, 2, 3, 4, 5 }, new byte[]{ 1, 2, -1, 4, 5 }, new byte[]{ 1, 2, 3, 4, -1 }, + // ESCAPED_0_CONT - leading, in the middle, trailing + new byte[]{ -2, 2, 3, 4, 5 }, new byte[]{ 1, 2, -2, 4, 5 }, new byte[]{ 1, 2, 3, 4, -2 }, + // ESCAPE + ESCAPED_0_DONE - leading, in the middle, trailing + new byte[]{ 0, -1, 3, 4, 5 }, new byte[]{ 1, 0, -1, 4, 5 }, new byte[]{ 1, 2, 3, 0, -1 }, + // ESCAPE + ESCAPED_0_CONT + ESCAPED_0_DONE - leading, in the middle, trailing + new byte[]{ 0, -2, -1, 4, 5 }, new byte[]{ 1, 0, -2, -1, 5 }, new byte[]{ 1, 2, 0, -2, -1 })) + { + byteArrayConsumer.accept(tricky); + } + + byte[] bytes = new byte[1000]; + Random prng = new Random(); + for (int i = 0; i < 1000; ++i) + { + prng.nextBytes(bytes); + byteArrayConsumer.accept(bytes); + } + + int stringLength = 10; + String random; + for (int i = 0; i < 1000; ++i) + { + random = newRandomAlphanumeric(prng, stringLength); + byteArrayConsumer.accept(random.getBytes(StandardCharsets.UTF_8)); + } + } + } + + @Test + public void testReadBytes() + { + Map, Function> generatorPerType = new HashMap<>(); + List originalValues = new ArrayList<>(); + Random prng = new Random(); + + generatorPerType.put(String.class, s -> + { + String string = (String) s; + return ByteSource.of(string, version); + }); + for (int i = 0; i < 100; ++i) + originalValues.add(newRandomAlphanumeric(prng, 10)); + + generatorPerType.put(Integer.class, i -> + { + Integer integer = (Integer) i; + return 
ByteSource.of(integer); + }); + for (int i = 0; i < 100; ++i) + originalValues.add(prng.nextInt()); + + generatorPerType.put(Long.class, l -> + { + Long looong = (Long) l; + return ByteSource.of(looong); + }); + for (int i = 0; i < 100; ++i) + originalValues.add(prng.nextLong()); + + generatorPerType.put(UUID.class, u -> + { + UUID uuid = (UUID) u; + ByteBuffer uuidBuffer = UUIDType.instance.decompose(uuid); + return UUIDType.instance.asComparableBytes(uuidBuffer, version); + }); + for (int i = 0; i < 100; ++i) + originalValues.add(UUID.randomUUID()); + + for (Object value : originalValues) + { + Class type = value.getClass(); + Function generator = generatorPerType.get(type); + ByteSource originalSource = generator.apply(value); + ByteSource originalSourceCopy = generator.apply(value); + byte[] bytes = ByteSourceInverse.readBytes(originalSource); + // The best way to test the read bytes seems to be to assert that just directly using them as a + // ByteSource (using ByteSource.fixedLength(byte[])) they compare as equal to another ByteSource obtained + // from the same original value. + int compare = ByteComparable.compare(v -> originalSourceCopy, v -> ByteSource.fixedLength(bytes), version); + Assert.assertEquals(0, compare); + } + } +} diff --git a/test/unit/org/apache/cassandra/utils/bytecomparable/ByteSourceSequenceTest.java b/test/unit/org/apache/cassandra/utils/bytecomparable/ByteSourceSequenceTest.java new file mode 100644 index 000000000000..aa7843bba2f4 --- /dev/null +++ b/test/unit/org/apache/cassandra/utils/bytecomparable/ByteSourceSequenceTest.java @@ -0,0 +1,784 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.cassandra.utils.bytecomparable; + +import java.math.BigDecimal; +import java.math.BigInteger; +import java.nio.ByteBuffer; +import java.util.Arrays; +import java.util.HashMap; +import java.util.Map; +import java.util.Random; +import java.util.TreeMap; +import java.util.function.Function; + +import com.google.common.collect.ImmutableList; +import org.junit.Assert; +import org.junit.Test; +import org.junit.runner.RunWith; +import org.junit.runners.Parameterized; + +import org.apache.cassandra.db.BufferClusteringBound; +import org.apache.cassandra.db.BufferDecoratedKey; +import org.apache.cassandra.db.CachedHashDecoratedKey; +import org.apache.cassandra.db.ClusteringComparator; +import org.apache.cassandra.db.ClusteringPrefix; +import org.apache.cassandra.db.marshal.*; +import org.apache.cassandra.dht.IPartitioner; +import org.apache.cassandra.dht.LocalPartitioner; +import org.apache.cassandra.utils.TimeUUID; + +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotNull; +import static org.junit.Assert.assertNull; + +@RunWith(Parameterized.class) +public class ByteSourceSequenceTest +{ + + private static final String ALPHABET = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ1234567890!@#$%^&*()"; + + @Parameterized.Parameters(name = "version={0}") + public static Iterable versions() + { + return ImmutableList.of(ByteComparable.Version.OSS42); + } + + private final ByteComparable.Version version; + + public ByteSourceSequenceTest(ByteComparable.Version version) + { + this.version = version; + } + + @Test + public void testNullsSequence() + { + ByteSource.Peekable comparableBytes = ByteSource.peekable(ByteSource.withTerminator( + ByteSource.TERMINATOR, + null, null, null + )); + expectNextComponentNull(comparableBytes); + expectNextComponentNull(comparableBytes); + expectNextComponentNull(comparableBytes); + assertEquals(ByteSource.TERMINATOR, comparableBytes.next()); + } + + @Test + public void testNullsAndUnknownLengthsSequence() + { + ByteSource.Peekable comparableBytes = ByteSource.peekable(ByteSource.withTerminator( + ByteSource.TERMINATOR, + null, ByteSource.of("b", version), ByteSource.of("c", version) + )); + expectNextComponentNull(comparableBytes); + expectNextComponentValue(comparableBytes, ByteSourceInverse::getString, "b"); + expectNextComponentValue(comparableBytes, ByteSourceInverse::getString, "c"); + assertEquals(ByteSource.TERMINATOR, comparableBytes.next()); + + comparableBytes = ByteSource.peekable(ByteSource.withTerminator( + ByteSource.TERMINATOR, + ByteSource.of("a", version), null, ByteSource.of("c", version) + )); + expectNextComponentValue(comparableBytes, ByteSourceInverse::getString, "a"); + expectNextComponentNull(comparableBytes); + expectNextComponentValue(comparableBytes, ByteSourceInverse::getString, "c"); + assertEquals(ByteSource.TERMINATOR, comparableBytes.next()); + + comparableBytes = ByteSource.peekable(ByteSource.withTerminator( + ByteSource.TERMINATOR, + ByteSource.of("a", version), ByteSource.of("b", version), null + )); + expectNextComponentValue(comparableBytes, ByteSourceInverse::getString, "a"); + expectNextComponentValue(comparableBytes, ByteSourceInverse::getString, "b"); + expectNextComponentNull(comparableBytes); + assertEquals(ByteSource.TERMINATOR, comparableBytes.next()); + + comparableBytes = ByteSource.peekable(ByteSource.withTerminator( + ByteSource.TERMINATOR, + ByteSource.of("a", version), null, null + )); + expectNextComponentValue(comparableBytes, 
ByteSourceInverse::getString, "a"); + expectNextComponentNull(comparableBytes); + expectNextComponentNull(comparableBytes); + assertEquals(ByteSource.TERMINATOR, comparableBytes.next()); + + comparableBytes = ByteSource.peekable(ByteSource.withTerminator( + ByteSource.TERMINATOR, + null, null, ByteSource.of("c", version) + )); + expectNextComponentNull(comparableBytes); + expectNextComponentNull(comparableBytes); + expectNextComponentValue(comparableBytes, ByteSourceInverse::getString, "c"); + assertEquals(ByteSource.TERMINATOR, comparableBytes.next()); + } + + private static void expectNextComponentNull(ByteSource.Peekable comparableBytes) + { + // We expect null-signifying separator, followed by a null ByteSource component + ByteSource.Peekable next = ByteSourceInverse.nextComponentSource(comparableBytes); + assertNull(next); + } + + private static void expectNextComponentValue(ByteSource.Peekable comparableBytes, + Function decoder, + T expected) + { + // We expect a regular separator, followed by a ByteSource component corresponding to the expected value + ByteSource.Peekable next = ByteSourceInverse.nextComponentSource(comparableBytes); + assertNotNull(next); + T decoded = decoder.apply(next); + assertEquals(expected, decoded); + } + + @Test + public void testNullsAndKnownLengthsSequence() + { + int intValue = 42; + BigInteger varintValue = BigInteger.valueOf(2018L); + ByteSource.Peekable comparableBytes = ByteSource.peekable(ByteSource.withTerminator( + ByteSource.TERMINATOR, + null, ByteSource.of(intValue), varintToByteSource(varintValue) + )); + expectNextComponentNull(comparableBytes); + expectNextComponentValue(comparableBytes, ByteSourceInverse::getSignedInt, intValue); + expectNextComponentValue(comparableBytes, VARINT, varintValue); + assertEquals(ByteSource.TERMINATOR, comparableBytes.next()); + + comparableBytes = ByteSource.peekable(ByteSource.withTerminator( + ByteSource.TERMINATOR, + ByteSource.of(intValue), null, varintToByteSource(varintValue) + )); + expectNextComponentValue(comparableBytes, ByteSourceInverse::getSignedInt, intValue); + expectNextComponentNull(comparableBytes); + expectNextComponentValue(comparableBytes, VARINT, varintValue); + assertEquals(ByteSource.TERMINATOR, comparableBytes.next()); + + comparableBytes = ByteSource.peekable(ByteSource.withTerminator( + ByteSource.TERMINATOR, + ByteSource.of(intValue), varintToByteSource(varintValue), null + )); + expectNextComponentValue(comparableBytes, ByteSourceInverse::getSignedInt, intValue); + expectNextComponentValue(comparableBytes, VARINT, varintValue); + expectNextComponentNull(comparableBytes); + assertEquals(ByteSource.TERMINATOR, comparableBytes.next()); + + comparableBytes = ByteSource.peekable(ByteSource.withTerminator( + ByteSource.TERMINATOR, + null, null, varintToByteSource(varintValue) + )); + expectNextComponentNull(comparableBytes); + expectNextComponentNull(comparableBytes); + expectNextComponentValue(comparableBytes, VARINT, varintValue); + assertEquals(ByteSource.TERMINATOR, comparableBytes.next()); + + comparableBytes = ByteSource.peekable(ByteSource.withTerminator( + ByteSource.TERMINATOR, + null, varintToByteSource(varintValue), null + )); + expectNextComponentNull(comparableBytes); + expectNextComponentValue(comparableBytes, VARINT, varintValue); + expectNextComponentNull(comparableBytes); + assertEquals(ByteSource.TERMINATOR, comparableBytes.next()); + + comparableBytes = ByteSource.peekable(ByteSource.withTerminator( + ByteSource.TERMINATOR, + varintToByteSource(varintValue), null, 
+
+        comparableBytes = ByteSource.peekable(ByteSource.withTerminator(
+                ByteSource.TERMINATOR,
+                varintToByteSource(varintValue), null, null
+        ));
+        expectNextComponentValue(comparableBytes, VARINT, varintValue);
+        expectNextComponentNull(comparableBytes);
+        expectNextComponentNull(comparableBytes);
+        assertEquals(ByteSource.TERMINATOR, comparableBytes.next());
+
+        Boolean boolValue = new Random().nextBoolean();
+        ByteSource boolSource = BooleanType.instance.asComparableBytes(BooleanType.instance.decompose(boolValue), version);
+        comparableBytes = ByteSource.peekable(ByteSource.withTerminator(
+                ByteSource.TERMINATOR,
+                varintToByteSource(varintValue), boolSource, null
+        ));
+        expectNextComponentValue(comparableBytes, VARINT, varintValue);
+        expectNextComponentValue(comparableBytes, BooleanType.instance, boolValue);
+        expectNextComponentNull(comparableBytes);
+        assertEquals(ByteSource.TERMINATOR, comparableBytes.next());
+
+        boolSource = BooleanType.instance.asComparableBytes(BooleanType.instance.decompose(boolValue), version);
+        comparableBytes = ByteSource.peekable(ByteSource.withTerminator(
+                ByteSource.TERMINATOR,
+                varintToByteSource(varintValue), null, boolSource
+        ));
+        expectNextComponentValue(comparableBytes, VARINT, varintValue);
+        expectNextComponentNull(comparableBytes);
+        expectNextComponentValue(comparableBytes, BooleanType.instance, boolValue);
+        assertEquals(ByteSource.TERMINATOR, comparableBytes.next());
+    }
+
+    @Test
+    public void testOptionalSignedFixedLengthTypesSequence()
+    {
+        Random prng = new Random();
+        String randomString = newRandomAlphanumeric(prng, 10);
+        byte randomByte = (byte) prng.nextInt();
+        short randomShort = (short) prng.nextInt();
+        int randomInt = prng.nextInt();
+        long randomLong = prng.nextLong();
+        BigInteger randomVarint = BigInteger.probablePrime(80, prng);
+
+        Map<AbstractType<?>, ByteBuffer> valuesByType = new HashMap<AbstractType<?>, ByteBuffer>()
+        {{
+            put(ByteType.instance, ByteType.instance.decompose(randomByte));
+            put(ShortType.instance, ShortType.instance.decompose(randomShort));
+            put(SimpleDateType.instance, SimpleDateType.instance.decompose(randomInt));
+            put(TimeType.instance, TimeType.instance.decompose(randomLong));
+        }};
+
+        for (Map.Entry<AbstractType<?>, ByteBuffer> entry : valuesByType.entrySet())
+        {
+            AbstractType<?> type = entry.getKey();
+            ByteBuffer value = entry.getValue();
+
+            ByteSource byteSource = type.asComparableBytes(value, version);
+            ByteSource.Peekable sequence = ByteSource.peekable(ByteSource.withTerminator(
+                    ByteSource.TERMINATOR,
+                    ByteSource.of(randomString, version), byteSource, varintToByteSource(randomVarint)
+            ));
+            expectNextComponentValue(sequence, ByteSourceInverse::getString, randomString);
+            expectNextComponentValue(sequence, type, value);
+            expectNextComponentValue(sequence, VARINT, randomVarint);
+            assertEquals(ByteSource.TERMINATOR, sequence.next());
+
+            byteSource = type.asComparableBytes(type.decompose(null), version);
+            sequence = ByteSource.peekable(ByteSource.withTerminator(
+                    ByteSource.TERMINATOR,
+                    ByteSource.of(randomString, version), byteSource, varintToByteSource(randomVarint)
+            ));
+            expectNextComponentValue(sequence, ByteSourceInverse::getString, randomString);
+            expectNextComponentNull(sequence);
+            expectNextComponentValue(sequence, VARINT, randomVarint);
+            assertEquals(ByteSource.TERMINATOR, sequence.next());
+        }
+    }
+
+    private ByteSource varintToByteSource(BigInteger value)
+    {
+        ByteBuffer valueByteBuffer = VARINT.decompose(value);
+        return VARINT.asComparableBytes(valueByteBuffer, version);
+    }
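+
+    // Editorial sketch (hedged addition, not part of the original patch): the decoding pattern the
+    // helpers above rely on, end to end, using only APIs already exercised in this class.
+    private void exampleSequenceDecoding()
+    {
+        ByteSource.Peekable seq = ByteSource.peekable(ByteSource.withTerminator(
+                ByteSource.TERMINATOR,
+                ByteSource.of("key", version), null, varintToByteSource(BigInteger.TEN)));
+        assertEquals("key", ByteSourceInverse.getString(ByteSourceInverse.nextComponentSource(seq)));
+        assertNull(ByteSourceInverse.nextComponentSource(seq)); // the null component decodes to null
+        assertEquals(BigInteger.TEN, VARINT.compose(VARINT.fromComparableBytes(ByteSourceInverse.nextComponentSource(seq), version)));
+        assertEquals(ByteSource.TERMINATOR, seq.next());
+    }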
+
+    private static final UTF8Type UTF8 = UTF8Type.instance;
+    private static final DecimalType DECIMAL = DecimalType.instance;
+    private static final IntegerType VARINT = IntegerType.instance;
+
+    // A regular comparator using the natural ordering for all types.
+    private static final ClusteringComparator COMP = new ClusteringComparator(Arrays.asList(
+            UTF8,
+            DECIMAL,
+            VARINT
+    ));
+    // A comparator that reverses the ordering for the first unknown length type
+    private static final ClusteringComparator COMP_REVERSED_UNKNOWN_LENGTH = new ClusteringComparator(Arrays.asList(
+            ReversedType.getInstance(UTF8),
+            DECIMAL,
+            VARINT
+    ));
+    // A comparator that reverses the ordering for the second unknown length type
+    private static final ClusteringComparator COMP_REVERSED_UNKNOWN_LENGTH_2 = new ClusteringComparator(Arrays.asList(
+            UTF8,
+            ReversedType.getInstance(DECIMAL),
+            VARINT
+    ));
+    // A comparator that reverses the ordering for the sole known/computable length type
+    private static final ClusteringComparator COMP_REVERSED_KNOWN_LENGTH = new ClusteringComparator(Arrays.asList(
+            UTF8,
+            DECIMAL,
+            ReversedType.getInstance(VARINT)
+    ));
+    // A comparator that reverses the ordering for all types
+    private static final ClusteringComparator COMP_ALL_REVERSED = new ClusteringComparator(Arrays.asList(
+            ReversedType.getInstance(UTF8),
+            ReversedType.getInstance(DECIMAL),
+            ReversedType.getInstance(VARINT)
+    ));
+
+    @Test
+    public void testClusteringPrefixBoundNormalAndReversed()
+    {
+        String stringValue = "Lorem ipsum dolor sit amet";
+        BigDecimal decimalValue = BigDecimal.valueOf(123456789, 20);
+        BigInteger varintValue = BigInteger.valueOf(2018L);
+
+        // Create some non-null clustering key values that will be encoded and decoded to byte-ordered representation
+        // with different types of clustering comparators (and in other tests with different types of prefixes).
+        ByteBuffer[] clusteringKeyValues = new ByteBuffer[] {
+                UTF8.decompose(stringValue),
+                DECIMAL.decompose(decimalValue),
+                VARINT.decompose(varintValue)
+        };
+
+        for (ClusteringPrefix.Kind prefixKind : ClusteringPrefix.Kind.values())
+        {
+            if (prefixKind.isBoundary())
+                continue;
+
+            ClusteringPrefix<ByteBuffer> prefix = BufferClusteringBound.create(prefixKind, clusteringKeyValues);
+            // Use the regular comparator.
+            ByteSource.Peekable comparableBytes = ByteSource.peekable(COMP.asByteComparable(prefix).asComparableBytes(version));
+            expectNextComponentValue(comparableBytes, UTF8, stringValue);
+            expectNextComponentValue(comparableBytes, DECIMAL, decimalValue);
+            expectNextComponentValue(comparableBytes, VARINT, varintValue);
+
+            prefix = BufferClusteringBound.create(prefixKind, clusteringKeyValues);
+            // Use the comparator reversing the ordering for the first unknown length type.
+            comparableBytes = ByteSource.peekable(COMP_REVERSED_UNKNOWN_LENGTH.asByteComparable(prefix).asComparableBytes(version));
+            expectNextComponentValue(comparableBytes, ReversedType.getInstance(UTF8), stringValue);
+            expectNextComponentValue(comparableBytes, DECIMAL, decimalValue);
+            expectNextComponentValue(comparableBytes, VARINT, varintValue);
+
+            prefix = BufferClusteringBound.create(prefixKind, clusteringKeyValues);
+            // Use the comparator reversing the ordering for the second unknown length type.
+            comparableBytes = ByteSource.peekable(COMP_REVERSED_UNKNOWN_LENGTH_2.asByteComparable(prefix).asComparableBytes(version));
+            expectNextComponentValue(comparableBytes, UTF8, stringValue);
+            expectNextComponentValue(comparableBytes, ReversedType.getInstance(DECIMAL), decimalValue);
+            expectNextComponentValue(comparableBytes, VARINT, varintValue);
+
+            prefix = BufferClusteringBound.create(prefixKind, clusteringKeyValues);
+            // Use the comparator reversing the ordering for the known/computable length type.
+            comparableBytes = ByteSource.peekable(COMP_REVERSED_KNOWN_LENGTH.asByteComparable(prefix).asComparableBytes(version));
+            expectNextComponentValue(comparableBytes, UTF8, stringValue);
+            expectNextComponentValue(comparableBytes, DECIMAL, decimalValue);
+            expectNextComponentValue(comparableBytes, ReversedType.getInstance(VARINT), varintValue);
+
+            prefix = BufferClusteringBound.create(prefixKind, clusteringKeyValues);
+            // Use the all-reversing comparator.
+            comparableBytes = ByteSource.peekable(COMP_ALL_REVERSED.asByteComparable(prefix).asComparableBytes(version));
+            expectNextComponentValue(comparableBytes, ReversedType.getInstance(UTF8), stringValue);
+            expectNextComponentValue(comparableBytes, ReversedType.getInstance(DECIMAL), decimalValue);
+            expectNextComponentValue(comparableBytes, ReversedType.getInstance(VARINT), varintValue);
+        }
+    }
+
+    @Test
+    public void testClusteringPrefixBoundNulls()
+    {
+        String stringValue = "Lorem ipsum dolor sit amet";
+        BigDecimal decimalValue = BigDecimal.valueOf(123456789, 20);
+        BigInteger varintValue = BigInteger.valueOf(2018L);
+
+        // Create clustering key values where the component for an unknown length type is null.
+        ByteBuffer[] unknownLengthNull = new ByteBuffer[] {
+                UTF8.decompose(stringValue),
+                DECIMAL.decompose(null),
+                VARINT.decompose(varintValue)
+        };
+        // Create clustering key values where the component for a known/computable length type is null.
+        ByteBuffer[] knownLengthNull = new ByteBuffer[] {
+                UTF8.decompose(stringValue),
+                DECIMAL.decompose(decimalValue),
+                VARINT.decompose(null)
+        };
+
+        for (ClusteringPrefix.Kind prefixKind : ClusteringPrefix.Kind.values())
+        {
+            if (prefixKind.isBoundary())
+                continue;
+
+            // Test the decoding of a null component of a non-reversed unknown length type.
+            ClusteringPrefix<ByteBuffer> prefix = BufferClusteringBound.create(prefixKind, unknownLengthNull);
+            ByteSource.Peekable comparableBytes = ByteSource.peekable(COMP.asByteComparable(prefix).asComparableBytes(version));
+            expectNextComponentValue(comparableBytes, UTF8, stringValue);
+            expectNextComponentNull(comparableBytes);
+            expectNextComponentValue(comparableBytes, VARINT, varintValue);
+            // Test the decoding of a null component of a reversed unknown length type.
+            prefix = BufferClusteringBound.create(prefixKind, unknownLengthNull);
+            comparableBytes = ByteSource.peekable(COMP_REVERSED_UNKNOWN_LENGTH_2.asByteComparable(prefix).asComparableBytes(version));
+            expectNextComponentValue(comparableBytes, UTF8, stringValue);
+            expectNextComponentNull(comparableBytes);
+            expectNextComponentValue(comparableBytes, VARINT, varintValue);
+
+            // Test the decoding of a null component of a non-reversed known/computable length type.
+            prefix = BufferClusteringBound.create(prefixKind, knownLengthNull);
+            comparableBytes = ByteSource.peekable(COMP.asByteComparable(prefix).asComparableBytes(version));
+            expectNextComponentValue(comparableBytes, UTF8, stringValue);
+            expectNextComponentValue(comparableBytes, DECIMAL, decimalValue);
+            expectNextComponentNull(comparableBytes);
+            // Test the decoding of a null component of a reversed known/computable length type.
+            prefix = BufferClusteringBound.create(prefixKind, knownLengthNull);
+            comparableBytes = ByteSource.peekable(COMP_REVERSED_KNOWN_LENGTH.asByteComparable(prefix).asComparableBytes(version));
+            expectNextComponentValue(comparableBytes, UTF8, stringValue);
+            expectNextComponentValue(comparableBytes, DECIMAL, decimalValue);
+            expectNextComponentNull(comparableBytes);
+        }
+    }
+
+    private <T> void expectNextComponentValue(ByteSource.Peekable comparableBytes,
+                                              AbstractType<T> type,
+                                              T expected)
+    {
+        // We expect a regular separator, followed by a ByteSource component corresponding to the expected value
+        ByteSource.Peekable next = ByteSourceInverse.nextComponentSource(comparableBytes);
+        T decoded = type.compose(type.fromComparableBytes(next, version));
+        assertEquals(expected, decoded);
+    }
+
+    private void expectNextComponentValue(ByteSource.Peekable comparableBytes,
+                                          AbstractType<?> type,
+                                          ByteBuffer expected)
+    {
+        // We expect a regular separator, followed by a ByteSource component corresponding to the expected value
+        ByteSource.Peekable next = ByteSourceInverse.nextComponentSource(comparableBytes);
+        assertEquals(expected, type.fromComparableBytes(next, version));
+    }
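+
+    // NOTE (editorial addition, hedged): ByteSourceInverse.nextComponentSource() consumes the separator
+    // byte ahead of each component, yielding a readable sub-source for regular separators and null for
+    // the null-signifying ones, which is what the two helpers above rely on; the trailing terminator
+    // byte is left unconsumed so tests can assert on it directly via comparableBytes.next().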
+
+    @Test
+    public void testGetBoundFromPrefixTerminator()
+    {
+        String stringValue = "Lorem ipsum dolor sit amet";
+        BigDecimal decimalValue = BigDecimal.valueOf(123456789, 20);
+        BigInteger varintValue = BigInteger.valueOf(2018L);
+
+        ByteBuffer[] clusteringKeyValues = new ByteBuffer[] {
+                UTF8.decompose(stringValue),
+                DECIMAL.decompose(decimalValue),
+                VARINT.decompose(varintValue)
+        };
+        ByteBuffer[] nullValueBeforeTerminator = new ByteBuffer[] {
+                UTF8.decompose(stringValue),
+                DECIMAL.decompose(decimalValue),
+                VARINT.decompose(null)
+        };
+
+        for (ClusteringPrefix.Kind prefixKind : ClusteringPrefix.Kind.values())
+        {
+            // NOTE dimitar.dimitrov I assume there's a sensible explanation why STATIC_CLUSTERING uses a custom
+            // terminator that's not one of the common separator values, but I haven't spent enough time to get it.
+            if (prefixKind.isBoundary())
+                continue;
+
+            // Test that the read terminator value is exactly the encoded value of this prefix' bound.
+            ClusteringPrefix<ByteBuffer> prefix = BufferClusteringBound.create(prefixKind, clusteringKeyValues);
+            ByteSource.Peekable comparableBytes = ByteSource.peekable(COMP.asByteComparable(prefix).asComparableBytes(version));
+            assertEquals(ByteSource.NEXT_COMPONENT, comparableBytes.next());
+            ByteSourceInverse.getString(comparableBytes);
+            assertEquals(ByteSource.NEXT_COMPONENT, comparableBytes.next());
+            DECIMAL.fromComparableBytes(comparableBytes, version);
+            assertEquals(ByteSource.NEXT_COMPONENT, comparableBytes.next());
+            VARINT.fromComparableBytes(comparableBytes, version);
+            // Expect the last separator (i.e. the terminator) to be the one specified by the prefix kind.
+            assertEquals(prefixKind.asByteComparableValue(version), comparableBytes.next());
+
+            // Test that the read terminator value is exactly the encoded value of this prefix' bound, when the
+            // terminator is preceded by a null value.
+            prefix = BufferClusteringBound.create(prefixKind, nullValueBeforeTerminator);
+            comparableBytes = ByteSource.peekable(COMP.asByteComparable(prefix).asComparableBytes(version));
+            assertEquals(ByteSource.NEXT_COMPONENT, comparableBytes.next());
+            ByteSourceInverse.getString(comparableBytes);
+            assertEquals(ByteSource.NEXT_COMPONENT, comparableBytes.next());
+            DECIMAL.fromComparableBytes(comparableBytes, version);
+            // Expect null-signifying separator here.
+            assertEquals(ByteSource.NEXT_COMPONENT_EMPTY, comparableBytes.next());
+            // No varint to read
+            // Expect the last separator (i.e. the terminator) to be the one specified by the prefix kind.
+            assertEquals(prefixKind.asByteComparableValue(version), comparableBytes.next());
+
+            // Test that the read terminator value is exactly the encoded value of this prefix' bound, when the
+            // terminator is preceded by a reversed null value.
+            prefix = BufferClusteringBound.create(prefixKind, nullValueBeforeTerminator);
+            // That's the comparator that will reverse the ordering of the type of the last value in the prefix (the
+            // one before the terminator). In other tests we're more interested in the fact that values of this type
+            // have known/computable length, which is why we've named it so...
+            comparableBytes = ByteSource.peekable(COMP_REVERSED_KNOWN_LENGTH.asByteComparable(prefix).asComparableBytes(version));
+            assertEquals(ByteSource.NEXT_COMPONENT, comparableBytes.next());
+            ByteSourceInverse.getString(comparableBytes);
+            assertEquals(ByteSource.NEXT_COMPONENT, comparableBytes.next());
+            DECIMAL.fromComparableBytes(comparableBytes, version);
+            // Expect reversed null-signifying separator here.
+            assertEquals(ByteSource.NEXT_COMPONENT_EMPTY_REVERSED, comparableBytes.next());
+            // No varint to read
+            // Expect the last separator (i.e. the terminator) to be the one specified by the prefix kind.
+            assertEquals(prefixKind.asByteComparableValue(version), comparableBytes.next());
+        }
+    }
+
+    @Test
+    public void testReversedTypesInClusteringKey()
+    {
+        String stringValue = "Lorem ipsum dolor sit amet";
+        BigDecimal decimalValue = BigDecimal.valueOf(123456789, 20);
+
+        AbstractType<String> reversedStringType = ReversedType.getInstance(UTF8);
+        AbstractType<BigDecimal> reversedDecimalType = ReversedType.getInstance(DECIMAL);
+
+        final ClusteringComparator comparator = new ClusteringComparator(Arrays.asList(
+                // unknown length type
+                UTF8,
+                // known length type
+                DECIMAL,
+                // reversed unknown length type
+                reversedStringType,
+                // reversed known length type
+                reversedDecimalType
+        ));
+        ByteBuffer[] clusteringKeyValues = new ByteBuffer[] {
+                UTF8.decompose(stringValue),
+                DECIMAL.decompose(decimalValue),
+                UTF8.decompose(stringValue),
+                DECIMAL.decompose(decimalValue)
+        };
+
+        final ClusteringComparator comparator2 = new ClusteringComparator(Arrays.asList(
+                // known length type
+                DECIMAL,
+                // unknown length type
+                UTF8,
+                // reversed known length type
+                reversedDecimalType,
+                // reversed unknown length type
+                reversedStringType
+        ));
+        ByteBuffer[] clusteringKeyValues2 = new ByteBuffer[] {
+                DECIMAL.decompose(decimalValue),
+                UTF8.decompose(stringValue),
+                DECIMAL.decompose(decimalValue),
+                UTF8.decompose(stringValue)
+        };
+
+        for (ClusteringPrefix.Kind prefixKind : ClusteringPrefix.Kind.values())
+        {
+            if (prefixKind.isBoundary())
+                continue;
+
+            ClusteringPrefix<ByteBuffer> prefix = BufferClusteringBound.create(prefixKind, clusteringKeyValues);
+            ByteSource.Peekable comparableBytes = ByteSource.peekable(comparator.asByteComparable(prefix).asComparableBytes(version));
+
+            assertEquals(ByteSource.NEXT_COMPONENT, comparableBytes.next());
+            assertEquals(getComponentValue(UTF8, comparableBytes), stringValue);
+            assertEquals(ByteSource.NEXT_COMPONENT, comparableBytes.next());
+            assertEquals(getComponentValue(DECIMAL, comparableBytes), decimalValue);
+            assertEquals(ByteSource.NEXT_COMPONENT, comparableBytes.next());
+            assertEquals(getComponentValue(reversedStringType, comparableBytes), stringValue);
+            assertEquals(ByteSource.NEXT_COMPONENT, comparableBytes.next());
+            assertEquals(getComponentValue(reversedDecimalType, comparableBytes), decimalValue);
+
+            assertEquals(prefixKind.asByteComparableValue(version), comparableBytes.next());
+            assertEquals(ByteSource.END_OF_STREAM, comparableBytes.next());
+
+            ClusteringPrefix<ByteBuffer> prefix2 = BufferClusteringBound.create(prefixKind, clusteringKeyValues2);
+            ByteSource.Peekable comparableBytes2 = ByteSource.peekable(comparator2.asByteComparable(prefix2).asComparableBytes(version));
+
+            assertEquals(ByteSource.NEXT_COMPONENT, comparableBytes2.next());
+            assertEquals(getComponentValue(DECIMAL, comparableBytes2), decimalValue);
+            assertEquals(ByteSource.NEXT_COMPONENT, comparableBytes2.next());
+            assertEquals(getComponentValue(UTF8, comparableBytes2), stringValue);
+            assertEquals(ByteSource.NEXT_COMPONENT, comparableBytes2.next());
+            assertEquals(getComponentValue(reversedDecimalType, comparableBytes2), decimalValue);
+            assertEquals(ByteSource.NEXT_COMPONENT, comparableBytes2.next());
+            assertEquals(getComponentValue(reversedStringType, comparableBytes2), stringValue);
+
+            assertEquals(prefixKind.asByteComparableValue(version), comparableBytes2.next());
+            assertEquals(ByteSource.END_OF_STREAM, comparableBytes2.next());
+        }
+    }
+
+    private <T extends AbstractType<E>, E> E getComponentValue(T type, ByteSource.Peekable comparableBytes)
+    {
+        return type.compose(type.fromComparableBytes(comparableBytes, version));
+    }
+
+    @Test
+    public void testReadingNestedSequence_Simple()
+    {
+        String padding1 = "A string";
+        String padding2 = "Another string";
+
+        BigInteger varint1 = BigInteger.valueOf(0b10000000);
+        BigInteger varint2 = BigInteger.valueOf(1 >> 30);
+        BigInteger varint3 = BigInteger.valueOf(0x10000000L);
+        BigInteger varint4 = BigInteger.valueOf(Long.MAX_VALUE);
+
+        String string1 = "Testing byte sources";
+        String string2 = "is neither easy nor fun;";
+        String string3 = "But do it we must.";
+        String string4 = "— DataStax, 2018";
+
+        MapType<BigInteger, String> varintStringMapType = MapType.getInstance(VARINT, UTF8, false);
+        Map<BigInteger, String> varintStringMap = new TreeMap<>();
+        varintStringMap.put(varint1, string1);
+        varintStringMap.put(varint2, string2);
+        varintStringMap.put(varint3, string3);
+        varintStringMap.put(varint4, string4);
+
+        ByteSource sequence = ByteSource.withTerminator(
+                ByteSource.TERMINATOR,
+                ByteSource.of(padding1, version),
+                varintStringMapType.asComparableBytes(varintStringMapType.decompose(varintStringMap), version),
+                ByteSource.of(padding2, version)
+        );
+        ByteSource.Peekable comparableBytes = ByteSource.peekable(sequence);
+        assertEquals(ByteSource.NEXT_COMPONENT, comparableBytes.next());
+        assertEquals(getComponentValue(UTF8, comparableBytes), padding1);
+        assertEquals(ByteSource.NEXT_COMPONENT, comparableBytes.next());
+        assertEquals(getComponentValue(varintStringMapType, comparableBytes), varintStringMap);
+        assertEquals(ByteSource.NEXT_COMPONENT, comparableBytes.next());
+        assertEquals(getComponentValue(UTF8, comparableBytes), padding2);
+
+        sequence = ByteSource.withTerminator(
+                ByteSource.TERMINATOR,
+                varintStringMapType.asComparableBytes(varintStringMapType.decompose(varintStringMap), version),
+                ByteSource.of(padding1, version),
+                ByteSource.of(padding2, version)
+        );
+        comparableBytes = ByteSource.peekable(sequence);
+        assertEquals(ByteSource.NEXT_COMPONENT, comparableBytes.next());
+        assertEquals(getComponentValue(varintStringMapType, comparableBytes), varintStringMap);
+        assertEquals(ByteSource.NEXT_COMPONENT, comparableBytes.next());
+        assertEquals(getComponentValue(UTF8, comparableBytes), padding1);
+        assertEquals(ByteSource.NEXT_COMPONENT, comparableBytes.next());
+        assertEquals(getComponentValue(UTF8, comparableBytes), padding2);
+
+        sequence = ByteSource.withTerminator(
+                ByteSource.TERMINATOR,
+                ByteSource.of(padding1, version),
+                ByteSource.of(padding2, version),
+                varintStringMapType.asComparableBytes(varintStringMapType.decompose(varintStringMap), version)
+        );
+        comparableBytes = ByteSource.peekable(sequence);
+        assertEquals(ByteSource.NEXT_COMPONENT, comparableBytes.next());
+        assertEquals(getComponentValue(UTF8, comparableBytes), padding1);
+        assertEquals(ByteSource.NEXT_COMPONENT, comparableBytes.next());
+        assertEquals(getComponentValue(UTF8, comparableBytes), padding2);
+        assertEquals(ByteSource.NEXT_COMPONENT, comparableBytes.next());
+        assertEquals(getComponentValue(varintStringMapType, comparableBytes), varintStringMap);
+
+        MapType<String, BigInteger> stringVarintMapType = MapType.getInstance(UTF8, VARINT, false);
+        Map<String, BigInteger> stringVarintMap = new HashMap<>();
+        stringVarintMap.put(string1, varint1);
+        stringVarintMap.put(string2, varint2);
+        stringVarintMap.put(string3, varint3);
+        stringVarintMap.put(string4, varint4);
+
+        sequence = ByteSource.withTerminator(
+                ByteSource.TERMINATOR,
+                ByteSource.of(padding1, version),
+                stringVarintMapType.asComparableBytes(stringVarintMapType.decompose(stringVarintMap), version),
+                ByteSource.of(padding2, version)
+        );
+        comparableBytes = ByteSource.peekable(sequence);
+        assertEquals(ByteSource.NEXT_COMPONENT, comparableBytes.next());
+        assertEquals(getComponentValue(UTF8, comparableBytes), padding1);
+        assertEquals(ByteSource.NEXT_COMPONENT, comparableBytes.next());
+        assertEquals(getComponentValue(stringVarintMapType, comparableBytes), stringVarintMap);
+        assertEquals(ByteSource.NEXT_COMPONENT, comparableBytes.next());
+        assertEquals(getComponentValue(UTF8, comparableBytes), padding2);
+
+        MapType<String, String> stringStringMapType = MapType.getInstance(UTF8, UTF8, false);
+        Map<String, String> stringStringMap = new HashMap<>();
+        stringStringMap.put(string1, string4);
+        stringStringMap.put(string2, string3);
+        stringStringMap.put(string3, string2);
+        stringStringMap.put(string4, string1);
+
+        sequence = ByteSource.withTerminator(
+                ByteSource.TERMINATOR,
+                ByteSource.of(padding1, version),
+                stringStringMapType.asComparableBytes(stringStringMapType.decompose(stringStringMap), version),
+                ByteSource.of(padding2, version)
+        );
+        comparableBytes = ByteSource.peekable(sequence);
+        assertEquals(ByteSource.NEXT_COMPONENT, comparableBytes.next());
+        assertEquals(getComponentValue(UTF8, comparableBytes), padding1);
+        assertEquals(ByteSource.NEXT_COMPONENT, comparableBytes.next());
+        assertEquals(getComponentValue(stringStringMapType, comparableBytes), stringStringMap);
+        assertEquals(ByteSource.NEXT_COMPONENT, comparableBytes.next());
+        assertEquals(getComponentValue(UTF8, comparableBytes), padding2);
+
+        MapType<BigInteger, BigInteger> varintVarintMapType = MapType.getInstance(VARINT, VARINT, false);
+        Map<BigInteger, BigInteger> varintVarintMap = new HashMap<>();
+        varintVarintMap.put(varint1, varint4);
+        varintVarintMap.put(varint2, varint3);
+        varintVarintMap.put(varint3, varint2);
+        varintVarintMap.put(varint4, varint1);
+
+        sequence = ByteSource.withTerminator(
+                ByteSource.TERMINATOR,
+                ByteSource.of(padding1, version),
+                varintVarintMapType.asComparableBytes(varintVarintMapType.decompose(varintVarintMap), version),
+                ByteSource.of(padding2, version)
+        );
+        comparableBytes = ByteSource.peekable(sequence);
+        assertEquals(ByteSource.NEXT_COMPONENT, comparableBytes.next());
+        assertEquals(getComponentValue(UTF8, comparableBytes), padding1);
+        assertEquals(ByteSource.NEXT_COMPONENT, comparableBytes.next());
+        assertEquals(getComponentValue(varintVarintMapType, comparableBytes), varintVarintMap);
+        assertEquals(ByteSource.NEXT_COMPONENT, comparableBytes.next());
+        assertEquals(getComponentValue(UTF8, comparableBytes), padding2);
+    }
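+
+    // Editorial sketch (hedged addition, not part of the original patch): collection values such as
+    // maps are themselves encoded as terminated sub-sequences, which is presumably why a map component
+    // can sit between plain string components above without ambiguity. A minimal round trip, using only
+    // APIs exercised in this class:
+    private void exampleNestedMapComponent()
+    {
+        MapType<String, String> mapType = MapType.getInstance(UTF8, UTF8, false);
+        Map<String, String> map = new HashMap<>();
+        map.put("key", "value");
+        ByteSource.Peekable seq = ByteSource.peekable(ByteSource.withTerminator(
+                ByteSource.TERMINATOR,
+                mapType.asComparableBytes(mapType.decompose(map), version)));
+        assertEquals(ByteSource.NEXT_COMPONENT, seq.next());
+        assertEquals(map, getComponentValue(mapType, seq));
+        assertEquals(ByteSource.TERMINATOR, seq.next());
+    }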
+
+    @Test
+    public void testReadingNestedSequence_DecoratedKey()
+    {
+        Random prng = new Random();
+
+        MapType<String, BigDecimal> stringDecimalMapType = MapType.getInstance(UTF8, DECIMAL, false);
+        Map<String, BigDecimal> stringDecimalMap = new HashMap<>();
+        for (int i = 0; i < 4; ++i)
+            stringDecimalMap.put(newRandomAlphanumeric(prng, 10), BigDecimal.valueOf(prng.nextDouble()));
+        ByteBuffer key = stringDecimalMapType.decompose(stringDecimalMap);
+        testDecodingKeyWithLocalPartitionerForType(key, stringDecimalMapType);
+
+        MapType<BigDecimal, String> decimalStringMapType = MapType.getInstance(DECIMAL, UTF8, false);
+        Map<BigDecimal, String> decimalStringMap = new HashMap<>();
+        for (int i = 0; i < 4; ++i)
+            decimalStringMap.put(BigDecimal.valueOf(prng.nextDouble()), newRandomAlphanumeric(prng, 10));
+        key = decimalStringMapType.decompose(decimalStringMap);
+        testDecodingKeyWithLocalPartitionerForType(key, decimalStringMapType);
+
+        if (version != ByteComparable.Version.LEGACY)
+        {
+            CompositeType stringDecimalCompType = CompositeType.getInstance(UTF8, DECIMAL);
+            key = stringDecimalCompType.decompose(newRandomAlphanumeric(prng, 10), BigDecimal.valueOf(prng.nextDouble()));
+            testDecodingKeyWithLocalPartitionerForType(key, stringDecimalCompType);
+
+            CompositeType decimalStringCompType = CompositeType.getInstance(DECIMAL, UTF8);
+            key = decimalStringCompType.decompose(BigDecimal.valueOf(prng.nextDouble()), newRandomAlphanumeric(prng, 10));
+            testDecodingKeyWithLocalPartitionerForType(key, decimalStringCompType);
+
+            DynamicCompositeType dynamicCompType = DynamicCompositeType.getInstance(DynamicCompositeTypeTest.aliases);
+            key = DynamicCompositeTypeTest.createDynamicCompositeKey(
+                    newRandomAlphanumeric(prng, 10), TimeUUID.Generator.nextTimeAsUUID(), 42, true, false);
+            testDecodingKeyWithLocalPartitionerForType(key, dynamicCompType);
+
+            key = DynamicCompositeTypeTest.createDynamicCompositeKey(
+                    newRandomAlphanumeric(prng, 10), TimeUUID.Generator.nextTimeAsUUID(), 42, true, true);
+            testDecodingKeyWithLocalPartitionerForType(key, dynamicCompType);
+        }
+    }
+
+    private static String newRandomAlphanumeric(Random prng, int length)
+    {
+        StringBuilder random = new StringBuilder(length);
+        for (int i = 0; i < length; ++i)
+            random.append(ALPHABET.charAt(prng.nextInt(ALPHABET.length())));
+        return random.toString();
+    }
+
+    private void testDecodingKeyWithLocalPartitionerForType(ByteBuffer key, AbstractType<?> type)
+    {
+        IPartitioner partitioner = new LocalPartitioner(type);
+        CachedHashDecoratedKey initial = (CachedHashDecoratedKey) partitioner.decorateKey(key);
+        BufferDecoratedKey base = BufferDecoratedKey.fromByteComparable(initial, version, partitioner);
+        CachedHashDecoratedKey decoded = new CachedHashDecoratedKey(base.getToken(), base.getKey());
+        Assert.assertEquals(initial, decoded);
+    }
+}
diff --git a/test/unit/org/apache/cassandra/utils/bytecomparable/ByteSourceTestBase.java b/test/unit/org/apache/cassandra/utils/bytecomparable/ByteSourceTestBase.java
new file mode 100644
index 000000000000..90463f63050e
--- /dev/null
+++ b/test/unit/org/apache/cassandra/utils/bytecomparable/ByteSourceTestBase.java
@@ -0,0 +1,513 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +package org.apache.cassandra.utils.bytecomparable; + +import java.math.BigDecimal; +import java.math.BigInteger; +import java.net.InetAddress; +import java.net.UnknownHostException; +import java.time.Instant; +import java.util.ArrayList; +import java.util.Date; +import java.util.List; +import java.util.Set; +import java.util.TreeSet; +import java.util.UUID; + +import com.google.common.base.Throwables; + +import org.apache.cassandra.db.marshal.AbstractType; +import org.apache.cassandra.db.marshal.BooleanType; +import org.apache.cassandra.db.marshal.DecimalType; +import org.apache.cassandra.db.marshal.DoubleType; +import org.apache.cassandra.db.marshal.Int32Type; +import org.apache.cassandra.db.marshal.IntegerType; +import org.apache.cassandra.db.marshal.UTF8Type; +import org.apache.cassandra.utils.TimeUUID; + +public class ByteSourceTestBase +{ + String[] testStrings = new String[]{ "", "\0", "\0\0", "\001", "A\0\0B", "A\0B\0", "0", "0\0", "00", "1", "\377" }; + Integer[] testInts = new Integer[]{ null, + Integer.MIN_VALUE, + Integer.MIN_VALUE + 1, + -256, + -255, + -128, + -127, + -64, + -63, + -1, + 0, + 1, + 63, + 64, + 127, + 128, + 255, + 256, + Integer.MAX_VALUE - 1, + Integer.MAX_VALUE }; + Byte[] testBytes = new Byte[]{ -128, -127, -1, 0, 1, 127 }; + Short[] testShorts = new Short[]{ Short.MIN_VALUE, + Short.MIN_VALUE + 1, + -256, + -255, + -128, + -127, + -65, + -64, + -63, + -1, + 0, + 1, + 127, + 128, + 255, + 256, + Short.MAX_VALUE - 1, + Short.MAX_VALUE }; + Long[] testLongs = new Long[]{ null, + Long.MIN_VALUE, + Long.MIN_VALUE + 1, + Integer.MIN_VALUE - 1L, + -256L, + -255L, + -128L, + -127L, + -65L, + -64L, + -63L, + -1L, + 0L, + 1L, + Integer.MAX_VALUE + 1L, + Long.MAX_VALUE - 1, + Long.MAX_VALUE, + + (1L << 1) - 1, + (1L << 1), + (1L << 2) - 1, + (1L << 2), + (1L << 3) - 1, + (1L << 3), + (1L << 4) - 1, + (1L << 4), + (1L << 5) - 1, + (1L << 5), + (1L << 6) - 1, + (1L << 6), + (1L << 7) - 1, + (1L << 7), + (1L << 8) - 1, + (1L << 8), + (1L << 9) - 1, + (1L << 9), + (1L << 10) - 1, + (1L << 10), + (1L << 11) - 1, + (1L << 11), + (1L << 12) - 1, + (1L << 12), + (1L << 13) - 1, + (1L << 13), + (1L << 14) - 1, + (1L << 14), + (1L << 15) - 1, + (1L << 15), + (1L << 16) - 1, + (1L << 16), + (1L << 17) - 1, + (1L << 17), + (1L << 18) - 1, + (1L << 18), + (1L << 19) - 1, + (1L << 19), + (1L << 20) - 1, + (1L << 20), + (1L << 21) - 1, + (1L << 21), + (1L << 22) - 1, + (1L << 22), + (1L << 23) - 1, + (1L << 23), + (1L << 24) - 1, + (1L << 24), + (1L << 25) - 1, + (1L << 25), + (1L << 26) - 1, + (1L << 26), + (1L << 27) - 1, + (1L << 27), + (1L << 28) - 1, + (1L << 28), + (1L << 29) - 1, + (1L << 29), + (1L << 30) - 1, + (1L << 30), + (1L << 31) - 1, + (1L << 31), + (1L << 32) - 1, + (1L << 32), + (1L << 33) - 1, + (1L << 33), + (1L << 34) - 1, + (1L << 34), + (1L << 35) - 1, + (1L << 35), + (1L << 36) - 1, + (1L << 36), + (1L << 37) - 1, + (1L << 37), + (1L << 38) - 1, + (1L << 38), + (1L << 39) - 1, + (1L << 39), + (1L << 40) - 1, + (1L << 40), + (1L << 41) - 1, + (1L << 41), + (1L << 42) - 1, + (1L << 42), + (1L << 43) - 1, + (1L << 43), + (1L << 44) - 1, + (1L << 44), + (1L << 45) - 1, + (1L << 45), + (1L << 46) - 1, + (1L << 46), + (1L << 47) - 1, + (1L << 47), + (1L << 48) - 1, + (1L << 48), + (1L << 49) - 1, + (1L << 49), + (1L << 50) - 1, + (1L << 50), + (1L << 51) - 1, + (1L << 51), + (1L << 52) - 1, + (1L << 52), + (1L << 53) - 1, + (1L << 53), + (1L << 54) - 1, + (1L << 54), + (1L << 55) - 1, + (1L << 55), + (1L << 56) - 1, + (1L << 56), + (1L << 57) - 1, + (1L << 
57), + (1L << 58) - 1, + (1L << 58), + (1L << 59) - 1, + (1L << 59), + (1L << 60) - 1, + (1L << 60), + (1L << 61) - 1, + (1L << 61), + (1L << 62) - 1, + (1L << 62), + (1L << 63) - 1, + + ~((1L << 1) - 1), + ~((1L << 1)), + ~((1L << 2) - 1), + ~((1L << 2)), + ~((1L << 3) - 1), + ~((1L << 3)), + ~((1L << 4) - 1), + ~((1L << 4)), + ~((1L << 5) - 1), + ~((1L << 5)), + ~((1L << 6) - 1), + ~((1L << 6)), + ~((1L << 7) - 1), + ~((1L << 7)), + ~((1L << 8) - 1), + ~((1L << 8)), + ~((1L << 9) - 1), + ~((1L << 9)), + ~((1L << 10) - 1), + ~((1L << 10)), + ~((1L << 11) - 1), + ~((1L << 11)), + ~((1L << 12) - 1), + ~((1L << 12)), + ~((1L << 13) - 1), + ~((1L << 13)), + ~((1L << 14) - 1), + ~((1L << 14)), + ~((1L << 15) - 1), + ~((1L << 15)), + ~((1L << 16) - 1), + ~((1L << 16)), + ~((1L << 17) - 1), + ~((1L << 17)), + ~((1L << 18) - 1), + ~((1L << 18)), + ~((1L << 19) - 1), + ~((1L << 19)), + ~((1L << 20) - 1), + ~((1L << 20)), + ~((1L << 21) - 1), + ~((1L << 21)), + ~((1L << 22) - 1), + ~((1L << 22)), + ~((1L << 23) - 1), + ~((1L << 23)), + ~((1L << 24) - 1), + ~((1L << 24)), + ~((1L << 25) - 1), + ~((1L << 25)), + ~((1L << 26) - 1), + ~((1L << 26)), + ~((1L << 27) - 1), + ~((1L << 27)), + ~((1L << 28) - 1), + ~((1L << 28)), + ~((1L << 29) - 1), + ~((1L << 29)), + ~((1L << 30) - 1), + ~((1L << 30)), + ~((1L << 31) - 1), + ~((1L << 31)), + ~((1L << 32) - 1), + ~((1L << 32)), + ~((1L << 33) - 1), + ~((1L << 33)), + ~((1L << 34) - 1), + ~((1L << 34)), + ~((1L << 35) - 1), + ~((1L << 35)), + ~((1L << 36) - 1), + ~((1L << 36)), + ~((1L << 37) - 1), + ~((1L << 37)), + ~((1L << 38) - 1), + ~((1L << 38)), + ~((1L << 39) - 1), + ~((1L << 39)), + ~((1L << 40) - 1), + ~((1L << 40)), + ~((1L << 41) - 1), + ~((1L << 41)), + ~((1L << 42) - 1), + ~((1L << 42)), + ~((1L << 43) - 1), + ~((1L << 43)), + ~((1L << 44) - 1), + ~((1L << 44)), + ~((1L << 45) - 1), + ~((1L << 45)), + ~((1L << 46) - 1), + ~((1L << 46)), + ~((1L << 47) - 1), + ~((1L << 47)), + ~((1L << 48) - 1), + ~((1L << 48)), + ~((1L << 49) - 1), + ~((1L << 49)), + ~((1L << 50) - 1), + ~((1L << 50)), + ~((1L << 51) - 1), + ~((1L << 51)), + ~((1L << 52) - 1), + ~((1L << 52)), + ~((1L << 53) - 1), + ~((1L << 53)), + ~((1L << 54) - 1), + ~((1L << 54)), + ~((1L << 55) - 1), + ~((1L << 55)), + ~((1L << 56) - 1), + ~((1L << 56)), + ~((1L << 57) - 1), + ~((1L << 57)), + ~((1L << 58) - 1), + ~((1L << 58)), + ~((1L << 59) - 1), + ~((1L << 59)), + ~((1L << 60) - 1), + ~((1L << 60)), + ~((1L << 61) - 1), + ~((1L << 61)), + ~((1L << 62) - 1), + ~((1L << 62)), + ~((1L << 63) - 1), + }; + Double[] testDoubles = new Double[]{ null, + Double.NEGATIVE_INFINITY, + -Double.MAX_VALUE, + -1e+200, + -1e3, + -1e0, + -1e-3, + -1e-200, + -Double.MIN_VALUE, + -0.0, + 0.0, + Double.MIN_VALUE, + 1e-200, + 1e-3, + 1e0, + 1e3, + 1e+200, + Double.MAX_VALUE, + Double.POSITIVE_INFINITY, + Double.NaN }; + Float[] testFloats = new Float[]{ null, + Float.NEGATIVE_INFINITY, + -Float.MAX_VALUE, + -1e+30f, + -1e3f, + -1e0f, + -1e-3f, + -1e-30f, + -Float.MIN_VALUE, + -0.0f, + 0.0f, + Float.MIN_VALUE, + 1e-30f, + 1e-3f, + 1e0f, + 1e3f, + 1e+30f, + Float.MAX_VALUE, + Float.POSITIVE_INFINITY, + Float.NaN }; + Boolean[] testBools = new Boolean[]{ null, false, true }; + UUID[] testUUIDs = new UUID[]{ null, + TimeUUID.Generator.nextTimeAsUUID(), + UUID.randomUUID(), + UUID.randomUUID(), + UUID.randomUUID(), + TimeUUID.Generator.atUnixMillis(123, 234).asUUID(), + TimeUUID.Generator.atUnixMillis(123, 234).asUUID(), + TimeUUID.Generator.atUnixMillis(123).asUUID(), + 
UUID.fromString("6ba7b811-9dad-11d1-80b4-00c04fd430c8"), + UUID.fromString("6ba7b810-9dad-11d1-80b4-00c04fd430c8"), + UUID.fromString("e902893a-9d22-3c7e-a7b8-d6e313b71d9f"), + UUID.fromString("74738ff5-5367-5958-9aee-98fffdcd1876"), + UUID.fromString("52df1bb0-6a2f-11e6-b6e4-a6dea7a01b67"), + UUID.fromString("52df1bb0-6a2f-11e6-362d-aff2143498ea"), + UUID.fromString("52df1bb0-6a2f-11e6-b62d-aff2143498ea") }; + // Instant.MIN/MAX fail Date.from. + Date[] testDates = new Date[]{ null, + Date.from(Instant.ofEpochSecond(Integer.MIN_VALUE)), + Date.from(Instant.ofEpochSecond(Short.MIN_VALUE)), + Date.from(Instant.ofEpochMilli(-2000)), + Date.from(Instant.EPOCH), + Date.from(Instant.ofEpochMilli(2000)), + Date.from(Instant.ofEpochSecond(Integer.MAX_VALUE)), + Date.from(Instant.now()) }; + InetAddress[] testInets; + { + try + { + testInets = new InetAddress[]{ null, + InetAddress.getLocalHost(), + InetAddress.getLoopbackAddress(), + InetAddress.getByName("192.168.0.1"), + InetAddress.getByName("fe80::428d:5cff:fe53:1dc9"), + InetAddress.getByName("2001:610:3:200a:192:87:36:2"), + InetAddress.getByName("10.0.0.1"), + InetAddress.getByName("0a00:0001::"), + InetAddress.getByName("::10.0.0.1") }; + } + catch (UnknownHostException e) + { + throw Throwables.propagate(e); + } + } + + BigInteger[] testBigInts; + + { + Set bigs = new TreeSet<>(); + for (Long l : testLongs) + if (l != null) + bigs.add(BigInteger.valueOf(l)); + for (int i = 0; i < 11; ++i) + { + bigs.add(BigInteger.valueOf(i)); + bigs.add(BigInteger.valueOf(-i)); + + bigs.add(BigInteger.valueOf((1L << 4 * i) - 1)); + bigs.add(BigInteger.valueOf((1L << 4 * i))); + bigs.add(BigInteger.valueOf(-(1L << 4 * i) - 1)); + bigs.add(BigInteger.valueOf(-(1L << 4 * i))); + String p = exp10(i); + bigs.add(new BigInteger(p)); + bigs.add(new BigInteger("-" + p)); + p = exp10(1 << i); + bigs.add(new BigInteger(p)); + bigs.add(new BigInteger("-" + p)); + + BigInteger base = BigInteger.ONE.shiftLeft(512 * i); + bigs.add(base); + bigs.add(base.add(BigInteger.ONE)); + bigs.add(base.subtract(BigInteger.ONE)); + base = base.negate(); + bigs.add(base); + bigs.add(base.add(BigInteger.ONE)); + bigs.add(base.subtract(BigInteger.ONE)); + } + testBigInts = bigs.toArray(new BigInteger[0]); + } + + static String exp10(int pow) + { + StringBuilder builder = new StringBuilder(); + builder.append('1'); + for (int i=0; i decs = new ArrayList<>(); + for (String s : vals.split(", ")) + { + decs.add(new BigDecimal(s)); + decs.add(new BigDecimal("-" + s)); + } + testBigDecimals = decs.toArray(new BigDecimal[0]); + } + + Object[][] testValues = new Object[][]{ testStrings, + testInts, + testBools, + testDoubles, + testBigInts, + testBigDecimals }; + + AbstractType[] testTypes = new AbstractType[]{ UTF8Type.instance, + Int32Type.instance, + BooleanType.instance, + DoubleType.instance, + IntegerType.instance, + DecimalType.instance }; +} diff --git a/test/unit/org/apache/cassandra/utils/bytecomparable/DecoratedKeyByteSourceTest.java b/test/unit/org/apache/cassandra/utils/bytecomparable/DecoratedKeyByteSourceTest.java new file mode 100644 index 000000000000..9a39550c3f30 --- /dev/null +++ b/test/unit/org/apache/cassandra/utils/bytecomparable/DecoratedKeyByteSourceTest.java @@ -0,0 +1,85 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
+}
diff --git a/test/unit/org/apache/cassandra/utils/bytecomparable/DecoratedKeyByteSourceTest.java b/test/unit/org/apache/cassandra/utils/bytecomparable/DecoratedKeyByteSourceTest.java
new file mode 100644
index 000000000000..9a39550c3f30
--- /dev/null
+++ b/test/unit/org/apache/cassandra/utils/bytecomparable/DecoratedKeyByteSourceTest.java
@@ -0,0 +1,85 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.cassandra.utils.bytecomparable;
+
+import java.nio.ByteBuffer;
+import java.util.Random;
+
+import com.google.common.collect.ImmutableList;
+import org.junit.Assert;
+import org.junit.Test;
+import org.junit.runner.RunWith;
+import org.junit.runners.Parameterized;
+
+import org.apache.cassandra.db.BufferDecoratedKey;
+import org.apache.cassandra.db.DecoratedKey;
+import org.apache.cassandra.dht.ByteOrderedPartitioner;
+
+@RunWith(Parameterized.class)
+public class DecoratedKeyByteSourceTest
+{
+    private static final int NUM_ITERATIONS = 100;
+    private static final int RANDOM_BYTES_LENGTH = 100;
+
+    @Parameterized.Parameters(name = "version={0}")
+    public static Iterable<ByteComparable.Version> versions()
+    {
+        return ImmutableList.of(ByteComparable.Version.OSS42);
+    }
+
+    private final ByteComparable.Version version;
+
+    public DecoratedKeyByteSourceTest(ByteComparable.Version version)
+    {
+        this.version = version;
+    }
+
+    @Test
+    public void testDecodeBufferDecoratedKey()
+    {
+        for (int i = 0; i < NUM_ITERATIONS; ++i)
+        {
+            BufferDecoratedKey initialBuffer =
+                    (BufferDecoratedKey) ByteOrderedPartitioner.instance.decorateKey(newRandomBytesBuffer());
+            BufferDecoratedKey decodedBuffer = BufferDecoratedKey.fromByteComparable(
+                    initialBuffer, version, ByteOrderedPartitioner.instance);
+            Assert.assertEquals(initialBuffer, decodedBuffer);
+        }
+    }
+
+    @Test
+    public void testDecodeKeyBytes()
+    {
+        for (int i = 0; i < NUM_ITERATIONS; ++i)
+        {
+            BufferDecoratedKey initialBuffer =
+                    (BufferDecoratedKey) ByteOrderedPartitioner.instance.decorateKey(newRandomBytesBuffer());
+            ByteSource.Peekable src = ByteSource.peekable(initialBuffer.asComparableBytes(version));
+            byte[] keyBytes = DecoratedKey.keyFromByteSource(src, version, ByteOrderedPartitioner.instance);
+            Assert.assertEquals(ByteSource.END_OF_STREAM, src.next());
+            Assert.assertArrayEquals(initialBuffer.getKey().array(), keyBytes);
+        }
+    }
+
+    private static ByteBuffer newRandomBytesBuffer()
+    {
+        byte[] randomBytes = new byte[RANDOM_BYTES_LENGTH];
+        new Random().nextBytes(randomBytes);
+        return ByteBuffer.wrap(randomBytes);
+    }
+}

From 7119cf8be1c32fa6ac015456965ae27a17cd828c Mon Sep 17 00:00:00 2001
From: Jyothsna Konisa
Date: Thu, 30 Jun 2022 09:52:50 -0700
Subject: [PATCH 020/159] Fix TestGossipingPropertyFileSnitch.test_prefer_local_reconnect_on_listen_address

patch by Jyothsna Konisa; reviewed by Jon Meredith, Yifan Cai for
CASSANDRA-17700
---
 CHANGES.txt                                           |  1 +
 .../cassandra/auth/IInternodeAuthenticator.java       | 11 +++++++++--
 .../cassandra/locator/ReconnectableSnitchHelper.java  |  5 ++++-
 .../cassandra/net/InboundConnectionSettings.java      | 10 ----------
 .../cassandra/net/OutboundConnectionInitiator.java    | 10 ++++++++++
 .../cassandra/net/OutboundConnectionSettings.java     |  6 ------
 .../test/InternodeEncryptionEnforcementTest.java      |  4 ++++
 7 files changed, 28 insertions(+), 19 deletions(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index 0893fbef64d0..99f64b06a6aa 100644
--- a/CHANGES.txt +++ b/CHANGES.txt @@ -1,4 +1,5 @@ 4.2 + * Fix TestGossipingPropertyFileSnitch.test_prefer_local_reconnect_on_listen_address (CASSANDRA-17700) * Add ByteComparable API (CASSANDRA-6936) * Add guardrail for maximum replication factor (CASSANDRA-17500) * Increment CQLSH to version 6.2.0 for release 4.2 (CASSANDRA-17646) diff --git a/src/java/org/apache/cassandra/auth/IInternodeAuthenticator.java b/src/java/org/apache/cassandra/auth/IInternodeAuthenticator.java index 02745fe925b2..e5038c09447c 100644 --- a/src/java/org/apache/cassandra/auth/IInternodeAuthenticator.java +++ b/src/java/org/apache/cassandra/auth/IInternodeAuthenticator.java @@ -82,11 +82,18 @@ default void setupInternode() } /** - * Enum that represents connection type of an internode connection. + * Enum that represents connection type of internode connection. + * + * INBOUND - called after connection established, with certificate available if present. + * OUTBOUND - called after connection established, with certificate available if present. + * OUTBOUND_PRECONNECT - called before initiating a connection, without certificate available. + * The outbound connection will be authenticated with the certificate once a redirected connection is established. + * This is an extra check that can be used to detect misconfiguration before reconnection, or ignored by returning true. */ enum InternodeConnectionDirection { INBOUND, - OUTBOUND + OUTBOUND, + OUTBOUND_PRECONNECT } } diff --git a/src/java/org/apache/cassandra/locator/ReconnectableSnitchHelper.java b/src/java/org/apache/cassandra/locator/ReconnectableSnitchHelper.java index b950ec34ac45..4ff726c4654d 100644 --- a/src/java/org/apache/cassandra/locator/ReconnectableSnitchHelper.java +++ b/src/java/org/apache/cassandra/locator/ReconnectableSnitchHelper.java @@ -30,6 +30,8 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import static org.apache.cassandra.auth.IInternodeAuthenticator.InternodeConnectionDirection.OUTBOUND_PRECONNECT; + /** * Sidekick helper for snitches that want to reconnect from one IP addr for a node to another. 
* Typically, this is for situations like EC2 where a node will have a public address and a private address, @@ -64,7 +66,8 @@ private void reconnect(InetAddressAndPort publicAddress, VersionedValue localAdd @VisibleForTesting static void reconnect(InetAddressAndPort publicAddress, InetAddressAndPort localAddress, IEndpointSnitch snitch, String localDc) { - if (!new OutboundConnectionSettings(publicAddress, localAddress).withDefaults(ConnectionCategory.MESSAGING).authenticate()) + final OutboundConnectionSettings settings = new OutboundConnectionSettings(publicAddress, localAddress).withDefaults(ConnectionCategory.MESSAGING); + if (!settings.authenticator().authenticate(settings.to.getAddress(), settings.to.getPort(), null, OUTBOUND_PRECONNECT)) { logger.debug("InternodeAuthenticator said don't reconnect to {} on {}", publicAddress, localAddress); return; diff --git a/src/java/org/apache/cassandra/net/InboundConnectionSettings.java b/src/java/org/apache/cassandra/net/InboundConnectionSettings.java index 2eab9bcb210e..44c2c4962f1c 100644 --- a/src/java/org/apache/cassandra/net/InboundConnectionSettings.java +++ b/src/java/org/apache/cassandra/net/InboundConnectionSettings.java @@ -71,16 +71,6 @@ public InboundConnectionSettings() this(null, null, null, null, null, null, null, null, null); } - public boolean authenticate(InetAddressAndPort endpoint) - { - return authenticator.authenticate(endpoint.getAddress(), endpoint.getPort()); - } - - public boolean authenticate(InetAddress address, int port) - { - return authenticator.authenticate(address, port); - } - public String toString() { return format("address: (%s), nic: %s, encryption: %s", diff --git a/src/java/org/apache/cassandra/net/OutboundConnectionInitiator.java b/src/java/org/apache/cassandra/net/OutboundConnectionInitiator.java index 9565f54846c7..7e38dd8812da 100644 --- a/src/java/org/apache/cassandra/net/OutboundConnectionInitiator.java +++ b/src/java/org/apache/cassandra/net/OutboundConnectionInitiator.java @@ -59,10 +59,12 @@ import org.apache.cassandra.security.ISslContextFactory; import org.apache.cassandra.security.SSLFactory; import org.apache.cassandra.utils.JVMStabilityInspector; +import org.apache.cassandra.utils.concurrent.ImmediateFuture; import org.apache.cassandra.utils.memory.BufferPools; import static java.util.concurrent.TimeUnit.*; import static org.apache.cassandra.auth.IInternodeAuthenticator.InternodeConnectionDirection.OUTBOUND; +import static org.apache.cassandra.auth.IInternodeAuthenticator.InternodeConnectionDirection.OUTBOUND_PRECONNECT; import static org.apache.cassandra.net.InternodeConnectionUtils.DISCARD_HANDLER_NAME; import static org.apache.cassandra.net.InternodeConnectionUtils.SSL_HANDLER_NAME; import static org.apache.cassandra.net.InternodeConnectionUtils.certificates; @@ -137,6 +139,14 @@ private Future> initiate(EventLoop eventLoop) if (logger.isTraceEnabled()) logger.trace("creating outbound bootstrap to {}, requestVersion: {}", settings, requestMessagingVersion); + if (!settings.authenticator.authenticate(settings.to.getAddress(), settings.to.getPort(), null, OUTBOUND_PRECONNECT)) + { + // interrupt other connections, so they must attempt to re-authenticate + MessagingService.instance().interruptOutbound(settings.to); + return ImmediateFuture.failure(new IOException("authentication failed to " + settings.connectToId())); + } + + // this is a bit ugly, but is the easiest way to ensure that if we timeout we can propagate a suitable error message // and still guarantee that, if on timing out we 
raced with success, the successfully created channel is handled AtomicBoolean timedout = new AtomicBoolean(); diff --git a/src/java/org/apache/cassandra/net/OutboundConnectionSettings.java b/src/java/org/apache/cassandra/net/OutboundConnectionSettings.java index db2873d93461..bcb6064552cd 100644 --- a/src/java/org/apache/cassandra/net/OutboundConnectionSettings.java +++ b/src/java/org/apache/cassandra/net/OutboundConnectionSettings.java @@ -25,7 +25,6 @@ import org.apache.cassandra.auth.IInternodeAuthenticator; import org.apache.cassandra.config.Config; import org.apache.cassandra.config.DatabaseDescriptor; -import org.apache.cassandra.config.EncryptionOptions; import org.apache.cassandra.config.EncryptionOptions.ServerEncryptionOptions; import org.apache.cassandra.db.SystemKeyspace; import org.apache.cassandra.locator.IEndpointSnitch; @@ -157,11 +156,6 @@ private OutboundConnectionSettings(IInternodeAuthenticator authenticator, this.endpointToVersion = endpointToVersion; } - public boolean authenticate() - { - return authenticator.authenticate(to.getAddress(), to.getPort()); - } - public boolean withEncryption() { return encryption != null; diff --git a/test/distributed/org/apache/cassandra/distributed/test/InternodeEncryptionEnforcementTest.java b/test/distributed/org/apache/cassandra/distributed/test/InternodeEncryptionEnforcementTest.java index 157aede9b7a4..c95ba5d90582 100644 --- a/test/distributed/org/apache/cassandra/distributed/test/InternodeEncryptionEnforcementTest.java +++ b/test/distributed/org/apache/cassandra/distributed/test/InternodeEncryptionEnforcementTest.java @@ -352,6 +352,10 @@ public static class CertificateVerifyAuthenticator implements IInternodeAuthenti @Override public boolean authenticate(InetAddress remoteAddress, int remotePort, Certificate[] certificates, InternodeConnectionDirection connectionType) { + if (connectionType == InternodeConnectionDirection.OUTBOUND_PRECONNECT) + { + return true; + } try { // Check if the presented certificates during internode authentication are the ones in the keystores From 33a9093c5cc2f8fcf913d1931415b697e52ec108 Mon Sep 17 00:00:00 2001 From: Eduard Tudenhoefner Date: Fri, 1 Jul 2022 08:35:04 +0200 Subject: [PATCH 021/159] Allow sstableloader to specify table without relying on path patch by Eduard Tudenhoefner; reviewed by Stefan Miklosovic, Brandon Williams for CASSANDRA-16584 --- CHANGES.txt | 1 + .../cassandra/io/sstable/Descriptor.java | 89 ++++++++++++++----- .../apache/cassandra/io/sstable/SSTable.java | 22 +++++ .../cassandra/io/sstable/SSTableLoader.java | 18 +++- .../apache/cassandra/tools/BulkLoader.java | 4 +- .../apache/cassandra/tools/LoaderOptions.java | 27 +++++- .../io/sstable/SSTableLoaderTest.java | 86 +++++++++++------- 7 files changed, 191 insertions(+), 56 deletions(-) diff --git a/CHANGES.txt b/CHANGES.txt index 99f64b06a6aa..e5cbc8ea68c8 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -1,4 +1,5 @@ 4.2 + * Allow sstableloader to specify table without relying on path (CASSANDRA-16584) * Fix TestGossipingPropertyFileSnitch.test_prefer_local_reconnect_on_listen_address (CASSANDRA-17700) * Add ByteComparable API (CASSANDRA-6936) * Add guardrail for maximum replication factor (CASSANDRA-17500) diff --git a/src/java/org/apache/cassandra/io/sstable/Descriptor.java b/src/java/org/apache/cassandra/io/sstable/Descriptor.java index 83bafd4ff4ab..589e46b015a4 100644 --- a/src/java/org/apache/cassandra/io/sstable/Descriptor.java +++ b/src/java/org/apache/cassandra/io/sstable/Descriptor.java @@ -218,6 +218,7 @@ 
public static Descriptor fromFilename(File file)
     /**
      * Parse a sstable filename, extracting both the {@code Descriptor} and {@code Component} part.
+     * The keyspace/table name will be extracted from the directory path.
      *
      * @param file the {@code File} object for the filename to parse.
      * @return a pair of the descriptor and component corresponding to the provided {@code file}.
@@ -233,6 +234,58 @@ public static Pair<Descriptor, Component> fromFilenameWithComponent(File file)
         if (!file.isAbsolute())
             file = file.toAbsolute();
 
+        SSTableInfo info = validateAndExtractInfo(file);
+        String name = file.name();
+
+        File directory = parentOf(name, file);
+        File tableDir = directory;
+
+        // Check if it's a 2ndary index directory (not that it doesn't exclude it to be also a backup or snapshot)
+        String indexName = "";
+        if (tableDir.name().startsWith(Directories.SECONDARY_INDEX_NAME_SEPARATOR))
+        {
+            indexName = tableDir.name();
+            tableDir = parentOf(name, tableDir);
+        }
+
+        // Then it can be a backup or a snapshot
+        if (tableDir.name().equals(Directories.BACKUPS_SUBDIR))
+            tableDir = tableDir.parent();
+        else if (parentOf(name, tableDir).name().equals(Directories.SNAPSHOT_SUBDIR))
+            tableDir = parentOf(name, parentOf(name, tableDir));
+
+        String table = tableDir.name().split("-")[0] + indexName;
+        String keyspace = parentOf(name, tableDir).name();
+
+        return Pair.create(new Descriptor(info.version, directory, keyspace, table, info.id, info.format), info.component);
+    }
+
+    /**
+     * Parse a sstable filename, extracting both the {@code Descriptor} and {@code Component} part.
+     *
+     * @param file the {@code File} object for the filename to parse.
+     * @param keyspace The keyspace name of the file. If null, then the keyspace name will be extracted
+     *                 from the directory path.
+     * @param table The table name of the file. If null, then the table name will be extracted from the
+     *              directory path.
+     * @return a pair of the descriptor and component corresponding to the provided {@code file}.
+     * @throws IllegalArgumentException if the provided {@code file} does not point to a valid sstable filename. This could
+     * mean either that the filename doesn't look like a sstable file, or that it is for an old and unsupported
+     * version.
+     */
+    public static Pair<Descriptor, Component> fromFilenameWithComponent(File file, String keyspace, String table)
+    {
+        if (null == keyspace || null == table)
+        {
+            return fromFilenameWithComponent(file);
+        }
+
+        SSTableInfo info = validateAndExtractInfo(file);
+        return Pair.create(new Descriptor(info.version, parentOf(file.name(), file), keyspace, table, info.id, info.format), info.component);
+    }
+
+    private static SSTableInfo validateAndExtractInfo(File file)
+    {
         String name = file.name();
         List<String> tokens = filenameSplitter.splitToList(name);
         int size = tokens.size();
@@ -245,9 +298,7 @@
         // Note that we assume it's an old format sstable if it has the right number of tokens: this is not perfect
         // but we're just trying to be helpful, not perfect.
         if (size == 5 || size == 6)
-            throw new IllegalArgumentException(String.format("%s is of version %s which is now unsupported and cannot be read.",
-                                                             name,
-                                                             tokens.get(size - 3)));
+            throw new IllegalArgumentException(String.format("%s is of version %s which is now unsupported and cannot be read.", name, tokens.get(size - 3)));
 
         throw new IllegalArgumentException(String.format("Invalid sstable file %s: the name doesn't look like a supported sstable file name", name));
     }
@@ -282,27 +333,23 @@ public static Pair<Descriptor, Component> fromFilenameWithComponent(File file)
         if (!version.isCompatible())
             throw invalidSSTable(name, "incompatible sstable version (%s); you should have run upgradesstables before upgrading", versionString);
 
-        File directory = parentOf(name, file);
-        File tableDir = directory;
+        return new SSTableInfo(version, id, format, component);
+    }
 
-        // Check if it's a 2ndary index directory (not that it doesn't exclude it to be also a backup or snapshot)
-        String indexName = "";
-        if (Directories.isSecondaryIndexFolder(tableDir))
+    private static class SSTableInfo
+    {
+        final Version version;
+        final SSTableId id;
+        final SSTableFormat.Type format;
+        final Component component;
+
+        SSTableInfo(Version version, SSTableId id, SSTableFormat.Type format, Component component)
         {
-            indexName = tableDir.name();
-            tableDir = parentOf(name, tableDir);
+            this.version = version;
+            this.id = id;
+            this.format = format;
+            this.component = component;
         }
-
-        // Then it can be a backup or a snapshot
-        if (tableDir.name().equals(Directories.BACKUPS_SUBDIR))
-            tableDir = tableDir.parent();
-        else if (parentOf(name, tableDir).name().equals(Directories.SNAPSHOT_SUBDIR))
-            tableDir = parentOf(name, parentOf(name, tableDir));
-
-        String table = tableDir.name().split("-")[0] + indexName;
-        String keyspace = parentOf(name, tableDir).name();
-
-        return Pair.create(new Descriptor(version, directory, keyspace, table, id, format), component);
     }
 
     private static File parentOf(String name, File file)
diff --git a/src/java/org/apache/cassandra/io/sstable/SSTable.java b/src/java/org/apache/cassandra/io/sstable/SSTable.java
index 488a7dc45a0b..6a691b1f0d00 100644
--- a/src/java/org/apache/cassandra/io/sstable/SSTable.java
+++ b/src/java/org/apache/cassandra/io/sstable/SSTable.java
@@ -216,6 +216,28 @@ public static Pair<Descriptor, Component> tryComponentFromFilename(File file)
         }
     }
 
+    /**
+     * Parse a sstable filename into both a {@link Descriptor} and {@code Component} object.
+     *
+     * @param file the filename to parse.
+     * @param keyspace The keyspace name of the file.
+     * @param table The table name of the file.
+     * @return a pair of the {@code Descriptor} and {@code Component} corresponding to {@code file} if it corresponds to
+     * a valid and supported sstable filename, {@code null} otherwise. Note that components of an unknown type will be
+     * returned as CUSTOM ones.
+     */
+    public static Pair<Descriptor, Component> tryComponentFromFilename(File file, String keyspace, String table)
+    {
+        try
+        {
+            return Descriptor.fromFilenameWithComponent(file, keyspace, table);
+        }
+        catch (Throwable e)
+        {
+            return null;
+        }
+    }
+
     /**
      * Parse a sstable filename into a {@link Descriptor} object.
      * <p>
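A hedged illustration of the new overload (editorial addition; the file path, keyspace and
table names below are hypothetical, not taken from the patch):

    File dataFile = new File("/any/dir/nb-1-big-Data.db");
    // The keyspace/table are supplied explicitly, so the directory layout is not consulted:
    Pair<Descriptor, Component> pair = Descriptor.fromFilenameWithComponent(dataFile, "ks1", "tbl1");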

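The two overloads of fromFilenameWithComponent above differ only in where the keyspace/table pair comes from: the original infers both names from the .../<keyspace>/<table>/ directory layout (handling secondary-index, backup and snapshot subdirectories), while the new one trusts the caller and skips the directory walk entirely. A minimal sketch of the two call patterns (hypothetical paths; everything else as defined in this patch):

    import org.apache.cassandra.io.sstable.Component;
    import org.apache.cassandra.io.sstable.Descriptor;
    import org.apache.cassandra.io.util.File;
    import org.apache.cassandra.utils.Pair;

    // Names inferred from the directory layout .../ks1/tbl1-<tableId>/
    Pair<Descriptor, Component> inferred =
        Descriptor.fromFilenameWithComponent(new File("/data/ks1/tbl1-1234/nb-1-big-Data.db"));

    // Names supplied by the caller, so the file can live anywhere on disk
    Pair<Descriptor, Component> explicit =
        Descriptor.fromFilenameWithComponent(new File("/tmp/staging/nb-1-big-Data.db"), "ks2", "tbl2");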
    diff --git a/src/java/org/apache/cassandra/io/sstable/SSTableLoader.java b/src/java/org/apache/cassandra/io/sstable/SSTableLoader.java index 3d9e0f4c170b..71bd025db845 100644 --- a/src/java/org/apache/cassandra/io/sstable/SSTableLoader.java +++ b/src/java/org/apache/cassandra/io/sstable/SSTableLoader.java @@ -48,6 +48,7 @@ public class SSTableLoader implements StreamEventHandler { private final File directory; private final String keyspace; + private final String table; private final Client client; private final int connectionsPerHost; private final OutputHandler outputHandler; @@ -62,9 +63,15 @@ public SSTableLoader(File directory, Client client, OutputHandler outputHandler) } public SSTableLoader(File directory, Client client, OutputHandler outputHandler, int connectionsPerHost, String targetKeyspace) + { + this(directory, client, outputHandler, connectionsPerHost, targetKeyspace, null); + } + + public SSTableLoader(File directory, Client client, OutputHandler outputHandler, int connectionsPerHost, String targetKeyspace, String targetTable) { this.directory = directory; this.keyspace = targetKeyspace != null ? targetKeyspace : directory.parent().name(); + this.table = targetTable; this.client = client; this.outputHandler = outputHandler; this.connectionsPerHost = connectionsPerHost; @@ -87,7 +94,16 @@ protected Collection openSSTables(final Map p = SSTable.tryComponentFromFilename(file); + Pair p; + if (null != keyspace && null != table) + { + p = SSTable.tryComponentFromFilename(file, keyspace, table); + } + else + { + p = SSTable.tryComponentFromFilename(file); + } + Descriptor desc = p == null ? null : p.left; if (p == null || !p.right.equals(Component.DATA)) return false; diff --git a/src/java/org/apache/cassandra/tools/BulkLoader.java b/src/java/org/apache/cassandra/tools/BulkLoader.java index 811df7ab9704..a3a296b97ac1 100644 --- a/src/java/org/apache/cassandra/tools/BulkLoader.java +++ b/src/java/org/apache/cassandra/tools/BulkLoader.java @@ -66,7 +66,8 @@ public static void load(LoaderOptions options) throws BulkLoadException buildSSLOptions(options.clientEncOptions)), handler, options.connectionsPerHost, - options.targetKeyspace); + options.targetKeyspace, + options.targetTable); DatabaseDescriptor.setStreamThroughputOutboundMegabitsPerSec(options.throttle); DatabaseDescriptor.setInterDCStreamThroughputOutboundMegabitsPerSec(options.interDcThrottle); StreamResultFuture future = null; @@ -82,7 +83,6 @@ public static void load(LoaderOptions options) throws BulkLoadException { future = loader.stream(options.ignores, indicator); } - } catch (Exception e) { diff --git a/src/java/org/apache/cassandra/tools/LoaderOptions.java b/src/java/org/apache/cassandra/tools/LoaderOptions.java index d882e5a853d4..27d54a7414f7 100644 --- a/src/java/org/apache/cassandra/tools/LoaderOptions.java +++ b/src/java/org/apache/cassandra/tools/LoaderOptions.java @@ -66,6 +66,7 @@ public class LoaderOptions public static final String ENTIRE_SSTABLE_INTER_DC_THROTTLE_MBITS = "entire-sstable-inter-dc-throttle"; public static final String TOOL_NAME = "sstableloader"; public static final String TARGET_KEYSPACE = "target-keyspace"; + public static final String TARGET_TABLE = "target-table"; /* client encryption options */ public static final String SSL_TRUSTSTORE = "truststore"; @@ -97,6 +98,7 @@ public class LoaderOptions public final Set hosts; public final Set ignores; public final String targetKeyspace; + public final String targetTable; LoaderOptions(Builder builder) { @@ -120,6 +122,7 @@ public class 
LoaderOptions
         hosts = builder.hosts;
         ignores = builder.ignores;
         targetKeyspace = builder.targetKeyspace;
+        targetTable = builder.targetTable;
     }

     static class Builder
@@ -147,6 +150,7 @@ static class Builder
         Set hosts = new HashSet<>();
         Set ignores = new HashSet<>();
         String targetKeyspace;
+        String targetTable;

         Builder()
         {
@@ -328,6 +332,18 @@ public Builder ignoreAndInternalPorts(InetAddressAndPort ignore)
             return this;
         }

+        public Builder targetKeyspace(String keyspace)
+        {
+            this.targetKeyspace = keyspace;
+            return this;
+        }
+
+        public Builder targetTable(String table)
+        {
+            this.targetTable = table;
+            return this;
+        }
+
         public Builder parseArgs(String cmdArgs[])
         {
             CommandLineParser parser = new GnuParser();
@@ -566,10 +582,16 @@ public Builder parseArgs(String cmdArgs[])
                 {
                     targetKeyspace = cmd.getOptionValue(TARGET_KEYSPACE);
                     if (StringUtils.isBlank(targetKeyspace))
-                    {
                         errorMsg("Empty keyspace is not supported.", options);
-                    }
                 }
+
+                if (cmd.hasOption(TARGET_TABLE))
+                {
+                    targetTable = cmd.getOptionValue(TARGET_TABLE);
+                    if (StringUtils.isBlank(targetTable))
+                        errorMsg("Empty table is not supported.", options);
+                }
+
                 return this;
             }
             catch (ParseException | ConfigurationException | MalformedURLException e)
@@ -678,6 +700,7 @@ private static CmdLineOptions getCmdLineOptions()
         options.addOption("ciphers", SSL_CIPHER_SUITES, "CIPHER-SUITES", "Client SSL: comma-separated list of encryption suites to use");
         options.addOption("f", CONFIG_PATH, "path to config file", "cassandra.yaml file path for streaming throughput and client/server SSL.");
         options.addOption("k", TARGET_KEYSPACE, "target keyspace name", "target keyspace name");
+        options.addOption("tb", TARGET_TABLE, "target table name", "target table name");

         return options;
     }

diff --git a/test/unit/org/apache/cassandra/io/sstable/SSTableLoaderTest.java b/test/unit/org/apache/cassandra/io/sstable/SSTableLoaderTest.java
index c941a81db241..0af6d24a0a52 100644
--- a/test/unit/org/apache/cassandra/io/sstable/SSTableLoaderTest.java
+++ b/test/unit/org/apache/cassandra/io/sstable/SSTableLoaderTest.java
@@ -24,6 +24,7 @@

 import com.google.common.io.Files;

+import org.apache.cassandra.db.rows.Row;
 import org.apache.cassandra.io.util.File;
 import org.junit.After;
 import org.junit.Before;
@@ -40,7 +41,6 @@
 import org.apache.cassandra.db.partitions.*;
 import org.apache.cassandra.db.marshal.AsciiType;
 import org.apache.cassandra.io.FSWriteError;
-import org.apache.cassandra.io.util.FileUtils;
 import org.apache.cassandra.schema.KeyspaceParams;
 import org.apache.cassandra.service.StorageService;
 import org.apache.cassandra.streaming.StreamEvent;
@@ -93,16 +93,33 @@ public void setup() throws Exception
     @After
     public void cleanup()
     {
-        try {
-            FileUtils.deleteRecursive(tmpdir);
-        } catch (FSWriteError e) {
+        try
+        {
+            tmpdir.deleteRecursive();
+        }
+        catch (FSWriteError e)
+        {
             /*
               We force a GC here to force buffer deallocation, and then try deleting the directory again.
               For more information, see: http://bugs.java.com/bugdatabase/view_bug.do?bug_id=4715154
              If this is not the problem, the exception will be rethrown anyway.
*/ System.gc(); - FileUtils.deleteRecursive(tmpdir); + tmpdir.deleteRecursive(); + } + + try + { + for (String[] keyspaceTable : new String[][] { {KEYSPACE1, CF_STANDARD1}, + {KEYSPACE1, CF_STANDARD2}, + {KEYSPACE1, CF_BACKUPS}, + {KEYSPACE2, CF_STANDARD1}, + {KEYSPACE2, CF_STANDARD2}}) + StorageService.instance.truncate(keyspaceTable[0], keyspaceTable[1]); + } + catch (Exception ex) + { + throw new RuntimeException("Unable to truncate table!", ex); } } @@ -150,9 +167,11 @@ public void testLoadingSSTable() throws Exception assertEquals(1, partitions.size()); assertEquals("key1", AsciiType.instance.getString(partitions.get(0).partitionKey().getKey())); assert metadata != null; - assertEquals(ByteBufferUtil.bytes("100"), partitions.get(0).getRow(Clustering.make(ByteBufferUtil.bytes("col1"))) - .getCell(metadata.getColumn(ByteBufferUtil.bytes("val"))) - .buffer()); + + Row row = partitions.get(0).getRow(Clustering.make(ByteBufferUtil.bytes("col1"))); + assert row != null; + + assertEquals(ByteBufferUtil.bytes("100"), row.getCell(metadata.getColumn(ByteBufferUtil.bytes("val"))).buffer()); // The stream future is signalled when the work is complete but before releasing references. Wait for release // before cleanup (CASSANDRA-10118). @@ -168,7 +187,7 @@ public void testLoadingIncompleteSSTable() throws Exception .inDirectory(dataDir) .forTable(String.format(schema, KEYSPACE1, CF_STANDARD2)) .using(String.format(query, KEYSPACE1, CF_STANDARD2)) - .withBufferSizeInMB(1) + .withBufferSizeInMiB(1) .build(); int NB_PARTITIONS = 5000; // Enough to write >1MiB and get at least one completed sstable before we've closed the writer @@ -209,10 +228,9 @@ public void testLoadingIncompleteSSTable() throws Exception } @Test - public void testLoadingSSTableToDifferentKeyspace() throws Exception + public void testLoadingSSTableToDifferentKeyspaceAndTable() throws Exception { - File dataDir = new File(tmpdir.absolutePath() + File.pathSeparator() + KEYSPACE1 + File.pathSeparator() + CF_STANDARD1); - assert dataDir.tryCreateDirectories(); + File dataDir = dataDir(CF_STANDARD1); TableMetadata metadata = Schema.instance.getTableMetadata(KEYSPACE1, CF_STANDARD1); String schema = "CREATE TABLE %s.%s (key ascii, name ascii, val ascii, val1 ascii, PRIMARY KEY (key, name))"; @@ -230,25 +248,31 @@ public void testLoadingSSTableToDifferentKeyspace() throws Exception ColumnFamilyStore cfs = Keyspace.open(KEYSPACE1).getColumnFamilyStore(CF_STANDARD1); Util.flush(cfs); // wait for sstables to be on disk else we won't be able to stream them - final CountDownLatch latch = new CountDownLatch(1); - SSTableLoader loader = new SSTableLoader(dataDir, new TestClient(), new OutputHandler.SystemOutput(false, false), 1, KEYSPACE2); - loader.stream(Collections.emptySet(), completionStreamListener(latch)).get(); + for (String table : new String[] { CF_STANDARD2, null }) + { + final CountDownLatch latch = new CountDownLatch(1); + SSTableLoader loader = new SSTableLoader(dataDir, new TestClient(), new OutputHandler.SystemOutput(false, false), 1, KEYSPACE2, table); + loader.stream(Collections.emptySet(), completionStreamListener(latch)).get(); - cfs = Keyspace.open(KEYSPACE2).getColumnFamilyStore(CF_STANDARD1); - Util.flush(cfs); + String targetTable = table == null ? 
CF_STANDARD1 : table; + cfs = Keyspace.open(KEYSPACE2).getColumnFamilyStore(targetTable); + Util.flush(cfs); - List partitions = Util.getAll(Util.cmd(cfs).build()); + List partitions = Util.getAll(Util.cmd(cfs).build()); - assertEquals(1, partitions.size()); - assertEquals("key1", AsciiType.instance.getString(partitions.get(0).partitionKey().getKey())); - assert metadata != null; - assertEquals(ByteBufferUtil.bytes("100"), partitions.get(0).getRow(Clustering.make(ByteBufferUtil.bytes("col1"))) - .getCell(metadata.getColumn(ByteBufferUtil.bytes("val"))) - .buffer()); + assertEquals(1, partitions.size()); + assertEquals("key1", AsciiType.instance.getString(partitions.get(0).partitionKey().getKey())); + assert metadata != null; - // The stream future is signalled when the work is complete but before releasing references. Wait for release - // before cleanup (CASSANDRA-10118). - latch.await(); + Row row = partitions.get(0).getRow(Clustering.make(ByteBufferUtil.bytes("col1"))); + assert row != null; + + assertEquals(ByteBufferUtil.bytes("100"), row.getCell(metadata.getColumn(ByteBufferUtil.bytes("val"))).buffer()); + + // The stream future is signalled when the work is complete but before releasing references. Wait for release + // before cleanup (CASSANDRA-10118). + latch.await(); + } } @Test @@ -278,9 +302,11 @@ public void testLoadingBackupsTable() throws Exception assertEquals(1, partitions.size()); assertEquals("key", AsciiType.instance.getString(partitions.get(0).partitionKey().getKey())); assert metadata != null; - assertEquals(ByteBufferUtil.bytes("100"), partitions.get(0).getRow(Clustering.make(ByteBufferUtil.bytes("col1"))) - .getCell(metadata.getColumn(ByteBufferUtil.bytes("val"))) - .buffer()); + + Row row = partitions.get(0).getRow(Clustering.make(ByteBufferUtil.bytes("col1"))); + assert row != null; + + assertEquals(ByteBufferUtil.bytes("100"), row.getCell(metadata.getColumn(ByteBufferUtil.bytes("val"))).buffer()); // The stream future is signalled when the work is complete but before releasing references. Wait for release // before cleanup (CASSANDRA-10118). 
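End to end, this commit lets sstableloader stream a directory of sstables into a destination named on the command line instead of one derived from the .../<keyspace>/<table>/ path. A hypothetical invocation (address and directory are placeholders; the option names are the ones registered in getCmdLineOptions above):

    # Stream the sstables found in /tmp/staging into ks2.tbl2, ignoring
    # whatever keyspace/table the directory layout would otherwise imply.
    sstableloader -d 127.0.0.1 --target-keyspace ks2 --target-table tbl2 /tmp/staging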
From 89f3978dcde958fbad191b8cf628fd89ace64d7a Mon Sep 17 00:00:00 2001 From: Caleb Rackliffe Date: Mon, 27 Jun 2022 18:40:40 -0500 Subject: [PATCH 022/159] Add a virtual table that exposes currently running queries patch by Chris Lohfink; reviewed by Caleb Rackliffe and Benedict Elliott Smith for CASSANDRA-15241 Co-authored-by: Chris Lohfink Co-authored-by: Caleb Rackliffe Co-authored-by: Benedict Elliott Smith --- CHANGES.txt | 1 + .../cassandra/concurrent/DebuggableTask.java | 85 +++++++++++++ .../concurrent/ExecutionFailure.java | 61 +++++++++- .../cassandra/concurrent/FutureTask.java | 21 +++- .../cassandra/concurrent/SEPWorker.java | 52 +++++++- .../concurrent/SharedExecutorPool.java | 26 +++- .../cassandra/concurrent/TaskFactory.java | 4 + .../cassandra/db/virtual/QueriesTable.java | 94 ++++++++++++++ .../db/virtual/SystemViewsKeyspace.java | 1 + .../cassandra/service/StorageProxy.java | 115 ++++++++++++++---- .../cassandra/transport/Dispatcher.java | 68 +++++++++-- .../transport/InitialConnectionHandler.java | 5 +- .../apache/cassandra/transport/Message.java | 3 +- .../transport/messages/QueryMessage.java | 3 +- .../distributed/test/QueriesTableTest.java | 89 ++++++++++++++ 15 files changed, 581 insertions(+), 47 deletions(-) create mode 100644 src/java/org/apache/cassandra/concurrent/DebuggableTask.java create mode 100644 src/java/org/apache/cassandra/db/virtual/QueriesTable.java create mode 100644 test/distributed/org/apache/cassandra/distributed/test/QueriesTableTest.java diff --git a/CHANGES.txt b/CHANGES.txt index 4623b48c3ac7..7693eb48911d 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -1,4 +1,5 @@ 4.2 + * Add a virtual table that exposes currently running queries (CASSANDRA-15241) * Allow sstableloader to specify table without relying on path (CASSANDRA-16584) * Fix TestGossipingPropertyFileSnitch.test_prefer_local_reconnect_on_listen_address (CASSANDRA-17700) * Add ByteComparable API (CASSANDRA-6936) diff --git a/src/java/org/apache/cassandra/concurrent/DebuggableTask.java b/src/java/org/apache/cassandra/concurrent/DebuggableTask.java new file mode 100644 index 000000000000..ac04eb4c3475 --- /dev/null +++ b/src/java/org/apache/cassandra/concurrent/DebuggableTask.java @@ -0,0 +1,85 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.cassandra.concurrent; + +import org.apache.cassandra.utils.Shared; + +import static org.apache.cassandra.utils.Shared.Recursive.INTERFACES; +import static org.apache.cassandra.utils.Shared.Scope.SIMULATION; + +/** + * Interface to include on a Runnable or Callable submitted to the {@link SharedExecutorPool} to provide more + * detailed diagnostics. 
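+ *
+ * <p>A minimal sketch of a conforming task (illustrative only; {@code approxTime} stands for
+ * {@code MonotonicClock.Global.approxTime}):
+ * <pre>
+ * class ExampleTask implements DebuggableTask.RunnableDebuggableTask
+ * {
+ *     private final long createdAtNanos = approxTime.now();
+ *     private volatile long startedAtNanos;
+ *
+ *     public void run()
+ *     {
+ *         startedAtNanos = approxTime.now(); // record when execution actually begins
+ *         // ... the task's real work ...
+ *     }
+ *
+ *     public long creationTimeNanos() { return createdAtNanos; }
+ *     public long startTimeNanos()    { return startedAtNanos; }
+ *     public String description()     { return "example task"; }
+ * }
+ * </pre>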
+ */ +@Shared(scope = SIMULATION, inner = INTERFACES) +public interface DebuggableTask +{ + public long creationTimeNanos(); + + public long startTimeNanos(); + + public String description(); + + interface RunnableDebuggableTask extends Runnable, DebuggableTask {} + + /** + * Wraps a {@link DebuggableTask} to include the name of the thread running it. + */ + public static class RunningDebuggableTask implements DebuggableTask + { + private final DebuggableTask task; + private final String threadId; + + public RunningDebuggableTask(String threadId, DebuggableTask task) + { + this.task = task; + this.threadId = threadId; + } + + public String threadId() + { + return threadId; + } + + public boolean hasTask() + { + return task != null; + } + + @Override + public long creationTimeNanos() + { + assert hasTask(); + return task.creationTimeNanos(); + } + + @Override + public long startTimeNanos() + { + assert hasTask(); + return task.startTimeNanos(); + } + + @Override + public String description() + { + assert hasTask(); + return task.description(); + } + } +} diff --git a/src/java/org/apache/cassandra/concurrent/ExecutionFailure.java b/src/java/org/apache/cassandra/concurrent/ExecutionFailure.java index 7fa7dcbd5466..27ab885e234e 100644 --- a/src/java/org/apache/cassandra/concurrent/ExecutionFailure.java +++ b/src/java/org/apache/cassandra/concurrent/ExecutionFailure.java @@ -21,6 +21,7 @@ import java.util.concurrent.Callable; import java.util.concurrent.Future; +import org.apache.cassandra.concurrent.DebuggableTask.RunnableDebuggableTask; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -105,6 +106,14 @@ static Runnable suppressing(WithResources withResources, Runnable wrap) return enforceOptions(withResources, wrap, false); } + /** + * @see #suppressing(WithResources, Runnable) + */ + static RunnableDebuggableTask suppressingDebuggable(WithResources withResources, RunnableDebuggableTask debuggable) + { + return enforceOptionsDebuggable(withResources, debuggable, false); + } + /** * Encapsulate the execution, propagating or suppressing any exceptions as requested. 
* @@ -119,7 +128,7 @@ private static Runnable enforceOptions(WithResources withResources, Runnable wra @Override public void run() { - try (Closeable close = withResources.get()) + try (@SuppressWarnings("unused") Closeable close = withResources.get()) { wrap.run(); } @@ -139,6 +148,54 @@ public String toString() }; } + /** + * @see #enforceOptions(WithResources, Runnable, boolean) + */ + private static RunnableDebuggableTask enforceOptionsDebuggable(WithResources withResources, RunnableDebuggableTask debuggable, boolean propagate) + { + return new RunnableDebuggableTask() + { + @Override + public void run() + { + try (@SuppressWarnings("unused") Closeable close = withResources.get()) + { + debuggable.run(); + } + catch (Throwable t) + { + handle(t); + if (propagate) + throw t; + } + } + + @Override + public String toString() + { + return debuggable.toString(); + } + + @Override + public long creationTimeNanos() + { + return debuggable.creationTimeNanos(); + } + + @Override + public long startTimeNanos() + { + return debuggable.startTimeNanos(); + } + + @Override + public String description() + { + return debuggable.description(); + } + }; + } + /** * See {@link #enforceOptions(WithResources, Callable)} */ @@ -158,7 +215,7 @@ static Callable enforceOptions(WithResources withResources, Callable w @Override public V call() throws Exception { - try (Closeable close = withResources.get()) + try (@SuppressWarnings("unused") Closeable close = withResources.get()) { return wrap.call(); } diff --git a/src/java/org/apache/cassandra/concurrent/FutureTask.java b/src/java/org/apache/cassandra/concurrent/FutureTask.java index 2348ff6bf88c..763884a2dad2 100644 --- a/src/java/org/apache/cassandra/concurrent/FutureTask.java +++ b/src/java/org/apache/cassandra/concurrent/FutureTask.java @@ -20,9 +20,10 @@ import java.util.concurrent.Callable; -import org.apache.cassandra.utils.concurrent.RunnableFuture; +import javax.annotation.Nullable; import org.apache.cassandra.utils.concurrent.AsyncFuture; +import org.apache.cassandra.utils.concurrent.RunnableFuture; /** * A FutureTask that utilises Cassandra's {@link AsyncFuture}, making it compatible with {@link ExecutorPlus}. @@ -31,15 +32,28 @@ public class FutureTask extends AsyncFuture implements RunnableFuture { private Callable call; + private volatile DebuggableTask debuggable; public FutureTask(Callable call) { - this.call = call; + this(call, call instanceof DebuggableTask ? (DebuggableTask) call : null); } public FutureTask(Runnable run) { - this.call = callable(run); + this(callable(run), run instanceof DebuggableTask ? 
(DebuggableTask) run : null); + } + + private FutureTask(Callable call, DebuggableTask debuggable) + { + this.call = call; + this.debuggable = debuggable; + } + + @Nullable + DebuggableTask debuggableTask() + { + return debuggable; } V call() throws Exception @@ -63,6 +77,7 @@ public void run() finally { call = null; + debuggable = null; } } diff --git a/src/java/org/apache/cassandra/concurrent/SEPWorker.java b/src/java/org/apache/cassandra/concurrent/SEPWorker.java index c7b9abf719ab..fe16c950dfda 100644 --- a/src/java/org/apache/cassandra/concurrent/SEPWorker.java +++ b/src/java/org/apache/cassandra/concurrent/SEPWorker.java @@ -48,6 +48,8 @@ final class SEPWorker extends AtomicReference implements Runnabl long prevStopCheck = 0; long soleSpinnerSpinTime = 0; + private final AtomicReference currentTask = new AtomicReference<>(); + SEPWorker(ThreadGroup threadGroup, Long workerId, Work initialState, SharedExecutorPool pool) { this.pool = pool; @@ -58,9 +60,27 @@ final class SEPWorker extends AtomicReference implements Runnabl thread.start(); } + /** + * @return the current {@link DebuggableTask}, if one exists + */ + public DebuggableTask currentDebuggableTask() + { + // can change after null check so go off local reference + Runnable task = currentTask.get(); + + // Local read and mutation Runnables are themselves debuggable + if (task instanceof DebuggableTask) + return (DebuggableTask) task; + + if (task instanceof FutureTask) + return ((FutureTask) task).debuggableTask(); + + return null; + } + public void run() { - /** + /* * we maintain two important invariants: * 1) after exiting spinning phase, we ensure at least one more task on _each_ queue will be processed * promptly after we begin, assuming any are outstanding on any pools. this is to permit producers to @@ -101,8 +121,10 @@ public void run() if (assigned == null) continue; if (SET_THREAD_NAME) - Thread.currentThread().setName(assigned.name + "-" + workerId); + Thread.currentThread().setName(assigned.name + '-' + workerId); + task = assigned.tasks.poll(); + currentTask.lazySet(task); // if we do have tasks assigned, nobody will change our state so we can simply set it to WORKING // (which is also a state that will never be interrupted externally) @@ -128,9 +150,12 @@ public void run() break; task = assigned.tasks.poll(); + currentTask.lazySet(task); } // return our work permit, and maybe signal shutdown + currentTask.lazySet(null); + if (status != RETURNED_WORK_PERMIT) assigned.returnWorkPermit(); @@ -173,6 +198,11 @@ public void run() logger.error("Unexpected exception killed worker", t); } } + finally + { + currentTask.lazySet(null); + pool.workerEnded(this); + } } // try to assign this worker the provided work @@ -420,4 +450,22 @@ boolean isAssigned() return assigned != null; } } + + @Override + public String toString() + { + return thread.getName(); + } + + @Override + public int hashCode() + { + return workerId.intValue(); + } + + @Override + public boolean equals(Object obj) + { + return obj == this; + } } diff --git a/src/java/org/apache/cassandra/concurrent/SharedExecutorPool.java b/src/java/org/apache/cassandra/concurrent/SharedExecutorPool.java index f74854f9cb01..0631ec61da01 100644 --- a/src/java/org/apache/cassandra/concurrent/SharedExecutorPool.java +++ b/src/java/org/apache/cassandra/concurrent/SharedExecutorPool.java @@ -17,8 +17,11 @@ */ package org.apache.cassandra.concurrent; +import java.util.Collections; import java.util.List; import java.util.Map; +import java.util.Set; +import 
java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ConcurrentSkipListMap; import java.util.concurrent.CopyOnWriteArrayList; import java.util.concurrent.TimeUnit; @@ -26,6 +29,9 @@ import java.util.concurrent.atomic.AtomicInteger; import java.util.concurrent.atomic.AtomicLong; import java.util.concurrent.locks.LockSupport; +import java.util.stream.Collectors; + +import org.apache.cassandra.concurrent.DebuggableTask.RunningDebuggableTask; import static org.apache.cassandra.concurrent.ExecutorFactory.Global.executorFactory; import static org.apache.cassandra.concurrent.SEPWorker.Work; @@ -77,6 +83,8 @@ public class SharedExecutorPool final ConcurrentSkipListMap spinning = new ConcurrentSkipListMap<>(); // the collection of threads that have been asked to stop/deschedule - new workers are scheduled from here last final ConcurrentSkipListMap descheduled = new ConcurrentSkipListMap<>(); + // All SEPWorkers that are currently running + private final Set allWorkers = Collections.newSetFromMap(new ConcurrentHashMap<>()); volatile boolean shuttingDown = false; @@ -102,7 +110,23 @@ void schedule(Work work) return; if (!work.isStop()) - new SEPWorker(threadGroup, workerId.incrementAndGet(), work, this); + { + SEPWorker worker = new SEPWorker(threadGroup, workerId.incrementAndGet(), work, this); + allWorkers.add(worker); + } + } + + void workerEnded(SEPWorker worker) + { + allWorkers.remove(worker); + } + + public List runningTasks() + { + return allWorkers.stream() + .map(worker -> new RunningDebuggableTask(worker.toString(), worker.currentDebuggableTask())) + .filter(RunningDebuggableTask::hasTask) + .collect(Collectors.toList()); } void maybeStartSpinningWorker() diff --git a/src/java/org/apache/cassandra/concurrent/TaskFactory.java b/src/java/org/apache/cassandra/concurrent/TaskFactory.java index 56087d950b28..faeabe6c4c77 100644 --- a/src/java/org/apache/cassandra/concurrent/TaskFactory.java +++ b/src/java/org/apache/cassandra/concurrent/TaskFactory.java @@ -20,6 +20,7 @@ import java.util.concurrent.Callable; +import org.apache.cassandra.concurrent.DebuggableTask.RunnableDebuggableTask; import org.apache.cassandra.utils.Shared; import org.apache.cassandra.utils.WithResources; import org.apache.cassandra.utils.concurrent.RunnableFuture; @@ -127,6 +128,9 @@ protected LocalAware() {} @Override public Runnable toExecute(Runnable runnable) { + if (runnable instanceof RunnableDebuggableTask) + return ExecutionFailure.suppressingDebuggable(ExecutorLocals.propagate(), (RunnableDebuggableTask) runnable); + // no reason to propagate exception when it is inaccessible to caller return ExecutionFailure.suppressing(ExecutorLocals.propagate(), runnable); } diff --git a/src/java/org/apache/cassandra/db/virtual/QueriesTable.java b/src/java/org/apache/cassandra/db/virtual/QueriesTable.java new file mode 100644 index 000000000000..aeba61c004a4 --- /dev/null +++ b/src/java/org/apache/cassandra/db/virtual/QueriesTable.java @@ -0,0 +1,94 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.cassandra.db.virtual; + +import org.apache.cassandra.concurrent.DebuggableTask; +import org.apache.cassandra.concurrent.SharedExecutorPool; +import org.apache.cassandra.db.marshal.LongType; +import org.apache.cassandra.db.marshal.UTF8Type; +import org.apache.cassandra.dht.LocalPartitioner; +import org.apache.cassandra.schema.TableMetadata; + +import static java.lang.Long.max; +import static java.util.concurrent.TimeUnit.NANOSECONDS; +import static org.apache.cassandra.utils.MonotonicClock.Global.approxTime; + +/** + * Virtual table that lists currently running queries on the NTR (coordinator) and Read/Mutation (local) stages + * + * Example: + *

    + * <pre>
    + * cqlsh> SELECT * FROM system_views.queries;
    + *
    + *  thread_id                   | queued_micros |  running_micros | task
    + * ------------------------------+---------------+-----------------+--------------------------------------------------------------------------------
    + *  Native-Transport-Requests-7 |         72923 |            7611 |                      QUERY select * from system_views.queries; [pageSize = 100]
    + *              MutationStage-2 |         18249 |            2084 | Mutation(keyspace='distributed_test_keyspace', key='000000f8', modifications...
    + *                  ReadStage-2 |         72447 |           10121 |                                         SELECT * FROM keyspace.table LIMIT 5000
    + * </pre>
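    + *
    + * The duration columns are derived from the {@link DebuggableTask} timestamps in {@code data()}
    + * below: queued_micros covers task creation up to its start (or up to now, for tasks that have
    + * not started yet), while running_micros counts from start to now and is zero until a task starts.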
    + */ +final class QueriesTable extends AbstractVirtualTable +{ + private static final String TABLE_NAME = "queries"; + private static final String ID = "thread_id"; + private static final String QUEUED = "queued_micros"; + private static final String RUNNING = "running_micros"; + private static final String DESC = "task"; + + QueriesTable(String keyspace) + { + super(TableMetadata.builder(keyspace, TABLE_NAME) + .comment("Lists currently running queries") + .kind(TableMetadata.Kind.VIRTUAL) + .partitioner(new LocalPartitioner(UTF8Type.instance)) + // The thread name is unique since the id given to each SEPWorker is unique + .addPartitionKeyColumn(ID, UTF8Type.instance) + .addRegularColumn(QUEUED, LongType.instance) + .addRegularColumn(RUNNING, LongType.instance) + .addRegularColumn(DESC, UTF8Type.instance) + .build()); + } + + /** + * Walks the {@link SharedExecutorPool} workers for any {@link DebuggableTask} instances and populates the table. + */ + @Override + public DataSet data() + { + SimpleDataSet result = new SimpleDataSet(metadata()); + + for (DebuggableTask.RunningDebuggableTask task : SharedExecutorPool.SHARED.runningTasks()) + { + if (!task.hasTask()) continue; + + long creationTimeNanos = task.creationTimeNanos(); + long startTimeNanos = task.startTimeNanos(); + long now = approxTime.now(); + + long queuedMicros = NANOSECONDS.toMicros(max((startTimeNanos > 0 ? startTimeNanos : now) - creationTimeNanos, 0)); + long runningMicros = startTimeNanos > 0 ? NANOSECONDS.toMicros(now - startTimeNanos) : 0; + + result.row(task.threadId()) + .column(QUEUED, queuedMicros) + .column(RUNNING, runningMicros) + .column(DESC, task.description()); + } + + return result; + } +} diff --git a/src/java/org/apache/cassandra/db/virtual/SystemViewsKeyspace.java b/src/java/org/apache/cassandra/db/virtual/SystemViewsKeyspace.java index f13e61ce73d2..59a0aba809a6 100644 --- a/src/java/org/apache/cassandra/db/virtual/SystemViewsKeyspace.java +++ b/src/java/org/apache/cassandra/db/virtual/SystemViewsKeyspace.java @@ -47,6 +47,7 @@ private SystemViewsKeyspace() .add(new BatchMetricsTable(VIRTUAL_VIEWS)) .add(new StreamingVirtualTable(VIRTUAL_VIEWS)) .add(new GossipInfoTable(VIRTUAL_VIEWS)) + .add(new QueriesTable(VIRTUAL_VIEWS)) .addAll(LocalRepairTables.getAll(VIRTUAL_VIEWS)) .build()); } diff --git a/src/java/org/apache/cassandra/service/StorageProxy.java b/src/java/org/apache/cassandra/service/StorageProxy.java index 31d54771fea0..557382df5488 100644 --- a/src/java/org/apache/cassandra/service/StorageProxy.java +++ b/src/java/org/apache/cassandra/service/StorageProxy.java @@ -39,25 +39,25 @@ import java.util.function.Function; import java.util.stream.Collectors; -import com.google.common.annotations.VisibleForTesting; import com.google.common.base.Preconditions; import com.google.common.cache.CacheLoader; import com.google.common.collect.Iterables; import com.google.common.util.concurrent.Uninterruptibles; -import org.apache.cassandra.config.Config; -import org.apache.cassandra.service.paxos.*; +import org.apache.cassandra.service.paxos.Ballot; +import org.apache.cassandra.service.paxos.Commit; +import org.apache.cassandra.service.paxos.ContentionStrategy; import org.apache.cassandra.service.paxos.Paxos; -import org.apache.cassandra.utils.TimeUUID; -import org.apache.cassandra.utils.concurrent.CountDownLatch; - +import org.apache.cassandra.service.paxos.PaxosState; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.cassandra.batchlog.Batch; import 
org.apache.cassandra.batchlog.BatchlogManager; +import org.apache.cassandra.concurrent.DebuggableTask.RunnableDebuggableTask; import org.apache.cassandra.concurrent.Stage; import org.apache.cassandra.config.CassandraRelevantProperties; +import org.apache.cassandra.config.Config; import org.apache.cassandra.config.DatabaseDescriptor; import org.apache.cassandra.db.ColumnFamilyStore; import org.apache.cassandra.db.ConsistencyLevel; @@ -140,13 +140,17 @@ import org.apache.cassandra.utils.MonotonicClock; import org.apache.cassandra.utils.NoSpamLogger; import org.apache.cassandra.utils.Pair; +import org.apache.cassandra.utils.TimeUUID; +import org.apache.cassandra.utils.concurrent.CountDownLatch; import org.apache.cassandra.utils.concurrent.UncheckedInterruptedException; -import static com.google.common.collect.Iterables.concat; import static java.util.concurrent.TimeUnit.MILLISECONDS; import static java.util.concurrent.TimeUnit.NANOSECONDS; + +import static com.google.common.collect.Iterables.concat; +import static org.apache.commons.lang3.StringUtils.join; + import static org.apache.cassandra.db.ConsistencyLevel.SERIAL; -import static org.apache.cassandra.net.Message.out; import static org.apache.cassandra.metrics.ClientRequestsMetricsHolder.casReadMetrics; import static org.apache.cassandra.metrics.ClientRequestsMetricsHolder.casWriteMetrics; import static org.apache.cassandra.metrics.ClientRequestsMetricsHolder.readMetrics; @@ -154,8 +158,15 @@ import static org.apache.cassandra.metrics.ClientRequestsMetricsHolder.viewWriteMetrics; import static org.apache.cassandra.metrics.ClientRequestsMetricsHolder.writeMetrics; import static org.apache.cassandra.metrics.ClientRequestsMetricsHolder.writeMetricsForLevel; +import static org.apache.cassandra.net.Message.out; import static org.apache.cassandra.net.NoPayload.noPayload; -import static org.apache.cassandra.net.Verb.*; +import static org.apache.cassandra.net.Verb.BATCH_STORE_REQ; +import static org.apache.cassandra.net.Verb.MUTATION_REQ; +import static org.apache.cassandra.net.Verb.PAXOS_COMMIT_REQ; +import static org.apache.cassandra.net.Verb.PAXOS_PREPARE_REQ; +import static org.apache.cassandra.net.Verb.PAXOS_PROPOSE_REQ; +import static org.apache.cassandra.net.Verb.SCHEMA_VERSION_REQ; +import static org.apache.cassandra.net.Verb.TRUNCATE_REQ; import static org.apache.cassandra.service.BatchlogResponseHandler.BatchlogCleanup; import static org.apache.cassandra.service.paxos.Ballot.Flag.GLOBAL; import static org.apache.cassandra.service.paxos.Ballot.Flag.LOCAL; @@ -166,7 +177,6 @@ import static org.apache.cassandra.utils.Clock.Global.nanoTime; import static org.apache.cassandra.utils.TimeUUID.Generator.nextTimeUUID; import static org.apache.cassandra.utils.concurrent.CountDownLatch.newCountDownLatch; -import static org.apache.commons.lang3.StringUtils.join; public class StorageProxy implements StorageProxyMBean { @@ -829,6 +839,12 @@ public void runMayThrow() } } + @Override + public String description() + { + return "Paxos " + message.payload.toString(); + } + @Override protected Verb verb() { @@ -1264,7 +1280,7 @@ private static void syncWriteToBatchlog(Collection mutations, ReplicaP logger.trace("Sending batchlog store request {} to {} for {} mutations", batch.id, replica, batch.size()); if (replica.isSelf()) - performLocally(Stage.MUTATION, replica, () -> BatchlogManager.store(batch), handler); + performLocally(Stage.MUTATION, replica, () -> BatchlogManager.store(batch), handler, "Batchlog store"); else 
MessagingService.instance().sendWithCallback(message, replica.endpoint(), handler);
     }

@@ -1280,7 +1296,7 @@ private static void asyncRemoveFromBatchlog(ReplicaPlan.ForWrite replicaPlan, Ti
             logger.trace("Sending batchlog remove request {} to {}", uuid, target);

         if (target.isSelf())
-            performLocally(Stage.MUTATION, target, () -> BatchlogManager.remove(uuid));
+            performLocally(Stage.MUTATION, target, () -> BatchlogManager.remove(uuid), "Batchlog remove");
         else
             MessagingService.instance().send(message, target.endpoint());
     }
@@ -1524,7 +1540,7 @@ public static void sendToHintedReplicas(final Mutation mutation,
             if (insertLocal)
             {
                 Preconditions.checkNotNull(localReplica);
-                performLocally(stage, localReplica, mutation::apply, responseHandler);
+                performLocally(stage, localReplica, mutation::apply, responseHandler, mutation);
             }

             if (localDc != null)
@@ -1591,7 +1607,7 @@ private static void sendMessagesToNonlocalDC(Message messag
             logger.trace("Sending message to {}@{}", message.id(), target);
     }

-    private static void performLocally(Stage stage, Replica localReplica, final Runnable runnable)
+    private static void performLocally(Stage stage, Replica localReplica, final Runnable runnable, String description)
     {
         stage.maybeExecuteImmediately(new LocalMutationRunnable(localReplica)
         {
@@ -1607,6 +1623,12 @@ public void runMayThrow()
                 }
             }

+            @Override
+            public String description()
+            {
+                return description;
+            }
+
             @Override
             protected Verb verb()
             {
@@ -1615,7 +1637,7 @@ protected Verb verb()
         });
     }

-    private static void performLocally(Stage stage, Replica localReplica, final Runnable runnable, final RequestCallback handler)
+    private static void performLocally(Stage stage, Replica localReplica, final Runnable runnable, final RequestCallback handler, Object description)
     {
         stage.maybeExecuteImmediately(new LocalMutationRunnable(localReplica)
         {
@@ -1634,6 +1656,14 @@ public void runMayThrow()
                 }
             }

+            @Override
+            public String description()
+            {
+                // description is an Object whose toString() is deferred to this point, so we do not
+                // have to evaluate the potentially expensive Mutation.toString() unless it is explicitly requested
+                return description.toString();
+            }
+
             @Override
             protected Verb verb()
             {
@@ -2088,7 +2118,7 @@ private static PartitionIterator fetchRows(List comm
         return concatAndBlockOnRepair(results, repairs);
     }

-    public static class LocalReadRunnable extends DroppableRunnable
+    public static class LocalReadRunnable extends DroppableRunnable implements RunnableDebuggableTask
     {
         private final ReadCommand command;
         private final ReadCallback handler;
@@ -2158,6 +2188,24 @@ protected void runMayThrow()
                 }
             }
         }
+
+        @Override
+        public long creationTimeNanos()
+        {
+            return approxCreationTimeNanos;
+        }
+
+        @Override
+        public long startTimeNanos()
+        {
+            return approxStartTimeNanos;
+        }
+
+        @Override
+        public String description()
+        {
+            return command.toCQLString();
+        }
     }

     public static PartitionIterator getRangeSlice(PartitionRangeReadCommand command,
@@ -2468,7 +2516,9 @@ public void onResponse(Message msg)
      */
     private static abstract class DroppableRunnable implements Runnable
     {
-        final long approxCreationTimeNanos;
+        protected final long approxCreationTimeNanos;
+        protected volatile long approxStartTimeNanos;
+
         final Verb verb;

         public DroppableRunnable(Verb verb)
@@ -2479,11 +2529,11 @@ public DroppableRunnable(Verb verb)

         public final void run()
         {
-            long approxCurrentTimeNanos = MonotonicClock.Global.approxTime.now();
+            approxStartTimeNanos = MonotonicClock.Global.approxTime.now();
             long expirationTimeNanos = verb.expiresAtNanos(approxCreationTimeNanos);
-            if
(approxCurrentTimeNanos > expirationTimeNanos) + if (approxStartTimeNanos > expirationTimeNanos) { - long timeTakenNanos = approxCurrentTimeNanos - approxCreationTimeNanos; + long timeTakenNanos = approxStartTimeNanos - approxCreationTimeNanos; MessagingService.instance().metrics.recordSelfDroppedMessage(verb, timeTakenNanos, NANOSECONDS); return; } @@ -2504,9 +2554,10 @@ public final void run() * Like DroppableRunnable, but if it aborts, it will rerun (on the mutation stage) after * marking itself as a hint in progress so that the hint backpressure mechanism can function. */ - private static abstract class LocalMutationRunnable implements Runnable + private static abstract class LocalMutationRunnable implements RunnableDebuggableTask { private final long approxCreationTimeNanos = MonotonicClock.Global.approxTime.now(); + private volatile long approxStartTimeNanos; private final Replica localReplica; @@ -2518,11 +2569,12 @@ private static abstract class LocalMutationRunnable implements Runnable public final void run() { final Verb verb = verb(); - long nowNanos = MonotonicClock.Global.approxTime.now(); + approxStartTimeNanos = MonotonicClock.Global.approxTime.now(); long expirationTimeNanos = verb.expiresAtNanos(approxCreationTimeNanos); - if (nowNanos > expirationTimeNanos) + + if (approxStartTimeNanos > expirationTimeNanos) { - long timeTakenNanos = nowNanos - approxCreationTimeNanos; + long timeTakenNanos = approxStartTimeNanos - approxCreationTimeNanos; MessagingService.instance().metrics.recordSelfDroppedMessage(Verb.MUTATION_REQ, timeTakenNanos, NANOSECONDS); HintRunnable runnable = new HintRunnable(EndpointsForToken.of(localReplica.range().right, localReplica)) @@ -2546,6 +2598,21 @@ protected void runMayThrow() throws Exception } } + @Override + public long creationTimeNanos() + { + return approxCreationTimeNanos; + } + + @Override + public long startTimeNanos() + { + return approxStartTimeNanos; + } + + @Override + abstract public String description(); + abstract protected Verb verb(); abstract protected void runMayThrow() throws Exception; } diff --git a/src/java/org/apache/cassandra/transport/Dispatcher.java b/src/java/org/apache/cassandra/transport/Dispatcher.java index da79c3d2c453..8f8a607c777b 100644 --- a/src/java/org/apache/cassandra/transport/Dispatcher.java +++ b/src/java/org/apache/cassandra/transport/Dispatcher.java @@ -24,15 +24,16 @@ import java.util.function.Consumer; import com.google.common.base.Predicate; -import org.apache.cassandra.metrics.ClientMetrics; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import io.netty.channel.Channel; import io.netty.channel.EventLoop; import io.netty.util.AttributeKey; +import org.apache.cassandra.concurrent.DebuggableTask.RunnableDebuggableTask; import org.apache.cassandra.concurrent.LocalAwareExecutorPlus; import org.apache.cassandra.config.DatabaseDescriptor; +import org.apache.cassandra.metrics.ClientMetrics; import org.apache.cassandra.net.FrameEncoder; import org.apache.cassandra.service.ClientWarn; import org.apache.cassandra.service.QueryState; @@ -42,10 +43,10 @@ import org.apache.cassandra.transport.messages.ErrorMessage; import org.apache.cassandra.transport.messages.EventMessage; import org.apache.cassandra.utils.JVMStabilityInspector; +import org.apache.cassandra.utils.MonotonicClock; import org.apache.cassandra.utils.NoSpamLogger; import static org.apache.cassandra.concurrent.SharedExecutorPool.SHARED; -import static org.apache.cassandra.utils.Clock.Global.nanoTime; public class Dispatcher { @@ -79,17 
+80,60 @@ public Dispatcher(boolean useLegacyFlusher) public void dispatch(Channel channel, Message.Request request, FlushItemConverter forFlusher, Overload backpressure) { - requestExecutor.submit(() -> processRequest(channel, request, forFlusher, backpressure)); + requestExecutor.submit(new RequestProcessor(channel, request, forFlusher, backpressure)); ClientMetrics.instance.markRequestDispatched(); } + public class RequestProcessor implements RunnableDebuggableTask + { + private final Channel channel; + private final Message.Request request; + private final FlushItemConverter forFlusher; + private final Overload backpressure; + + private final long approxCreationTimeNanos = MonotonicClock.Global.approxTime.now(); + private volatile long approxStartTimeNanos; + + public RequestProcessor(Channel channel, Message.Request request, FlushItemConverter forFlusher, Overload backpressure) + { + this.channel = channel; + this.request = request; + this.forFlusher = forFlusher; + this.backpressure = backpressure; + } + + @Override + public void run() + { + approxStartTimeNanos = MonotonicClock.Global.approxTime.now(); + processRequest(channel, request, forFlusher, backpressure, approxStartTimeNanos); + } + + @Override + public long creationTimeNanos() + { + return approxCreationTimeNanos; + } + + @Override + public long startTimeNanos() + { + return approxStartTimeNanos; + } + + @Override + public String description() + { + return request.toString(); + } + } + /** * Note: this method may be executed on the netty event loop, during initial protocol negotiation; the caller is * responsible for cleaning up any global or thread-local state. (ex. tracing, client warnings, etc.). */ - private static Message.Response processRequest(ServerConnection connection, Message.Request request, Overload backpressure) + private static Message.Response processRequest(ServerConnection connection, Message.Request request, Overload backpressure, long startTimeNanos) { - long queryStartNanoTime = nanoTime(); if (connection.getVersion().isGreaterOrEqualTo(ProtocolVersion.V4)) ClientWarn.instance.captureWarnings(); @@ -119,7 +163,7 @@ else if (backpressure == Overload.BYTES_IN_FLIGHT) Message.logger.trace("Received: {}, v={}", request, connection.getVersion()); connection.requests.inc(); - Message.Response response = request.execute(qstate, queryStartNanoTime); + Message.Response response = request.execute(qstate, startTimeNanos); if (request.isTrackable()) CoordinatorWarnings.done(); @@ -130,15 +174,15 @@ else if (backpressure == Overload.BYTES_IN_FLIGHT) connection.applyStateTransition(request.type, response.type); return response; } - + /** * Note: this method may be executed on the netty event loop. */ - static Message.Response processRequest(Channel channel, Message.Request request, Overload backpressure) + static Message.Response processRequest(Channel channel, Message.Request request, Overload backpressure, long approxStartTimeNanos) { try { - return processRequest((ServerConnection) request.connection(), request, backpressure); + return processRequest((ServerConnection) request.connection(), request, backpressure, approxStartTimeNanos); } catch (Throwable t) { @@ -163,9 +207,9 @@ static Message.Response processRequest(Channel channel, Message.Request request, /** * Note: this method is not expected to execute on the netty event loop. 
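 * (With this change the start timestamp is sampled once, when the request begins executing, and
 * passed in as approxStartTimeNanos; previously processRequest sampled nanoTime() itself as
 * queryStartNanoTime.)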
*/ - void processRequest(Channel channel, Message.Request request, FlushItemConverter forFlusher, Overload backpressure) + void processRequest(Channel channel, Message.Request request, FlushItemConverter forFlusher, Overload backpressure, long approxStartTimeNanos) { - Message.Response response = processRequest(channel, request, backpressure); + Message.Response response = processRequest(channel, request, backpressure, approxStartTimeNanos); FlushItem toFlush = forFlusher.toFlushItem(channel, request, response); Message.logger.trace("Responding: {}, v={}", response, request.connection().getVersion()); flush(toFlush); @@ -201,7 +245,7 @@ public static void shutdown() * for delivering events to registered clients is dependent on protocol version and the configuration * of the pipeline. For v5 and newer connections, the event message is encoded into an Envelope, * wrapped in a FlushItem and then delivered via the pipeline's flusher, in a similar way to - * a Response returned from {@link #processRequest(Channel, Message.Request, FlushItemConverter, Overload)}. + * a Response returned from {@link #processRequest(Channel, Message.Request, FlushItemConverter, Overload, long)}. * It's worth noting that events are not generally fired as a direct response to a client request, * so this flush item has a null request attribute. The dispatcher itself is created when the * pipeline is first configured during protocol negotiation and is attached to the channel for diff --git a/src/java/org/apache/cassandra/transport/InitialConnectionHandler.java b/src/java/org/apache/cassandra/transport/InitialConnectionHandler.java index 75cb72e8b582..e4cff99acbd3 100644 --- a/src/java/org/apache/cassandra/transport/InitialConnectionHandler.java +++ b/src/java/org/apache/cassandra/transport/InitialConnectionHandler.java @@ -26,6 +26,7 @@ import java.util.Map; import org.apache.cassandra.transport.ClientResourceLimits.Overload; +import org.apache.cassandra.utils.MonotonicClock; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -148,7 +149,9 @@ protected void decode(ChannelHandlerContext ctx, ByteBuf buffer, List li promise = new VoidChannelPromise(ctx.channel(), false); } - final Message.Response response = Dispatcher.processRequest(ctx.channel(), startup, Overload.NONE); + long approxStartTimeNanos = MonotonicClock.Global.approxTime.now(); + final Message.Response response = Dispatcher.processRequest(ctx.channel(), startup, Overload.NONE, approxStartTimeNanos); + outbound = response.encode(inbound.header.version); ctx.writeAndFlush(outbound, promise); logger.trace("Configured pipeline: {}", ctx.pipeline()); diff --git a/src/java/org/apache/cassandra/transport/Message.java b/src/java/org/apache/cassandra/transport/Message.java index 75c997e38c86..2c91a76c3bd2 100644 --- a/src/java/org/apache/cassandra/transport/Message.java +++ b/src/java/org/apache/cassandra/transport/Message.java @@ -193,7 +193,8 @@ public void setCustomPayload(Map customPayload) this.customPayload = customPayload; } - public String debugString() + @Override + public String toString() { return String.format("(%s:%s:%s)", type, streamId, connection == null ? 
"null" : connection.getVersion().asInt()); } diff --git a/src/java/org/apache/cassandra/transport/messages/QueryMessage.java b/src/java/org/apache/cassandra/transport/messages/QueryMessage.java index 9a296e442b48..c295216d2c4f 100644 --- a/src/java/org/apache/cassandra/transport/messages/QueryMessage.java +++ b/src/java/org/apache/cassandra/transport/messages/QueryMessage.java @@ -148,6 +148,7 @@ private void traceQuery(QueryState state) @Override public String toString() { - return String.format("QUERY %s [pageSize = %d]", query, options.getPageSize()); + return String.format("QUERY %s [pageSize = %d] at consistency %s", + query, options.getPageSize(), options.getConsistency()); } } diff --git a/test/distributed/org/apache/cassandra/distributed/test/QueriesTableTest.java b/test/distributed/org/apache/cassandra/distributed/test/QueriesTableTest.java new file mode 100644 index 000000000000..09e56e0b6191 --- /dev/null +++ b/test/distributed/org/apache/cassandra/distributed/test/QueriesTableTest.java @@ -0,0 +1,89 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.cassandra.distributed.test; + +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicInteger; + +import org.junit.Test; + +import org.apache.cassandra.distributed.Cluster; +import org.apache.cassandra.distributed.api.ConsistencyLevel; +import org.apache.cassandra.distributed.api.Row; +import org.apache.cassandra.distributed.api.SimpleQueryResult; + +import static org.assertj.core.api.Assertions.assertThat; +import static org.junit.Assert.assertTrue; + +public class QueriesTableTest extends TestBaseImpl +{ + public static final int ITERATIONS = 256; + + @Test + public void shouldExposeReadsAndWrites() throws Throwable + { + try (Cluster cluster = init(Cluster.build(1).start())) + { + ExecutorService executor = Executors.newFixedThreadPool(16); + + cluster.schemaChange("CREATE TABLE " + KEYSPACE + ".tbl (k int primary key, v int)"); + + AtomicInteger reads = new AtomicInteger(0); + AtomicInteger writes = new AtomicInteger(0); + AtomicInteger paxos = new AtomicInteger(0); + + for (int i = 0; i < ITERATIONS; i++) + { + int k = i; + executor.execute(() -> cluster.coordinator(1).execute("INSERT INTO " + KEYSPACE + ".tbl (k, v) VALUES (" + k + ", 0)", ConsistencyLevel.ALL)); + executor.execute(() -> cluster.coordinator(1).execute("UPDATE " + KEYSPACE + ".tbl SET v = 10 WHERE k = " + (k - 1) + " IF v = 0", ConsistencyLevel.ALL)); + executor.execute(() -> cluster.coordinator(1).execute("SELECT * FROM " + KEYSPACE + ".tbl WHERE k = " + (k - 1), ConsistencyLevel.ALL)); + + executor.execute(() -> + { + SimpleQueryResult result = cluster.get(1).executeInternalWithResult("SELECT * FROM system_views.queries"); + + while (result.hasNext()) + { + Row row = result.next(); + String threadId = row.get("thread_id").toString(); + String task = row.get("task").toString(); + + if (threadId.contains("Read") && task.contains("SELECT")) + reads.incrementAndGet(); + else if (threadId.contains("Mutation") && task.contains("Mutation")) + writes.incrementAndGet(); + else if (threadId.contains("Mutation") && task.contains("Paxos")) + paxos.incrementAndGet(); + } + }); + } + + executor.shutdown(); + assertTrue(executor.awaitTermination(1, TimeUnit.MINUTES)); + + // We should see at least one read, write, and conditional update in the "queries" table. 
+            assertThat(reads.get()).isGreaterThan(0).isLessThanOrEqualTo(ITERATIONS);
+            assertThat(writes.get()).isGreaterThan(0).isLessThanOrEqualTo(ITERATIONS);
+            assertThat(paxos.get()).isGreaterThan(0).isLessThanOrEqualTo(ITERATIONS);
+        }
+    }
+}

From 55f094a6d2c288b74e6ea86edaa7c45cd494c208 Mon Sep 17 00:00:00 2001
From: Stefan Miklosovic
Date: Fri, 24 Jun 2022 12:42:12 +0200
Subject: [PATCH 023/159] Remove ephemeral snapshot marker file and introduce
 a flag to SnapshotManifest

patch by Stefan Miklosovic; reviewed by Paulo Motta for CASSANDRA-16911
---
 CHANGES.txt                                   |   1 +
 NEWS.txt                                      |   5 +
 .../cassandra/db/ColumnFamilyStore.java       |  53 ++----
 .../org/apache/cassandra/db/Directories.java  |  60 ++-----
 .../db/SnapshotDetailsTabularData.java        |  11 +-
 .../cassandra/service/CassandraDaemon.java    |  39 +++--
 .../cassandra/service/StorageService.java     |   3 +
 .../service/snapshot/SnapshotLoader.java      |   5 +
 .../service/snapshot/SnapshotManifest.java    |  22 ++-
 .../service/snapshot/TableSnapshot.java       |  33 +++-
 .../tools/nodetool/ListSnapshots.java         |  19 +-
 .../cassandra/distributed/impl/Instance.java  |   4 +
 .../test/EphemeralSnapshotTest.java           | 164 ++++++++++++++++++
 .../cassandra/db/ColumnFamilyStoreTest.java   |   4 +-
 .../apache/cassandra/db/DirectoriesTest.java  |  72 ++++----
 .../service/snapshot/SnapshotLoaderTest.java  |  51 +++++-
 .../service/snapshot/SnapshotManagerTest.java |  23 +--
 .../snapshot/SnapshotManifestTest.java        |   2 +-
 .../service/snapshot/TableSnapshotTest.java   |  36 +++-
 19 files changed, 425 insertions(+), 182 deletions(-)
 create mode 100644 test/distributed/org/apache/cassandra/distributed/test/EphemeralSnapshotTest.java

diff --git a/CHANGES.txt b/CHANGES.txt
index 69de78c9cb7b..b75eddd30e1f 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,4 +1,5 @@
 4.2
+ * Remove ephemeral snapshot marker file and introduce a flag to SnapshotManifest (CASSANDRA-16911)
  * Add a virtual table that exposes currently running queries (CASSANDRA-15241)
  * Allow sstableloader to specify table without relying on path (CASSANDRA-16584)
  * Fix TestGossipingPropertyFileSnitch.test_prefer_local_reconnect_on_listen_address (CASSANDRA-17700)

diff --git a/NEWS.txt b/NEWS.txt
index 3e016b7602f9..766ef59762e6 100644
--- a/NEWS.txt
+++ b/NEWS.txt
@@ -70,9 +70,14 @@ New features
     - Whether ALTER TABLE commands are allowed to mutate columns
     - Whether SimpleStrategy is allowed on keyspace creation or alteration
     - Maximum replication factor
+    - It is possible to list ephemeral snapshots by the nodetool listsnapshots command when the "-e" flag is specified.

 Upgrading
 ---------
+    - Ephemeral marker files for snapshots taken by repairs are no longer created;
+      there is a dedicated flag in the snapshot manifest instead. When a node is upgraded to version 4.2, any
+      ephemeral snapshots found on disk at startup will be deleted (the same behaviour as before), and new
+      ephemeral snapshots will no longer create marker files, since the flag in the snapshot manifest replaces them.
Deprecation
-----------

diff --git a/src/java/org/apache/cassandra/db/ColumnFamilyStore.java b/src/java/org/apache/cassandra/db/ColumnFamilyStore.java
index a40e5c7ad1c8..94ca18084dfe 100644
--- a/src/java/org/apache/cassandra/db/ColumnFamilyStore.java
+++ b/src/java/org/apache/cassandra/db/ColumnFamilyStore.java
@@ -22,7 +22,7 @@
 import java.lang.reflect.Constructor;
 import java.lang.reflect.InvocationTargetException;
 import java.nio.ByteBuffer;
-import java.nio.file.Files;
+import java.nio.file.Path;
 import java.time.Instant;
 import java.util.ArrayList;
 import java.util.Arrays;
@@ -155,6 +155,7 @@
 import org.apache.cassandra.service.paxos.Ballot;
 import org.apache.cassandra.service.paxos.PaxosRepairHistory;
 import org.apache.cassandra.service.paxos.TablePaxosRepairHistory;
+import org.apache.cassandra.service.snapshot.SnapshotLoader;
 import org.apache.cassandra.service.snapshot.SnapshotManifest;
 import org.apache.cassandra.service.snapshot.TableSnapshot;
 import org.apache.cassandra.streaming.TableStreamManager;
@@ -1698,9 +1699,9 @@ public CompactionManager.AllSSTableOpStatus verify(Verifier.Options options) thr
     /**
      * Rewrites all SSTables according to specified parameters
      *
-     * @param skipIfCurrentVersion - if {@link true}, will rewrite only SSTables that have version older than the current one ({@link BigFormat#latestVersion})
+     * @param skipIfCurrentVersion - if {@code true}, will rewrite only SSTables whose version is older than the current one ({@link org.apache.cassandra.io.sstable.format.big.BigFormat#latestVersion})
      * @param skipIfNewerThanTimestamp - max timestamp (local creation time) for SSTable; SSTables created _after_ this timestamp will be excluded from compaction
-     * @param skipIfCompressionMatches - if {@link true}, will rewrite only SSTables whose compression parameters are different from {@link CFMetaData#compressionParams()}
+     * @param skipIfCompressionMatches - if {@code true}, will rewrite only SSTables whose compression parameters are different from {@link TableMetadata#params#getCompressionParameters()}
      * @param jobs number of jobs for parallel execution
@@ -2039,7 +2040,7 @@ protected TableSnapshot createSnapshot(String tag, boolean ephemeral, DurationSp
                                           .collect(Collectors.toCollection(HashSet::new));

         // Create and write snapshot manifest
-        SnapshotManifest manifest = new SnapshotManifest(mapToDataFilenames(sstables), ttl, creationTime);
+        SnapshotManifest manifest = new SnapshotManifest(mapToDataFilenames(sstables), ttl, creationTime, ephemeral);
         File manifestFile = getDirectories().getSnapshotManifestFile(tag);
         writeSnapshotManifest(manifest, manifestFile);
         snapshotDirs.add(manifestFile.parent().toAbsolute()); // manifest may create empty snapshot dir
@@ -2052,16 +2053,9 @@ protected TableSnapshot createSnapshot(String tag, boolean ephemeral, DurationSp
             snapshotDirs.add(schemaFile.parent().toAbsolute()); // schema may create empty snapshot dir
         }

-        // Maybe create ephemeral marker
-        if (ephemeral)
-        {
-            File ephemeralSnapshotMarker = getDirectories().getNewEphemeralSnapshotMarkerFile(tag);
-            createEphemeralSnapshotMarkerFile(tag, ephemeralSnapshotMarker);
-            snapshotDirs.add(ephemeralSnapshotMarker.parent().toAbsolute()); // marker may create empty snapshot dir
-        }
-
-        TableSnapshot snapshot = new TableSnapshot(metadata.keyspace, metadata.name, metadata.id.asUUID(), tag,
-                                                   manifest.createdAt, manifest.expiresAt, snapshotDirs);
+        TableSnapshot snapshot = new
TableSnapshot(metadata.keyspace, metadata.name, metadata.id.asUUID(), + tag, manifest.createdAt, manifest.expiresAt, snapshotDirs, + manifest.ephemeral); StorageService.instance.addSnapshot(snapshot); return snapshot; @@ -2106,34 +2100,19 @@ private void writeSnapshotSchema(File schemaFile) } } - private void createEphemeralSnapshotMarkerFile(final String snapshot, File ephemeralSnapshotMarker) - { - try - { - if (!ephemeralSnapshotMarker.parent().exists()) - ephemeralSnapshotMarker.parent().tryCreateDirectories(); - - Files.createFile(ephemeralSnapshotMarker.toPath()); - if (logger.isTraceEnabled()) - logger.trace("Created ephemeral snapshot marker file on {}.", ephemeralSnapshotMarker.absolutePath()); - } - catch (IOException e) - { - logger.warn(String.format("Could not create marker file %s for ephemeral snapshot %s. " + - "In case there is a failure in the operation that created " + - "this snapshot, you may need to clean it manually afterwards.", - ephemeralSnapshotMarker.absolutePath(), snapshot), e); - } - } - protected static void clearEphemeralSnapshots(Directories directories) { RateLimiter clearSnapshotRateLimiter = DatabaseDescriptor.getSnapshotRateLimiter(); - for (String ephemeralSnapshot : directories.listEphemeralSnapshots()) + List ephemeralSnapshots = new SnapshotLoader(directories).loadSnapshots() + .stream() + .filter(TableSnapshot::isEphemeral) + .collect(Collectors.toList()); + + for (TableSnapshot ephemeralSnapshot : ephemeralSnapshots) { - logger.trace("Clearing ephemeral snapshot {} leftover from previous session.", ephemeralSnapshot); - Directories.clearSnapshot(ephemeralSnapshot, directories.getCFDirectories(), clearSnapshotRateLimiter); + logger.trace("Clearing ephemeral snapshot {} leftover from previous session.", ephemeralSnapshot.getId()); + Directories.clearSnapshot(ephemeralSnapshot.getTag(), directories.getCFDirectories(), clearSnapshotRateLimiter); } } diff --git a/src/java/org/apache/cassandra/db/Directories.java b/src/java/org/apache/cassandra/db/Directories.java index 8672377b7517..97f0fecb8baf 100644 --- a/src/java/org/apache/cassandra/db/Directories.java +++ b/src/java/org/apache/cassandra/db/Directories.java @@ -572,17 +572,6 @@ public static File getSnapshotSchemaFile(File snapshotDir) return new File(snapshotDir, "schema.cql"); } - public File getNewEphemeralSnapshotMarkerFile(String snapshotName) - { - File snapshotDir = new File(getWriteableLocationAsFile(1L), join(SNAPSHOT_SUBDIR, snapshotName)); - return getEphemeralSnapshotMarkerFile(snapshotDir); - } - - private static File getEphemeralSnapshotMarkerFile(File snapshotDirectory) - { - return new File(snapshotDirectory, "ephemeral.snapshot"); - } - public static File getBackupsDirectory(Descriptor desc) { return getBackupsDirectory(desc.directory); @@ -983,18 +972,25 @@ public Map listSnapshots() return snapshots; } - protected TableSnapshot buildSnapshot(String tag, SnapshotManifest manifest, Set snapshotDirs) { + private TableSnapshot buildSnapshot(String tag, SnapshotManifest manifest, Set snapshotDirs) + { + boolean ephemeral = manifest != null ? manifest.isEphemeral() : isLegacyEphemeralSnapshot(snapshotDirs); Instant createdAt = manifest == null ? null : manifest.createdAt; Instant expiresAt = manifest == null ? 
null : manifest.expiresAt; return new TableSnapshot(metadata.keyspace, metadata.name, metadata.id.asUUID(), tag, createdAt, expiresAt, - snapshotDirs); + snapshotDirs, ephemeral); + } + + private static boolean isLegacyEphemeralSnapshot(Set snapshotDirs) + { + return snapshotDirs.stream().map(d -> new File(d, "ephemeral.snapshot")).anyMatch(File::exists); } @VisibleForTesting protected static SnapshotManifest maybeLoadManifest(String keyspace, String table, String tag, Set snapshotDirs) { List manifests = snapshotDirs.stream().map(d -> new File(d, "manifest.json")) - .filter(d -> d.exists()).collect(Collectors.toList()); + .filter(File::exists).collect(Collectors.toList()); if (manifests.isEmpty()) { @@ -1018,42 +1014,6 @@ protected static SnapshotManifest maybeLoadManifest(String keyspace, String tabl return null; } - public List listEphemeralSnapshots() - { - final List ephemeralSnapshots = new LinkedList<>(); - for (File snapshot : listAllSnapshots()) - { - if (getEphemeralSnapshotMarkerFile(snapshot).exists()) - ephemeralSnapshots.add(snapshot.name()); - } - return ephemeralSnapshots; - } - - private List listAllSnapshots() - { - final List snapshots = new LinkedList<>(); - for (final File dir : dataPaths) - { - File snapshotDir = isSecondaryIndexFolder(dir) - ? new File(dir.parentPath(), SNAPSHOT_SUBDIR) - : new File(dir, SNAPSHOT_SUBDIR); - if (snapshotDir.exists() && snapshotDir.isDirectory()) - { - final File[] snapshotDirs = snapshotDir.tryList(); - if (snapshotDirs != null) - { - for (final File snapshot : snapshotDirs) - { - if (snapshot.isDirectory()) - snapshots.add(snapshot); - } - } - } - } - - return snapshots; - } - @VisibleForTesting protected Map> listSnapshotDirsByTag() { diff --git a/src/java/org/apache/cassandra/db/SnapshotDetailsTabularData.java b/src/java/org/apache/cassandra/db/SnapshotDetailsTabularData.java index 4e6ab116c469..c091ebd5e00b 100644 --- a/src/java/org/apache/cassandra/db/SnapshotDetailsTabularData.java +++ b/src/java/org/apache/cassandra/db/SnapshotDetailsTabularData.java @@ -32,7 +32,8 @@ public class SnapshotDetailsTabularData "True size", "Size on disk", "Creation time", - "Expiration time",}; + "Expiration time", + "Ephemeral"}; private static final String[] ITEM_DESCS = new String[]{"snapshot_name", "keyspace_name", @@ -40,7 +41,8 @@ public class SnapshotDetailsTabularData "TrueDiskSpaceUsed", "TotalDiskSpaceUsed", "created_at", - "expires_at",}; + "expires_at", + "ephemeral"}; private static final String TYPE_NAME = "SnapshotDetails"; @@ -56,7 +58,7 @@ public class SnapshotDetailsTabularData { try { - ITEM_TYPES = new OpenType[]{ SimpleType.STRING, SimpleType.STRING, SimpleType.STRING, SimpleType.STRING, SimpleType.STRING, SimpleType.STRING, SimpleType.STRING }; + ITEM_TYPES = new OpenType[]{ SimpleType.STRING, SimpleType.STRING, SimpleType.STRING, SimpleType.STRING, SimpleType.STRING, SimpleType.STRING, SimpleType.STRING, SimpleType.STRING }; COMPOSITE_TYPE = new CompositeType(TYPE_NAME, ROW_DESC, ITEM_NAMES, ITEM_DESCS, ITEM_TYPES); @@ -77,8 +79,9 @@ public static void from(TableSnapshot details, TabularDataSupport result) final String liveSize = FileUtils.stringifyFileSize(details.computeTrueSizeBytes()); String createdAt = safeToString(details.getCreatedAt()); String expiresAt = safeToString(details.getExpiresAt()); + String ephemeral = Boolean.toString(details.isEphemeral()); result.put(new CompositeDataSupport(COMPOSITE_TYPE, ITEM_NAMES, - new Object[]{ details.getTag(), details.getKeyspaceName(), details.getTableName(), liveSize, 
totalSize, createdAt, expiresAt })); + new Object[]{ details.getTag(), details.getKeyspaceName(), details.getTableName(), liveSize, totalSize, createdAt, expiresAt, ephemeral })); } catch (OpenDataException e) { diff --git a/src/java/org/apache/cassandra/service/CassandraDaemon.java b/src/java/org/apache/cassandra/service/CassandraDaemon.java index cca409385ab7..90b949613848 100644 --- a/src/java/org/apache/cassandra/service/CassandraDaemon.java +++ b/src/java/org/apache/cassandra/service/CassandraDaemon.java @@ -291,24 +291,13 @@ protected void setup() SSTableHeaderFix.fixNonFrozenUDTIfUpgradeFrom30(); - // clean up debris in the rest of the keyspaces - for (String keyspaceName : Schema.instance.getKeyspaces()) + try { - // Skip system as we've already cleaned it - if (keyspaceName.equals(SchemaConstants.SYSTEM_KEYSPACE_NAME)) - continue; - - for (TableMetadata cfm : Schema.instance.getTablesAndViews(keyspaceName)) - { - try - { - ColumnFamilyStore.scrubDataDirectories(cfm); - } - catch (StartupException e) - { - exitOrFail(e.returnCode, e.getMessage(), e.getCause()); - } - } + scrubDataDirectories(); + } + catch (StartupException e) + { + exitOrFail(e.returnCode, e.getMessage(), e.getCause()); } Keyspace.setInitialized(); @@ -579,6 +568,22 @@ public void setupVirtualKeyspaces() VirtualKeyspaceRegistry.instance.register(SystemViewsKeyspace.instance); } + public void scrubDataDirectories() throws StartupException + { + // clean up debris in the rest of the keyspaces + for (String keyspaceName : Schema.instance.getKeyspaces()) + { + // Skip system as we've already cleaned it + if (keyspaceName.equals(SchemaConstants.SYSTEM_KEYSPACE_NAME)) + continue; + + for (TableMetadata cfm : Schema.instance.getTablesAndViews(keyspaceName)) + { + ColumnFamilyStore.scrubDataDirectories(cfm); + } + } + } + public synchronized void initializeClientTransports() { // Native transport diff --git a/src/java/org/apache/cassandra/service/StorageService.java b/src/java/org/apache/cassandra/service/StorageService.java index bf8c65a25342..957daf395a66 100644 --- a/src/java/org/apache/cassandra/service/StorageService.java +++ b/src/java/org/apache/cassandra/service/StorageService.java @@ -4136,6 +4136,7 @@ public void clearSnapshot(String tag, String... 
keyspaceNames) throws IOExceptio public Map getSnapshotDetails(Map options) { boolean skipExpiring = options != null && Boolean.parseBoolean(options.getOrDefault("no_ttl", "false")); + boolean includeEphemeral = options != null && Boolean.parseBoolean(options.getOrDefault("include_ephemeral", "false")); SnapshotLoader loader = new SnapshotLoader(); Map snapshotMap = new HashMap<>(); @@ -4144,6 +4145,8 @@ public Map getSnapshotDetails(Map options) { if (skipExpiring && snapshot.isExpiring()) continue; + if (!includeEphemeral && snapshot.isEphemeral()) + continue; TabularDataSupport data = (TabularDataSupport) snapshotMap.get(snapshot.getTag()); if (data == null) diff --git a/src/java/org/apache/cassandra/service/snapshot/SnapshotLoader.java b/src/java/org/apache/cassandra/service/snapshot/SnapshotLoader.java index 5f50937bf478..5c90007ddd08 100644 --- a/src/java/org/apache/cassandra/service/snapshot/SnapshotLoader.java +++ b/src/java/org/apache/cassandra/service/snapshot/SnapshotLoader.java @@ -74,6 +74,11 @@ public SnapshotLoader(Collection dataDirs) this.dataDirectories = dataDirs; } + public SnapshotLoader(Directories directories) + { + this(directories.getCFDirectories().stream().map(File::toPath).collect(Collectors.toList())); + } + public Set loadSnapshots() { for (Path dataDir : dataDirectories) diff --git a/src/java/org/apache/cassandra/service/snapshot/SnapshotManifest.java b/src/java/org/apache/cassandra/service/snapshot/SnapshotManifest.java index ba840efb77fa..4ac9bdc03dac 100644 --- a/src/java/org/apache/cassandra/service/snapshot/SnapshotManifest.java +++ b/src/java/org/apache/cassandra/service/snapshot/SnapshotManifest.java @@ -46,19 +46,25 @@ public class SnapshotManifest @JsonProperty("expires_at") public final Instant expiresAt; + @JsonProperty("ephemeral") + public final boolean ephemeral; + /** needed for jackson serialization */ @SuppressWarnings("unused") - private SnapshotManifest() { + private SnapshotManifest() + { this.files = null; this.createdAt = null; this.expiresAt = null; + this.ephemeral = false; } - public SnapshotManifest(List files, DurationSpec.IntSecondsBound ttl, Instant creationTime) + public SnapshotManifest(List files, DurationSpec.IntSecondsBound ttl, Instant creationTime, boolean ephemeral) { this.files = files; this.createdAt = creationTime; this.expiresAt = ttl == null ? 
null : createdAt.plusSeconds(ttl.toSeconds()); + this.ephemeral = ephemeral; } public List getFiles() @@ -76,6 +82,11 @@ public Instant getExpiresAt() return expiresAt; } + public boolean isEphemeral() + { + return ephemeral; + } + public void serializeToJsonFile(File outputFile) throws IOException { FBUtilities.serializeToJsonFile(this, outputFile); @@ -92,12 +103,15 @@ public boolean equals(Object o) if (this == o) return true; if (o == null || getClass() != o.getClass()) return false; SnapshotManifest manifest = (SnapshotManifest) o; - return Objects.equals(files, manifest.files) && Objects.equals(createdAt, manifest.createdAt) && Objects.equals(expiresAt, manifest.expiresAt); + return Objects.equals(files, manifest.files) + && Objects.equals(createdAt, manifest.createdAt) + && Objects.equals(expiresAt, manifest.expiresAt) + && Objects.equals(ephemeral, manifest.ephemeral); } @Override public int hashCode() { - return Objects.hash(files, createdAt, expiresAt); + return Objects.hash(files, createdAt, expiresAt, ephemeral); } } diff --git a/src/java/org/apache/cassandra/service/snapshot/TableSnapshot.java b/src/java/org/apache/cassandra/service/snapshot/TableSnapshot.java index 93fe2455c4d6..bda147fc8c92 100644 --- a/src/java/org/apache/cassandra/service/snapshot/TableSnapshot.java +++ b/src/java/org/apache/cassandra/service/snapshot/TableSnapshot.java @@ -45,6 +45,7 @@ public class TableSnapshot private final String tableName; private final UUID tableId; private final String tag; + private final boolean ephemeral; private final Instant createdAt; private final Instant expiresAt; @@ -53,7 +54,7 @@ public class TableSnapshot public TableSnapshot(String keyspaceName, String tableName, UUID tableId, String tag, Instant createdAt, Instant expiresAt, - Set snapshotDirs) + Set snapshotDirs, boolean ephemeral) { this.keyspaceName = keyspaceName; this.tableName = tableName; @@ -62,6 +63,7 @@ public TableSnapshot(String keyspaceName, String tableName, UUID tableId, this.createdAt = createdAt; this.expiresAt = expiresAt; this.snapshotDirs = snapshotDirs; + this.ephemeral = ephemeral; } /** @@ -124,6 +126,11 @@ public boolean exists() return snapshotDirs.stream().anyMatch(File::exists); } + public boolean isEphemeral() + { + return ephemeral; + } + public boolean isExpiring() { return expiresAt != null; @@ -193,13 +200,13 @@ public boolean equals(Object o) return Objects.equals(keyspaceName, snapshot.keyspaceName) && Objects.equals(tableName, snapshot.tableName) && Objects.equals(tableId, snapshot.tableId) && Objects.equals(tag, snapshot.tag) && Objects.equals(createdAt, snapshot.createdAt) && Objects.equals(expiresAt, snapshot.expiresAt) && - Objects.equals(snapshotDirs, snapshot.snapshotDirs); + Objects.equals(snapshotDirs, snapshot.snapshotDirs) && Objects.equals(ephemeral, snapshot.ephemeral); } @Override public int hashCode() { - return Objects.hash(keyspaceName, tableName, tableId, tag, createdAt, expiresAt, snapshotDirs); + return Objects.hash(keyspaceName, tableName, tableId, tag, createdAt, expiresAt, snapshotDirs, ephemeral); } @Override @@ -213,6 +220,7 @@ public String toString() ", createdAt=" + createdAt + ", expiresAt=" + expiresAt + ", snapshotDirs=" + snapshotDirs + + ", ephemeral=" + ephemeral + '}'; } @@ -224,6 +232,7 @@ static class Builder { private Instant createdAt = null; private Instant expiresAt = null; + private boolean ephemeral; private final Set snapshotDirs = new HashSet<>(); @@ -239,12 +248,17 @@ void addSnapshotDir(File snapshotDir) { snapshotDirs.add(snapshotDir); 
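 // A snapshot may span several data directories, so the builder is fed one snapshot dir at a time:
 // createdAt/expiresAt are taken from the first manifest encountered, while the ephemeral flag,
 // once set from either a manifest or a legacy marker file, stays set.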
File manifestFile = new File(snapshotDir, "manifest.json");
- if (manifestFile.exists() && createdAt == null && expiresAt == null) {
- loadTimestampsFromManifest(manifestFile);
- }
+ if (manifestFile.exists() && createdAt == null && expiresAt == null)
+ loadMetadataFromManifest(manifestFile);
+
+ // check for the ephemeral marker file only if the snapshot is not already known
+ // to be ephemeral from reading the manifest
+ // TODO remove this on Cassandra 4.3 release, see CASSANDRA-16911
+ if (!ephemeral && new File(snapshotDir, "ephemeral.snapshot").exists())
+ ephemeral = true;
 }

- private void loadTimestampsFromManifest(File manifestFile)
+ private void loadMetadataFromManifest(File manifestFile)
 {
 try
 {
@@ -252,6 +266,9 @@ private void loadTimestampsFromManifest(File manifestFile)
 SnapshotManifest manifest = SnapshotManifest.deserializeFromJsonFile(manifestFile);
 createdAt = manifest.createdAt;
 expiresAt = manifest.expiresAt;
+ // a snapshot may be ephemeral when it has a marker file (old way) or a flag in the manifest (new way)
+ if (!ephemeral)
+ ephemeral = manifest.ephemeral;
 }
 catch (IOException e)
 {
@@ -261,7 +278,7 @@ private void loadTimestampsFromManifest(File manifestFile)
 TableSnapshot build()
 {
- return new TableSnapshot(keyspaceName, tableName, tableId, tag, createdAt, expiresAt, snapshotDirs);
+ return new TableSnapshot(keyspaceName, tableName, tableId, tag, createdAt, expiresAt, snapshotDirs, ephemeral);
 }
 }

diff --git a/src/java/org/apache/cassandra/tools/nodetool/ListSnapshots.java b/src/java/org/apache/cassandra/tools/nodetool/ListSnapshots.java
index b70a7a964c52..803fe5a4f0db 100644
--- a/src/java/org/apache/cassandra/tools/nodetool/ListSnapshots.java
+++ b/src/java/org/apache/cassandra/tools/nodetool/ListSnapshots.java
@@ -40,6 +40,11 @@ public class ListSnapshots extends NodeToolCmd
 description = "Skip snapshots with TTL")
 private boolean noTTL = false;

+ @Option(title = "ephemeral",
+ name = { "-e", "--ephemeral" },
+ description = "Include ephemeral snapshots")
+ private boolean includeEphemeral = false;
+
 @Override
 public void execute(NodeProbe probe)
 {
@@ -50,6 +55,7 @@ public void execute(NodeProbe probe)
 Map options = new HashMap<>();
 options.put("no_ttl", Boolean.toString(noTTL));
+ options.put("include_ephemeral", Boolean.toString(includeEphemeral));
 final Map snapshotDetails = probe.getSnapshotDetails(options);
 if (snapshotDetails.isEmpty())
@@ -62,7 +68,11 @@ public void execute(NodeProbe probe)
 TableBuilder table = new TableBuilder();
 // display column names only once
 final List indexNames = snapshotDetails.entrySet().iterator().next().getValue().getTabularType().getIndexNames();
- table.add(indexNames.toArray(new String[indexNames.size()]));
+
+ if (includeEphemeral)
+ table.add(indexNames.toArray(new String[indexNames.size()]));
+ else
+ table.add(indexNames.subList(0, indexNames.size() - 1).toArray(new String[indexNames.size() - 1]));

 for (final Map.Entry snapshotDetail : snapshotDetails.entrySet())
 {
@@ -70,12 +80,15 @@ public void execute(NodeProbe probe)
 for (Object eachValue : values)
 {
 final List value = (List) eachValue;
- table.add(value.toArray(new String[value.size()]));
+ if (includeEphemeral)
+ table.add(value.toArray(new String[value.size()]));
+ else
+ table.add(value.subList(0, value.size() - 1).toArray(new String[value.size() - 1]));
 }
 }
 table.printTo(out);

- out.println("\nTotal TrueDiskSpaceUsed: " + FileUtils.stringifyFileSize(trueSnapshotsSize) + "\n");
+ out.println("\nTotal TrueDiskSpaceUsed: " + FileUtils.stringifyFileSize(trueSnapshotsSize)
+ '\n'); } catch (Exception e) { diff --git a/test/distributed/org/apache/cassandra/distributed/impl/Instance.java b/test/distributed/org/apache/cassandra/distributed/impl/Instance.java index 65e65794807c..705a7f328959 100644 --- a/test/distributed/org/apache/cassandra/distributed/impl/Instance.java +++ b/test/distributed/org/apache/cassandra/distributed/impl/Instance.java @@ -114,6 +114,7 @@ import org.apache.cassandra.schema.MigrationCoordinator; import org.apache.cassandra.schema.Schema; import org.apache.cassandra.schema.SchemaConstants; +import org.apache.cassandra.schema.TableMetadata; import org.apache.cassandra.service.ActiveRepairService; import org.apache.cassandra.service.CassandraDaemon; import org.apache.cassandra.service.ClientState; @@ -619,6 +620,9 @@ public void startup(ICluster cluster) // Start up virtual table support CassandraDaemon.getInstanceForTesting().setupVirtualKeyspaces(); + // clean up debris in data directories + CassandraDaemon.getInstanceForTesting().scrubDataDirectories(); + Keyspace.setInitialized(); // Replay any CommitLogSegments found on disk diff --git a/test/distributed/org/apache/cassandra/distributed/test/EphemeralSnapshotTest.java b/test/distributed/org/apache/cassandra/distributed/test/EphemeralSnapshotTest.java new file mode 100644 index 000000000000..a9e804d071e3 --- /dev/null +++ b/test/distributed/org/apache/cassandra/distributed/test/EphemeralSnapshotTest.java @@ -0,0 +1,164 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.cassandra.distributed.test; + +import java.nio.file.Files; +import java.nio.file.Path; +import java.nio.file.Paths; + +import com.google.common.util.concurrent.Futures; +import org.junit.Test; + +import org.apache.cassandra.db.Keyspace; +import org.apache.cassandra.distributed.Cluster; +import org.apache.cassandra.distributed.api.IInvokableInstance; +import org.apache.cassandra.distributed.api.IIsolatedExecutor; +import org.apache.cassandra.io.util.File; +import org.apache.cassandra.service.snapshot.SnapshotManifest; +import org.apache.cassandra.utils.Pair; + +import static java.lang.String.format; +import static java.util.concurrent.TimeUnit.SECONDS; +import static org.apache.cassandra.distributed.api.ConsistencyLevel.ONE; +import static org.apache.cassandra.distributed.api.Feature.GOSSIP; +import static org.apache.cassandra.distributed.api.Feature.NATIVE_PROTOCOL; +import static org.apache.cassandra.distributed.api.Feature.NETWORK; +import static org.awaitility.Awaitility.await; +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; + +public class EphemeralSnapshotTest extends TestBaseImpl +{ + private static final String snapshotName = "snapshotname"; + private static final String tableName = "city"; + + @Test + public void testStartupRemovesEphemeralSnapshotOnEphemeralFlagInManifest() throws Exception + { + try (Cluster c = init(builder().withNodes(1) + .withConfig(config -> config.with(GOSSIP, NETWORK, NATIVE_PROTOCOL)) + .start())) + { + Pair initialisationData = initialise(c); + + String tableId = initialisationData.left; + String[] dataDirs = initialisationData.right; + + // rewrite manifest, pretend that it is ephemeral + Path manifestPath = findManifest(dataDirs, tableId); + SnapshotManifest manifest = SnapshotManifest.deserializeFromJsonFile(new File(manifestPath)); + SnapshotManifest manifestWithEphemeralFlag = new SnapshotManifest(manifest.files, null, manifest.createdAt, true); + manifestWithEphemeralFlag.serializeToJsonFile(new File(manifestPath)); + + verify(c.get(1)); + } + } + + // TODO this test might be deleted once we get rid of ephemeral marker file for good in 4.3 + @Test + public void testStartupRemovesEphemeralSnapshotOnMarkerFile() throws Exception + { + try (Cluster c = init(builder().withNodes(1) + .withConfig(config -> config.with(GOSSIP, NETWORK, NATIVE_PROTOCOL)) + .start())) + { + Pair initialisationData = initialise(c); + + String tableId = initialisationData.left; + String[] dataDirs = initialisationData.right; + + // place ephemeral marker file into snapshot directory pretending it was created as ephemeral + Path ephemeralMarkerFile = Paths.get(dataDirs[0]) + .resolve(KEYSPACE) + .resolve(format("%s-%s", tableName, tableId)) + .resolve("snapshots") + .resolve(snapshotName) + .resolve("ephemeral.snapshot"); + + Files.createFile(ephemeralMarkerFile); + + verify(c.get(1)); + } + } + + private Pair initialise(Cluster c) + { + c.schemaChange(withKeyspace("CREATE TABLE IF NOT EXISTS %s." + tableName + " (cityid int PRIMARY KEY, name text)")); + c.coordinator(1).execute(withKeyspace("INSERT INTO %s." + tableName + "(cityid, name) VALUES (1, 'Canberra');"), ONE); + IInvokableInstance instance = c.get(1); + + instance.flush(KEYSPACE); + + assertEquals(0, instance.nodetool("snapshot", "-kt", withKeyspace("%s." 
+ tableName), "-t", snapshotName)); + waitForSnapshot(instance, snapshotName); + + String tableId = instance.callOnInstance((IIsolatedExecutor.SerializableCallable) () -> { + return Keyspace.open(KEYSPACE).getMetadata().tables.get(tableName).get().id.asUUID().toString().replaceAll("-", ""); + }); + + String[] dataDirs = (String[]) instance.config().get("data_file_directories"); + + return Pair.create(tableId, dataDirs); + } + + + private void verify(IInvokableInstance instance) + { + // by default, we do not see ephemerals + assertFalse(instance.nodetoolResult("listsnapshots").getStdout().contains("snapshotname")); + + // we see them via -e flag + assertTrue(instance.nodetoolResult("listsnapshots", "-e").getStdout().contains("snapshotname")); + + Futures.getUnchecked(instance.shutdown()); + + // startup should remove ephemeral marker file + instance.startup(); + + assertFalse(instance.nodetoolResult("listsnapshots", "-e").getStdout().contains("snapshotname")); + } + + private void waitForSnapshot(IInvokableInstance instance, String snapshotName) + { + await().timeout(20, SECONDS) + .pollInterval(1, SECONDS) + .until(() -> instance.nodetoolResult("listsnapshots", "-e").getStdout().contains(snapshotName)); + } + + private Path findManifest(String[] dataDirs, String tableId) + { + for (String dataDir : dataDirs) + { + Path manifest = Paths.get(dataDir) + .resolve(KEYSPACE) + .resolve(format("%s-%s", tableName, tableId)) + .resolve("snapshots") + .resolve(snapshotName) + .resolve("manifest.json"); + + if (Files.exists(manifest)) + { + return manifest; + } + } + + throw new IllegalStateException("Unable to find manifest!"); + } +} diff --git a/test/unit/org/apache/cassandra/db/ColumnFamilyStoreTest.java b/test/unit/org/apache/cassandra/db/ColumnFamilyStoreTest.java index 6d8bd0fe0576..6268d3b5ed29 100644 --- a/test/unit/org/apache/cassandra/db/ColumnFamilyStoreTest.java +++ b/test/unit/org/apache/cassandra/db/ColumnFamilyStoreTest.java @@ -170,7 +170,7 @@ public void runMayThrow() throws IOException } @Test - public void testDeleteStandardRowSticksAfterFlush() throws Throwable + public void testDeleteStandardRowSticksAfterFlush() { // test to make sure flushing after a delete doesn't resurrect delted cols. 
String keyspaceName = KEYSPACE1; @@ -228,7 +228,7 @@ public void testDeleteStandardRowSticksAfterFlush() throws Throwable } @Test - public void testClearEphemeralSnapshots() throws Throwable + public void testClearEphemeralSnapshots() { ColumnFamilyStore cfs = Keyspace.open(KEYSPACE1).getColumnFamilyStore(CF_INDEX1); diff --git a/test/unit/org/apache/cassandra/db/DirectoriesTest.java b/test/unit/org/apache/cassandra/db/DirectoriesTest.java index 252c6306147c..8e47157af3d1 100644 --- a/test/unit/org/apache/cassandra/db/DirectoriesTest.java +++ b/test/unit/org/apache/cassandra/db/DirectoriesTest.java @@ -20,6 +20,7 @@ import java.io.IOException; import java.nio.file.Files; import java.nio.file.Path; +import java.nio.file.Paths; import java.time.Instant; import java.util.ArrayList; import java.util.Arrays; @@ -97,6 +98,7 @@ public class DirectoriesTest public static final String TABLE_NAME = "FakeTable"; public static final String SNAPSHOT1 = "snapshot1"; public static final String SNAPSHOT2 = "snapshot2"; + public static final String SNAPSHOT3 = "snapshot3"; public static final String LEGACY_SNAPSHOT_NAME = "42"; private static File tempDataDir; @@ -105,7 +107,7 @@ public class DirectoriesTest private static Set CFM; private static Map> sstablesByTableName; - @Parameterized.Parameter(0) + @Parameterized.Parameter public SSTableId.Builder idBuilder; @Parameterized.Parameter(1) @@ -151,7 +153,7 @@ public void beforeTest() throws IOException @AfterClass public static void afterClass() { - FileUtils.deleteRecursive(tempDataDir); + tempDataDir.deleteRecursive(); } private static DataDirectory[] toDataDirectories(File location) @@ -159,7 +161,7 @@ private static DataDirectory[] toDataDirectories(File location) return new DataDirectory[] { new DataDirectory(location) }; } - private void createTestFiles() throws IOException + private void createTestFiles() { for (TableMetadata cfm : CFM) { @@ -181,25 +183,27 @@ private void createTestFiles() throws IOException } } - class FakeSnapshot { + static class FakeSnapshot { final TableMetadata table; final String tag; final File snapshotDir; final SnapshotManifest manifest; + final boolean ephemeral; - FakeSnapshot(TableMetadata table, String tag, File snapshotDir, SnapshotManifest manifest) + FakeSnapshot(TableMetadata table, String tag, File snapshotDir, SnapshotManifest manifest, boolean ephemeral) { this.table = table; this.tag = tag; this.snapshotDir = snapshotDir; this.manifest = manifest; + this.ephemeral = ephemeral; } public TableSnapshot asTableSnapshot() { Instant createdAt = manifest == null ? null : manifest.createdAt; Instant expiresAt = manifest == null ? 
null : manifest.expiresAt;
- return new TableSnapshot(table.keyspace, table.name, table.id.asUUID(), tag, createdAt, expiresAt, Collections.singleton(snapshotDir));
+ return new TableSnapshot(table.keyspace, table.name, table.id.asUUID(), tag, createdAt, expiresAt, Collections.singleton(snapshotDir), ephemeral);
 }
 }

@@ -211,7 +215,7 @@ private TableMetadata createFakeTable(String table)
 .build();
 }

- public FakeSnapshot createFakeSnapshot(TableMetadata table, String tag, boolean createManifest) throws IOException
+ public FakeSnapshot createFakeSnapshot(TableMetadata table, String tag, boolean createManifest, boolean ephemeral) throws IOException
 {
 File tableDir = cfDir(table);
 tableDir.tryCreateDirectories();
@@ -225,11 +229,15 @@ public FakeSnapshot createFakeSnapshot(TableMetadata table, String tag, boolean
 if (createManifest)
 {
 File manifestFile = Directories.getSnapshotManifestFile(snapshotDir);
- manifest = new SnapshotManifest(Collections.singletonList(sstableDesc.filenameFor(Component.DATA)), new DurationSpec.IntSecondsBound("1m"), now());
+ manifest = new SnapshotManifest(Collections.singletonList(sstableDesc.filenameFor(Component.DATA)), new DurationSpec.IntSecondsBound("1m"), now(), ephemeral);
 manifest.serializeToJsonFile(manifestFile);
 }
+ else if (ephemeral)
+ {
+ Files.createFile(snapshotDir.toPath().resolve("ephemeral.snapshot"));
+ }

- return new FakeSnapshot(table, tag, snapshotDir, manifest);
+ return new FakeSnapshot(table, tag, snapshotDir, manifest, ephemeral);
 }

 private List createFakeSSTable(File dir, String cf, int gen)
@@ -269,7 +277,7 @@ private static File cfDir(TableMetadata metadata)
 }

 @Test
- public void testStandardDirs() throws IOException
+ public void testStandardDirs()
 {
 for (TableMetadata cfm : CFM)
 {
@@ -296,22 +304,27 @@ public void testListSnapshots() throws Exception {
 assertThat(directories.listSnapshots()).isEmpty();

 // Create snapshot with and without manifest
- FakeSnapshot snapshot1 = createFakeSnapshot(fakeTable, SNAPSHOT1, true);
- FakeSnapshot snapshot2 = createFakeSnapshot(fakeTable, SNAPSHOT2, false);
+ FakeSnapshot snapshot1 = createFakeSnapshot(fakeTable, SNAPSHOT1, true, false);
+ FakeSnapshot snapshot2 = createFakeSnapshot(fakeTable, SNAPSHOT2, false, false);
+ // ephemeral without manifest
+ FakeSnapshot snapshot3 = createFakeSnapshot(fakeTable, SNAPSHOT3, false, true);

 // Both snapshots should be present
 Map snapshots = directories.listSnapshots();
- assertThat(snapshots.keySet()).isEqualTo(Sets.newHashSet(SNAPSHOT1, SNAPSHOT2));
+ assertThat(snapshots.keySet()).isEqualTo(Sets.newHashSet(SNAPSHOT1, SNAPSHOT2, SNAPSHOT3));
 assertThat(snapshots.get(SNAPSHOT1)).isEqualTo(snapshot1.asTableSnapshot());
 assertThat(snapshots.get(SNAPSHOT2)).isEqualTo(snapshot2.asTableSnapshot());
+ assertThat(snapshots.get(SNAPSHOT3)).isEqualTo(snapshot3.asTableSnapshot());

 // Now remove snapshot1
- FileUtils.deleteRecursive(snapshot1.snapshotDir);
+ snapshot1.snapshotDir.deleteRecursive();

- // Only snapshot 2 should be present
+ // Only snapshot 2 and 3 should be present
 snapshots = directories.listSnapshots();
- assertThat(snapshots.keySet()).isEqualTo(Sets.newHashSet(SNAPSHOT2));
+ assertThat(snapshots.keySet()).isEqualTo(Sets.newHashSet(SNAPSHOT2, SNAPSHOT3));
 assertThat(snapshots.get(SNAPSHOT2)).isEqualTo(snapshot2.asTableSnapshot());
+ assertThat(snapshots.get(SNAPSHOT3)).isEqualTo(snapshot3.asTableSnapshot());
+ assertThat(snapshots.get(SNAPSHOT3).isEphemeral()).isTrue();
 }

 @Test
@@ -322,21 +335,23 @@ public void testListSnapshotDirsByTag() throws
Exception { assertThat(directories.listSnapshotDirsByTag()).isEmpty(); // Create snapshot with and without manifest - FakeSnapshot snapshot1 = createFakeSnapshot(fakeTable, SNAPSHOT1, true); - FakeSnapshot snapshot2 = createFakeSnapshot(fakeTable, SNAPSHOT2, false); + FakeSnapshot snapshot1 = createFakeSnapshot(fakeTable, SNAPSHOT1, true, false); + FakeSnapshot snapshot2 = createFakeSnapshot(fakeTable, SNAPSHOT2, false, false); + FakeSnapshot snapshot3 = createFakeSnapshot(fakeTable, SNAPSHOT3, false, true); // Both snapshots should be present Map> snapshotDirs = directories.listSnapshotDirsByTag(); - assertThat(snapshotDirs.keySet()).isEqualTo(Sets.newHashSet(SNAPSHOT1, SNAPSHOT2)); + assertThat(snapshotDirs.keySet()).isEqualTo(Sets.newHashSet(SNAPSHOT1, SNAPSHOT2, SNAPSHOT3)); assertThat(snapshotDirs.get(SNAPSHOT1)).allMatch(snapshotDir -> snapshotDir.equals(snapshot1.snapshotDir)); assertThat(snapshotDirs.get(SNAPSHOT2)).allMatch(snapshotDir -> snapshotDir.equals(snapshot2.snapshotDir)); + assertThat(snapshotDirs.get(SNAPSHOT3)).allMatch(snapshotDir -> snapshotDir.equals(snapshot3.snapshotDir)); // Now remove snapshot1 - FileUtils.deleteRecursive(snapshot1.snapshotDir); + snapshot1.snapshotDir.deleteRecursive(); - // Only snapshot 2 should be present + // Only snapshot 2 and 3 should be present snapshotDirs = directories.listSnapshotDirsByTag(); - assertThat(snapshotDirs.keySet()).isEqualTo(Sets.newHashSet(SNAPSHOT2)); + assertThat(snapshotDirs.keySet()).isEqualTo(Sets.newHashSet(SNAPSHOT2, SNAPSHOT3)); } @Test @@ -353,7 +368,7 @@ public void testMaybeManifestLoading() throws Exception { File manifestFile = directories.getSnapshotManifestFile(tag); - SnapshotManifest manifest = new SnapshotManifest(files, new DurationSpec.IntSecondsBound("1m"), now()); + SnapshotManifest manifest = new SnapshotManifest(files, new DurationSpec.IntSecondsBound("1m"), now(), false); manifest.serializeToJsonFile(manifestFile); Set dirs = new HashSet<>(); @@ -488,7 +503,7 @@ else if (f.name().contains("tmp-")) } @Test - public void testTemporaryFile() throws IOException + public void testTemporaryFile() { for (TableMetadata cfm : CFM) { @@ -552,11 +567,10 @@ public void testMTSnapshots() throws Exception final Directories directories = new Directories(cfm, toDataDirectories(tempDataDir)); assertEquals(cfDir(cfm), directories.getDirectoryForNewSSTables()); final String n = Long.toString(nanoTime()); - Callable directoryGetter = new Callable() { - public File call() throws Exception { - Descriptor desc = new Descriptor(cfDir(cfm), KS, cfm.name, sstableId(1), SSTableFormat.Type.BIG); - return Directories.getSnapshotDirectory(desc, n); - } + Callable directoryGetter = () -> + { + Descriptor desc = new Descriptor(cfDir(cfm), KS, cfm.name, sstableId(1), SSTableFormat.Type.BIG); + return Directories.getSnapshotDirectory(desc, n); }; List> invoked = Executors.newFixedThreadPool(2).invokeAll(Arrays.asList(directoryGetter, directoryGetter)); for(Future fut:invoked) { diff --git a/test/unit/org/apache/cassandra/service/snapshot/SnapshotLoaderTest.java b/test/unit/org/apache/cassandra/service/snapshot/SnapshotLoaderTest.java index 21ab7d9b1162..a4a6ad2bad8d 100644 --- a/test/unit/org/apache/cassandra/service/snapshot/SnapshotLoaderTest.java +++ b/test/unit/org/apache/cassandra/service/snapshot/SnapshotLoaderTest.java @@ -27,6 +27,7 @@ import java.util.UUID; import java.util.concurrent.ThreadLocalRandom; +import org.junit.Assert; import org.junit.ClassRule; import org.junit.Test; import org.junit.rules.TemporaryFolder; 
@@ -123,9 +124,40 @@ public void testSnapshotsWithoutManifests() throws IOException Paths.get(baseDir.toString(), DATA_DIR_3))); Set snapshots = loader.loadSnapshots(); assertThat(snapshots).hasSize(3); - assertThat(snapshots).contains(new TableSnapshot(KEYSPACE_1, TABLE1_NAME, TABLE1_ID, TAG1, null, null, tag1Files)); - assertThat(snapshots).contains(new TableSnapshot(KEYSPACE_1, TABLE2_NAME, TABLE2_ID, TAG2, null, null, tag2Files)); - assertThat(snapshots).contains(new TableSnapshot(KEYSPACE_2, TABLE3_NAME, TABLE3_ID, TAG3, null, null, tag3Files)); + assertThat(snapshots).contains(new TableSnapshot(KEYSPACE_1, TABLE1_NAME, TABLE1_ID, TAG1, null, null, tag1Files, false)); + assertThat(snapshots).contains(new TableSnapshot(KEYSPACE_1, TABLE2_NAME, TABLE2_ID, TAG2, null, null, tag2Files, false)); + assertThat(snapshots).contains(new TableSnapshot(KEYSPACE_2, TABLE3_NAME, TABLE3_ID, TAG3, null, null, tag3Files, false)); + } + + @Test + public void testEphemeralSnapshotWithoutManifest() throws IOException + { + Set tag1Files = new HashSet<>(); + + // Create one snapshot per table - without manifests: + // - ks1.t1 : tag1 + File baseDir = new File(tmpDir.newFolder()); + boolean ephemeralFileCreated = false; + for (String dataDir : DATA_DIRS) + { + File dir = createDir(baseDir, dataDir, KEYSPACE_1, tableDirName(TABLE1_NAME, TABLE1_ID), Directories.SNAPSHOT_SUBDIR, TAG1); + tag1Files.add(dir); + if (!ephemeralFileCreated) + { + createEphemeralMarkerFile(dir); + ephemeralFileCreated = true; + } + } + + // Verify snapshot is found correctly from data directories + SnapshotLoader loader = new SnapshotLoader(Arrays.asList(Paths.get(baseDir.toString(), DATA_DIR_1), + Paths.get(baseDir.toString(), DATA_DIR_2), + Paths.get(baseDir.toString(), DATA_DIR_3))); + + Set snapshots = loader.loadSnapshots(); + assertThat(snapshots).hasSize(1); + assertThat(snapshots).contains(new TableSnapshot(KEYSPACE_1, TABLE1_NAME, TABLE1_ID, TAG1, null, null, tag1Files, true)); + Assert.assertTrue(snapshots.stream().findFirst().get().isEphemeral()); } @Test @@ -169,9 +201,9 @@ public void testSnapshotsWithManifests() throws IOException Paths.get(baseDir.toString(), DATA_DIR_3))); Set snapshots = loader.loadSnapshots(); assertThat(snapshots).hasSize(3); - assertThat(snapshots).contains(new TableSnapshot(KEYSPACE_1, TABLE1_NAME, TABLE1_ID, TAG1, tag1Ts, null, tag1Files)); - assertThat(snapshots).contains(new TableSnapshot(KEYSPACE_1, TABLE2_NAME, TABLE2_ID, TAG2, tag2Ts, tag2Ts.plusSeconds(tag2Ttl.toSeconds()), tag2Files)); - assertThat(snapshots).contains(new TableSnapshot(KEYSPACE_2, TABLE3_NAME, TABLE3_ID, TAG3, tag3Ts, null, tag3Files)); + assertThat(snapshots).contains(new TableSnapshot(KEYSPACE_1, TABLE1_NAME, TABLE1_ID, TAG1, tag1Ts, null, tag1Files, false)); + assertThat(snapshots).contains(new TableSnapshot(KEYSPACE_1, TABLE2_NAME, TABLE2_ID, TAG2, tag2Ts, tag2Ts.plusSeconds(tag2Ttl.toSeconds()), tag2Files, false)); + assertThat(snapshots).contains(new TableSnapshot(KEYSPACE_2, TABLE3_NAME, TABLE3_ID, TAG3, tag3Ts, null, tag3Files, false)); } @Test @@ -208,7 +240,7 @@ public void testParseUUID() private void writeManifest(File snapshotDir, Instant creationTime, DurationSpec.IntSecondsBound ttl) throws IOException { - SnapshotManifest manifest = new SnapshotManifest(Lists.newArrayList("f1", "f2", "f3"), ttl, creationTime); + SnapshotManifest manifest = new SnapshotManifest(Lists.newArrayList("f1", "f2", "f3"), ttl, creationTime, false); manifest.serializeToJsonFile(getManifestFile(snapshotDir)); } @@ -219,6 +251,11 
@@ private static File createDir(File baseDir, String... subdirs) return file; } + private static void createEphemeralMarkerFile(File dir) + { + Assert.assertTrue(new File(dir, "ephemeral.snapshot").createFileIfNotExists()); + } + static String tableDirName(String tableName, UUID tableId) { return String.format("%s-%s", tableName, removeDashes(tableId)); diff --git a/test/unit/org/apache/cassandra/service/snapshot/SnapshotManagerTest.java b/test/unit/org/apache/cassandra/service/snapshot/SnapshotManagerTest.java index 0c078d47da15..eeb3b63fc3a8 100644 --- a/test/unit/org/apache/cassandra/service/snapshot/SnapshotManagerTest.java +++ b/test/unit/org/apache/cassandra/service/snapshot/SnapshotManagerTest.java @@ -50,7 +50,7 @@ public static void beforeClass() @ClassRule public static TemporaryFolder temporaryFolder = new TemporaryFolder(); - private TableSnapshot generateSnapshotDetails(String tag, Instant expiration) throws Exception { + private TableSnapshot generateSnapshotDetails(String tag, Instant expiration, boolean ephemeral) throws Exception { return new TableSnapshot( "ks", "tbl", @@ -58,15 +58,16 @@ private TableSnapshot generateSnapshotDetails(String tag, Instant expiration) th tag, Instant.EPOCH, expiration, - createFolders(temporaryFolder) + createFolders(temporaryFolder), + ephemeral ); } @Test public void testLoadSnapshots() throws Exception { - TableSnapshot expired = generateSnapshotDetails("expired", Instant.EPOCH); - TableSnapshot nonExpired = generateSnapshotDetails("non-expired", now().plusSeconds(ONE_DAY_SECS)); - TableSnapshot nonExpiring = generateSnapshotDetails("non-expiring", null); + TableSnapshot expired = generateSnapshotDetails("expired", Instant.EPOCH, false); + TableSnapshot nonExpired = generateSnapshotDetails("non-expired", now().plusSeconds(ONE_DAY_SECS), false); + TableSnapshot nonExpiring = generateSnapshotDetails("non-expiring", null, false); List snapshots = Arrays.asList(expired, nonExpired, nonExpiring); // Create SnapshotManager with 3 snapshots: expired, non-expired and non-expiring @@ -84,9 +85,9 @@ public void testClearExpiredSnapshots() throws Exception { SnapshotManager manager = new SnapshotManager(3, 3); // Add 3 snapshots: expired, non-expired and non-expiring - TableSnapshot expired = generateSnapshotDetails("expired", Instant.EPOCH); - TableSnapshot nonExpired = generateSnapshotDetails("non-expired", now().plusMillis(ONE_DAY_SECS)); - TableSnapshot nonExpiring = generateSnapshotDetails("non-expiring", null); + TableSnapshot expired = generateSnapshotDetails("expired", Instant.EPOCH, false); + TableSnapshot nonExpired = generateSnapshotDetails("non-expired", now().plusMillis(ONE_DAY_SECS), false); + TableSnapshot nonExpiring = generateSnapshotDetails("non-expiring", null, false); manager.addSnapshot(expired); manager.addSnapshot(nonExpired); manager.addSnapshot(nonExpiring); @@ -118,8 +119,8 @@ public void testScheduledCleanup() throws Exception { // Add 2 expiring snapshots: one to expire in 2 seconds, another in 1 day int TTL_SECS = 2; - TableSnapshot toExpire = generateSnapshotDetails("to-expire", now().plusSeconds(TTL_SECS)); - TableSnapshot nonExpired = generateSnapshotDetails("non-expired", now().plusMillis(ONE_DAY_SECS)); + TableSnapshot toExpire = generateSnapshotDetails("to-expire", now().plusSeconds(TTL_SECS), false); + TableSnapshot nonExpired = generateSnapshotDetails("non-expired", now().plusMillis(ONE_DAY_SECS), false); manager.addSnapshot(toExpire); manager.addSnapshot(nonExpired); @@ -150,7 +151,7 @@ public void 
testClearSnapshot() throws Exception { // Given SnapshotManager manager = new SnapshotManager(1, 3); - TableSnapshot expiringSnapshot = generateSnapshotDetails("snapshot", now().plusMillis(50000)); + TableSnapshot expiringSnapshot = generateSnapshotDetails("snapshot", now().plusMillis(50000), false); manager.addSnapshot(expiringSnapshot); assertThat(manager.getExpiringSnapshots()).contains(expiringSnapshot); assertThat(expiringSnapshot.exists()).isTrue(); diff --git a/test/unit/org/apache/cassandra/service/snapshot/SnapshotManifestTest.java b/test/unit/org/apache/cassandra/service/snapshot/SnapshotManifestTest.java index d3b11c0643a0..5eac6a2b78b6 100644 --- a/test/unit/org/apache/cassandra/service/snapshot/SnapshotManifestTest.java +++ b/test/unit/org/apache/cassandra/service/snapshot/SnapshotManifestTest.java @@ -108,7 +108,7 @@ public void testIngoredFields() throws IOException { @Test public void testSerializeAndDeserialize() throws Exception { - SnapshotManifest manifest = new SnapshotManifest(Arrays.asList("db1", "db2", "db3"), new DurationSpec.IntSecondsBound("2m"), Instant.ofEpochMilli(currentTimeMillis())); + SnapshotManifest manifest = new SnapshotManifest(Arrays.asList("db1", "db2", "db3"), new DurationSpec.IntSecondsBound("2m"), Instant.ofEpochMilli(currentTimeMillis()), false); File manifestFile = new File(tempFolder.newFile("manifest.json")); manifest.serializeToJsonFile(manifestFile); diff --git a/test/unit/org/apache/cassandra/service/snapshot/TableSnapshotTest.java b/test/unit/org/apache/cassandra/service/snapshot/TableSnapshotTest.java index 4bb1756c3183..c1614dffdc4c 100644 --- a/test/unit/org/apache/cassandra/service/snapshot/TableSnapshotTest.java +++ b/test/unit/org/apache/cassandra/service/snapshot/TableSnapshotTest.java @@ -74,7 +74,9 @@ public void testSnapshotExists() throws IOException "some", null, null, - folders); + folders, + false + ); assertThat(snapshot.exists()).isTrue(); @@ -95,7 +97,9 @@ public void testSnapshotExpiring() throws IOException "some", null, null, - folders); + folders, + false + ); assertThat(snapshot.isExpiring()).isFalse(); assertThat(snapshot.isExpired(now())).isFalse(); @@ -107,7 +111,9 @@ public void testSnapshotExpiring() throws IOException "some", now(), null, - folders); + folders, + false + ); assertThat(snapshot.isExpiring()).isFalse(); assertThat(snapshot.isExpired(now())).isFalse(); @@ -119,7 +125,9 @@ public void testSnapshotExpiring() throws IOException "some", now(), now().plusSeconds(1000), - folders); + folders, + false + ); assertThat(snapshot.isExpiring()).isTrue(); assertThat(snapshot.isExpired(now())).isFalse(); @@ -131,7 +139,8 @@ public void testSnapshotExpiring() throws IOException "some", now(), now().minusSeconds(1000), - folders); + folders, + false); assertThat(snapshot.isExpiring()).isTrue(); assertThat(snapshot.isExpired(now())).isTrue(); @@ -159,7 +168,8 @@ public void testComputeSizeOnDisk() throws IOException "some", null, null, - folders); + folders, + false); Long res = 0L; @@ -185,7 +195,9 @@ public void testComputeTrueSize() throws IOException "some", null, null, - folders); + folders, + false + ); Long res = 0L; @@ -214,7 +226,10 @@ public void testGetCreatedAt() throws IOException "some1", createdAt, null, - folders); + folders, + false + ); + assertThat(withCreatedAt.getCreatedAt()).isEqualTo(createdAt); // When createdAt is null, it should return the snapshot folder minimum update time @@ -225,7 +240,10 @@ public void testGetCreatedAt() throws IOException "some1", null, null, - folders); + folders, 
+ false + ); + assertThat(withoutCreatedAt.getCreatedAt()).isEqualTo(Instant.ofEpochMilli(folders.stream().mapToLong(f -> f.lastModified()).min().getAsLong())); } From 91b86487fe5389765711b3266097fb3faa84e5dd Mon Sep 17 00:00:00 2001 From: Josh McKenzie Date: Thu, 21 Jul 2022 14:59:36 -0400 Subject: [PATCH 024/159] Add guardrail to disallow DROP KEYSPACE commands for non superuser accounts Patch by Josh McKenzie; reviewed by Aleksey Yeschenko for CASSANDRA-17767 --- CHANGES.txt | 1 + NEWS.txt | 1 + conf/cassandra.yaml | 3 + .../org/apache/cassandra/config/Config.java | 1 + .../cassandra/config/GuardrailsOptions.java | 14 ++++ .../schema/DropKeyspaceStatement.java | 3 + .../cassandra/db/guardrails/Guardrails.java | 23 ++++++ .../db/guardrails/GuardrailsConfig.java | 7 ++ .../db/guardrails/GuardrailsMBean.java | 12 +++ .../guardrails/GuardrailDropKeyspaceTest.java | 79 +++++++++++++++++++ 10 files changed, 144 insertions(+) create mode 100644 test/unit/org/apache/cassandra/db/guardrails/GuardrailDropKeyspaceTest.java diff --git a/CHANGES.txt b/CHANGES.txt index b75eddd30e1f..49cb5cc679b9 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -1,4 +1,5 @@ 4.2 + * Add guardrail to disallow DROP KEYSPACE commands (CASSANDRA-17767) * Remove ephemeral snapshot marker file and introduce a flag to SnapshotManifest (CASSANDRA-16911) * Add a virtual table that exposes currently running queries (CASSANDRA-15241) * Allow sstableloader to specify table without relying on path (CASSANDRA-16584) diff --git a/NEWS.txt b/NEWS.txt index 9361b29a5e85..87946a9b3c80 100644 --- a/NEWS.txt +++ b/NEWS.txt @@ -70,6 +70,7 @@ New features - Whether ALTER TABLE commands are allowed to mutate columns - Whether SimpleStrategy is allowed on keyspace creation or alteration - Maximum replication factor + - Whether DROP KEYSPACE commands are allowed. - It is possible to list ephemeral snapshots by nodetool listsnaphots command when flag "-e" is specified. Upgrading diff --git a/conf/cassandra.yaml b/conf/cassandra.yaml index a4e57307410a..b193a2a699ca 100644 --- a/conf/cassandra.yaml +++ b/conf/cassandra.yaml @@ -1724,6 +1724,9 @@ drop_compact_storage_enabled: false # Guardrail to allow/disallow TRUNCATE and DROP TABLE statements # drop_truncate_table_enabled: true # +# Guardrail to allow/disallow DROP KEYSPACE statements +# drop_keyspace_enabled: true +# # Guardrail to warn or fail when using a page size greater than threshold. # The two thresholds default to -1 to disable. 
# page_size_warn_threshold: -1 diff --git a/src/java/org/apache/cassandra/config/Config.java b/src/java/org/apache/cassandra/config/Config.java index fb407ee202e6..dc752ffaa7e1 100644 --- a/src/java/org/apache/cassandra/config/Config.java +++ b/src/java/org/apache/cassandra/config/Config.java @@ -831,6 +831,7 @@ public static void setClientMode(boolean clientMode) public volatile boolean alter_table_enabled = true; public volatile boolean group_by_enabled = true; public volatile boolean drop_truncate_table_enabled = true; + public volatile boolean drop_keyspace_enabled = true; public volatile boolean secondary_indexes_enabled = true; public volatile boolean uncompressed_tables_enabled = true; public volatile boolean compact_tables_enabled = true; diff --git a/src/java/org/apache/cassandra/config/GuardrailsOptions.java b/src/java/org/apache/cassandra/config/GuardrailsOptions.java index 98d14a1d3266..e84e0e2a9f94 100644 --- a/src/java/org/apache/cassandra/config/GuardrailsOptions.java +++ b/src/java/org/apache/cassandra/config/GuardrailsOptions.java @@ -344,6 +344,20 @@ public void setDropTruncateTableEnabled(boolean enabled) x -> config.drop_truncate_table_enabled = x); } + @Override + public boolean getDropKeyspaceEnabled() + { + return config.drop_keyspace_enabled; + } + + public void setDropKeyspaceEnabled(boolean enabled) + { + updatePropertyWithLogging("drop_keyspace_enabled", + enabled, + () -> config.drop_keyspace_enabled, + x -> config.drop_keyspace_enabled = x); + } + @Override public boolean getSecondaryIndexesEnabled() { diff --git a/src/java/org/apache/cassandra/cql3/statements/schema/DropKeyspaceStatement.java b/src/java/org/apache/cassandra/cql3/statements/schema/DropKeyspaceStatement.java index f2bd30b249df..47e514a527fe 100644 --- a/src/java/org/apache/cassandra/cql3/statements/schema/DropKeyspaceStatement.java +++ b/src/java/org/apache/cassandra/cql3/statements/schema/DropKeyspaceStatement.java @@ -21,6 +21,7 @@ import org.apache.cassandra.audit.AuditLogEntryType; import org.apache.cassandra.auth.Permission; import org.apache.cassandra.cql3.CQLStatement; +import org.apache.cassandra.db.guardrails.Guardrails; import org.apache.cassandra.schema.Keyspaces; import org.apache.cassandra.schema.Keyspaces.KeyspacesDiff; import org.apache.cassandra.service.ClientState; @@ -39,6 +40,8 @@ public DropKeyspaceStatement(String keyspaceName, boolean ifExists) public Keyspaces apply(Keyspaces schema) { + Guardrails.dropKeyspaceEnabled.ensureEnabled(state); + if (schema.containsKeyspace(keyspaceName)) return schema.without(keyspaceName); diff --git a/src/java/org/apache/cassandra/db/guardrails/Guardrails.java b/src/java/org/apache/cassandra/db/guardrails/Guardrails.java index 36bb3d446f6b..1381655955a7 100644 --- a/src/java/org/apache/cassandra/db/guardrails/Guardrails.java +++ b/src/java/org/apache/cassandra/db/guardrails/Guardrails.java @@ -153,11 +153,22 @@ public final class Guardrails implements GuardrailsMBean state -> CONFIG_PROVIDER.getOrCreate(state).getAlterTableEnabled(), "User access to ALTER TABLE statement for column mutation"); + /** + * Guardrail disabling DROP / TRUNCATE TABLE behavior + */ public static final EnableFlag dropTruncateTableEnabled = new EnableFlag("drop_truncate_table_enabled", state -> CONFIG_PROVIDER.getOrCreate(state).getDropTruncateTableEnabled(), "DROP and TRUNCATE TABLE functionality"); + /** + * Guardrail disabling DROP KEYSPACE behavior + */ + public static final EnableFlag dropKeyspaceEnabled = + new EnableFlag("drop_keyspace_enabled", + state -> 
CONFIG_PROVIDER.getOrCreate(state).getDropKeyspaceEnabled(), + "DROP KEYSPACE functionality"); + /** * Guardrail disabling user's ability to turn off compression */ @@ -648,6 +659,18 @@ public void setDropTruncateTableEnabled(boolean enabled) DEFAULT_CONFIG.setDropTruncateTableEnabled(enabled); } + @Override + public boolean getDropKeyspaceEnabled() + { + return DEFAULT_CONFIG.getDropKeyspaceEnabled(); + } + + @Override + public void setDropKeyspaceEnabled(boolean enabled) + { + DEFAULT_CONFIG.setDropKeyspaceEnabled(enabled); + } + @Override public int getPageSizeWarnThreshold() { diff --git a/src/java/org/apache/cassandra/db/guardrails/GuardrailsConfig.java b/src/java/org/apache/cassandra/db/guardrails/GuardrailsConfig.java index c7067b53e048..d21b8992419c 100644 --- a/src/java/org/apache/cassandra/db/guardrails/GuardrailsConfig.java +++ b/src/java/org/apache/cassandra/db/guardrails/GuardrailsConfig.java @@ -167,6 +167,13 @@ public interface GuardrailsConfig */ boolean getDropTruncateTableEnabled(); + /** + * Returns whether DROP on keyspaces is allowed + * + * @return {@code true} if allowed, {@code false} otherwise. + */ + boolean getDropKeyspaceEnabled(); + /** * @return The threshold to warn when page size exceeds given size. */ diff --git a/src/java/org/apache/cassandra/db/guardrails/GuardrailsMBean.java b/src/java/org/apache/cassandra/db/guardrails/GuardrailsMBean.java index dc3fb48e228f..e410d5c16f14 100644 --- a/src/java/org/apache/cassandra/db/guardrails/GuardrailsMBean.java +++ b/src/java/org/apache/cassandra/db/guardrails/GuardrailsMBean.java @@ -304,6 +304,18 @@ public interface GuardrailsMBean */ void setDropTruncateTableEnabled(boolean enabled); + /** + * Returns whether users can DROP a keyspace + * + * @return {@code true} if allowed, {@code false} otherwise. + */ + boolean getDropKeyspaceEnabled(); + + /** + * Sets whether users can DROP a keyspace + */ + void setDropKeyspaceEnabled(boolean enabled); + /** * @return The threshold to warn when requested page size greater than threshold. * -1 means disabled. diff --git a/test/unit/org/apache/cassandra/db/guardrails/GuardrailDropKeyspaceTest.java b/test/unit/org/apache/cassandra/db/guardrails/GuardrailDropKeyspaceTest.java new file mode 100644 index 000000000000..de447259fee6 --- /dev/null +++ b/test/unit/org/apache/cassandra/db/guardrails/GuardrailDropKeyspaceTest.java @@ -0,0 +1,79 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.cassandra.db.guardrails; + +import org.junit.After; +import org.junit.Test; + +public class GuardrailDropKeyspaceTest extends GuardrailTester +{ + private String keyspaceQuery = "CREATE KEYSPACE dkdt WITH replication = {'class': 'SimpleStrategy', 'replication_factor' : 1}"; + + private void setGuardrail(boolean enabled) + { + Guardrails.instance.setDropKeyspaceEnabled(enabled); + } + + public GuardrailDropKeyspaceTest() + { + super(Guardrails.dropKeyspaceEnabled); + } + + @After + public void afterTest() throws Throwable + { + setGuardrail(true); + execute("DROP KEYSPACE IF EXISTS dkdt"); + } + + @Test + public void testCanDropWhileFeatureEnabled() throws Throwable + { + setGuardrail(true); + createKeyspace(keyspaceQuery); + execute("DROP KEYSPACE dkdt"); + } + + @Test + public void testCannotDropWhileFeatureDisabled() throws Throwable + { + setGuardrail(false); + createKeyspace(keyspaceQuery); + assertFails("DROP KEYSPACE dkdt", "DROP KEYSPACE functionality is not allowed"); + } + + @Test + public void testIfExistsDoesNotBypassCheck() throws Throwable + { + setGuardrail(false); + createKeyspace(keyspaceQuery); + assertFails("DROP KEYSPACE IF EXISTS dkdt", "DROP KEYSPACE functionality is not allowed"); + } + + @Test + public void testToggle() throws Throwable + { + setGuardrail(false); + createKeyspace(keyspaceQuery); + assertFails("DROP KEYSPACE IF EXISTS dkdt", "DROP KEYSPACE functionality is not allowed"); + + setGuardrail(true); + execute("DROP KEYSPACE dkdt"); + } +} From a57eae67e5d73f8ab3fd0ab172262380c8dc0280 Mon Sep 17 00:00:00 2001 From: Josh McKenzie Date: Fri, 22 Jul 2022 13:34:41 -0400 Subject: [PATCH 025/159] Improve javadoc on CQLTester and GuardrailTester assertion methods Patch by Josh McKenzie; reviewed by Andres de la Pena for CASSANDRA-17772 --- .../org/apache/cassandra/cql3/CQLTester.java | 15 ++++++++++++-- .../db/guardrails/GuardrailTester.java | 20 +++++++++++++++++++ 2 files changed, 33 insertions(+), 2 deletions(-) diff --git a/test/unit/org/apache/cassandra/cql3/CQLTester.java b/test/unit/org/apache/cassandra/cql3/CQLTester.java index eae939b8a912..7c2eebb8c56a 100644 --- a/test/unit/org/apache/cassandra/cql3/CQLTester.java +++ b/test/unit/org/apache/cassandra/cql3/CQLTester.java @@ -1322,6 +1322,10 @@ public UntypedResultSet executeView(String query, Object... values) throws Throw return executeFormattedQuery(formatViewQuery(KEYSPACE, query), values); } + /** + * Executes the provided query using the {@link ClientState#forInternalCalls()} as the expected ClientState. Note: + * this means permissions checking will not apply and queries will proceed regardless of role or guardrails. + */ protected UntypedResultSet executeFormattedQuery(String query, Object... values) throws Throwable { UntypedResultSet rs; @@ -1738,8 +1742,15 @@ protected void assertInvalidThrowMessage(String errorMessage, Class Integer.MIN_VALUE is supplied, executes - // the query via the java driver, mimicking a real client. + /** + * Asserts that the query provided throws the exceptions provided. + * + * NOTE: This method uses {@link ClientState#forInternalCalls()} which sets the {@link ClientState#isInternal} value + * to true, nullifying any system keyspace or other permissions checking for tables. + * + * If a protocol version > Integer.MIN_VALUE is supplied, executes + * the query via the java driver, mimicking a real client. 
+ */ protected void assertInvalidThrowMessage(Optional protocolVersion, String errorMessage, Class exception, diff --git a/test/unit/org/apache/cassandra/db/guardrails/GuardrailTester.java b/test/unit/org/apache/cassandra/db/guardrails/GuardrailTester.java index 7c94702a2173..54523743b79e 100644 --- a/test/unit/org/apache/cassandra/db/guardrails/GuardrailTester.java +++ b/test/unit/org/apache/cassandra/db/guardrails/GuardrailTester.java @@ -67,6 +67,18 @@ import static org.junit.Assert.assertTrue; import static org.junit.Assert.fail; +/** + * This class provides specific utility methods for testing Guardrails that should be used instead of the provided + * {@link CQLTester} methods. Many of the methods in CQLTester don't respect the {@link ClientState} provided for a query + * and instead use {@link ClientState#forInternalCalls()} which flags the query as internal and thus bypasses auth and + * guardrail checks. + * + * Some GuardrailTester methods and their usage are as follows: + * {@link GuardrailTester#assertValid(String)} to confirm the query as structured is valid given the state of the db + * {@link GuardrailTester#assertWarns(String, String)} to confirm a query succeeds but emits the warning text provided + * {@link GuardrailTester#assertFails(String, String)} to confirm a query fails with the message provided + * {@link GuardrailTester#testExcludedUsers} to confirm superusers are excluded from application of the guardrail + */ public abstract class GuardrailTester extends CQLTester { // Name used when testing CREATE TABLE that should be aborted (we need to provide it as assertFails, which @@ -318,6 +330,10 @@ protected void assertFails(CheckedFunction function, List messages, List assertFails(function, true, messages, redactedMessages); } + /** + * Unlike {@link CQLTester#assertInvalidThrowMessage}, the chain of methods ending here in {@link GuardrailTester} + * respects the input ClientState so guardrail permissions will be correctly checked. + */ protected void assertFails(CheckedFunction function, boolean thrown, List messages, List redactedMessages) throws Throwable { ClientWarn.instance.captureWarnings(); @@ -478,6 +494,10 @@ protected ResultMessage execute(ClientState state, String query, ConsistencyLeve return execute(state, query, options); } + /** + * Performs execution of the query using the input {@link ClientState} (i.e. unlike {@link ClientState#forInternalCalls()} + * which may not) to ensure guardrails are appropriately applied to the query provided.
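+ *
+ * Sketch of the intended call shape (the state variable is an assumption for
+ * illustration; any non-internal {@link ClientState} behaves this way):
+ * <pre>{@code
+ * // guardrails and permissions apply because this state is not "internal"
+ * execute(userClientState, "DROP KEYSPACE dkdt", QueryOptions.DEFAULT);
+ * }</pre>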
+ */ protected ResultMessage execute(ClientState state, String query, QueryOptions options) { QueryState queryState = new QueryState(state); From e0a6b83a02804bf976fdc43718001f23818ee53d Mon Sep 17 00:00:00 2001 From: David Capwell Date: Mon, 25 Jul 2022 12:26:35 -0700 Subject: [PATCH 026/159] When bootstrap fails, CassandraRoleManager may attempt to do read queries that fail with "Cannot read from a bootstrapping node", and increments unavailables counters patch by David Capwell; reviewed by Sam Tunnicliffe for CASSANDRA-17754 --- CHANGES.txt | 1 + .../cassandra/auth/CassandraRoleManager.java | 7 + .../cassandra/service/StorageProxy.java | 19 ++- .../distributed/shared/ClusterUtils.java | 46 ++++++ .../hostreplacement/FailedBootstrapTest.java | 138 ++++++++++++++++++ .../hostreplacement/HostReplacementTest.java | 3 + 6 files changed, 212 insertions(+), 2 deletions(-) create mode 100644 test/distributed/org/apache/cassandra/distributed/test/hostreplacement/FailedBootstrapTest.java diff --git a/CHANGES.txt b/CHANGES.txt index 33753cb531bc..63e8fdd32877 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -1,4 +1,5 @@ 4.2 + * When bootstrap fails, CassandraRoleManager may attempt to do read queries that fail with "Cannot read from a bootstrapping node", and increments unavailables counters (CASSANDRA-17754) * Add guardrail to disallow DROP KEYSPACE commands (CASSANDRA-17767) * Remove ephemeral snapshot marker file and introduce a flag to SnapshotManifest (CASSANDRA-16911) * Add a virtual table that exposes currently running queries (CASSANDRA-15241) diff --git a/src/java/org/apache/cassandra/auth/CassandraRoleManager.java b/src/java/org/apache/cassandra/auth/CassandraRoleManager.java index 0344de921db0..c2272707ecd2 100644 --- a/src/java/org/apache/cassandra/auth/CassandraRoleManager.java +++ b/src/java/org/apache/cassandra/auth/CassandraRoleManager.java @@ -43,6 +43,7 @@ import org.apache.cassandra.db.marshal.UTF8Type; import org.apache.cassandra.exceptions.*; import org.apache.cassandra.service.ClientState; +import org.apache.cassandra.service.StorageProxy; import org.apache.cassandra.service.StorageService; import org.apache.cassandra.transport.messages.ResultMessage; import org.apache.cassandra.utils.ByteBufferUtil; @@ -386,6 +387,12 @@ protected void scheduleSetupTask(final Callable setupTask) { // The delay is to give the node a chance to see its peers before attempting the operation ScheduledExecutors.optionalTasks.scheduleSelfRecurring(() -> { + if (!StorageProxy.isSafeToPerformRead()) + { + logger.trace("Setup task may not run due to it not being safe to perform reads... 
rescheduling"); + scheduleSetupTask(setupTask); + return; + } try { setupTask.call(); diff --git a/src/java/org/apache/cassandra/service/StorageProxy.java b/src/java/org/apache/cassandra/service/StorageProxy.java index 557382df5488..e89bdae717d0 100644 --- a/src/java/org/apache/cassandra/service/StorageProxy.java +++ b/src/java/org/apache/cassandra/service/StorageProxy.java @@ -1823,7 +1823,7 @@ public static RowIterator readOne(SinglePartitionReadCommand command, Consistenc public static PartitionIterator read(SinglePartitionReadCommand.Group group, ConsistencyLevel consistencyLevel, long queryStartNanoTime) throws UnavailableException, IsBootstrappingException, ReadFailureException, ReadTimeoutException, InvalidRequestException { - if (StorageService.instance.isBootstrapMode() && !systemKeyspaceQuery(group.queries)) + if (!isSafeToPerformRead(group.queries)) { readMetrics.unavailables.mark(); readMetricsForLevel(consistencyLevel).unavailables.mark(); @@ -1850,6 +1850,16 @@ public static PartitionIterator read(SinglePartitionReadCommand.Group group, Con : readRegular(group, consistencyLevel, queryStartNanoTime); } + public static boolean isSafeToPerformRead(List queries) + { + return isSafeToPerformRead() || systemKeyspaceQuery(queries); + } + + public static boolean isSafeToPerformRead() + { + return !StorageService.instance.isBootstrapMode(); + } + private static PartitionIterator readWithPaxos(SinglePartitionReadCommand.Group group, ConsistencyLevel consistencyLevel, long queryStartNanoTime) throws InvalidRequestException, UnavailableException, ReadFailureException, ReadTimeoutException { @@ -2619,8 +2629,13 @@ public long startTimeNanos() public static void logRequestException(Exception exception, Collection commands) { + // Multiple different types of errors can happen, so by dedupping on the error type we can see each error + // case rather than just exposing the first error seen; this should make sure more rare issues are exposed + // rather than being hidden by more common errors such as timeout or unavailable + // see CASSANDRA-17754 + String msg = exception.getClass().getSimpleName() + " \"{}\" while executing {}"; NoSpamLogger.log(logger, NoSpamLogger.Level.INFO, FAILURE_LOGGING_INTERVAL_SECONDS, TimeUnit.SECONDS, - "\"{}\" while executing {}", + msg, () -> new Object[] { exception.getMessage(), diff --git a/test/distributed/org/apache/cassandra/distributed/shared/ClusterUtils.java b/test/distributed/org/apache/cassandra/distributed/shared/ClusterUtils.java index dc280f3085e2..d848d201dc85 100644 --- a/test/distributed/org/apache/cassandra/distributed/shared/ClusterUtils.java +++ b/test/distributed/org/apache/cassandra/distributed/shared/ClusterUtils.java @@ -30,6 +30,7 @@ import java.util.Objects; import java.util.Optional; import java.util.Set; +import java.util.UUID; import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeoutException; import java.util.function.BiConsumer; @@ -40,6 +41,10 @@ import java.util.stream.Collectors; import com.google.common.util.concurrent.Futures; + +import org.apache.cassandra.distributed.api.Feature; +import org.apache.cassandra.gms.ApplicationState; +import org.apache.cassandra.gms.VersionedValue; import org.apache.cassandra.io.util.File; import org.junit.Assert; @@ -554,6 +559,47 @@ public static Map> awaitGossipStatus(IInstance insta }); } + public static void awaitGossipSchemaMatch(ICluster cluster) + { + cluster.forEach(ClusterUtils::awaitGossipSchemaMatch); + } + + public static void awaitGossipSchemaMatch(IInstance 
instance) + { + if (!instance.config().has(Feature.GOSSIP)) + { + // when gossip isn't enabled, don't bother waiting on gossip to settle... + return; + } + awaitGossip(instance, "Schema IDs did not match", all -> { + String current = null; + for (Map.Entry> e : all.entrySet()) + { + Map state = e.getValue(); + // has the instance joined? + String status = state.get(ApplicationState.STATUS_WITH_PORT.name()); + if (status == null) + status = state.get(ApplicationState.STATUS.name()); + if (status == null || !status.contains(VersionedValue.STATUS_NORMAL)) + continue; // ignore instances not joined yet + String schema = state.get("SCHEMA"); + if (schema == null) + throw new AssertionError("Unable to find schema for " + e.getKey() + "; status was " + status); + schema = schema.split(":")[1]; + + if (current == null) + { + current = schema; + } + else if (!current.equals(schema)) + { + return false; + } + } + return true; + }); + } + /** * Get the gossip information from the node. Currently only address, generation, and heartbeat are returned * diff --git a/test/distributed/org/apache/cassandra/distributed/test/hostreplacement/FailedBootstrapTest.java new file mode 100644 index 000000000000..56de09284466 --- /dev/null +++ b/test/distributed/org/apache/cassandra/distributed/test/hostreplacement/FailedBootstrapTest.java @@ -0,0 +1,138 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ + +package org.apache.cassandra.distributed.test.hostreplacement; + +import java.io.IOException; +import java.lang.reflect.InvocationTargetException; +import java.lang.reflect.Method; +import java.util.Arrays; +import java.util.List; +import java.util.concurrent.Callable; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicInteger; + +import org.junit.Test; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import net.bytebuddy.ByteBuddy; +import net.bytebuddy.dynamic.loading.ClassLoadingStrategy; +import net.bytebuddy.implementation.MethodDelegation; +import net.bytebuddy.implementation.bind.annotation.SuperCall; +import net.bytebuddy.implementation.bind.annotation.This; +import org.apache.cassandra.auth.CassandraRoleManager; +import org.apache.cassandra.distributed.Cluster; +import org.apache.cassandra.distributed.api.Feature; +import org.apache.cassandra.distributed.api.IInvokableInstance; +import org.apache.cassandra.distributed.api.NodeToolResult; +import org.apache.cassandra.distributed.api.TokenSupplier; +import org.apache.cassandra.distributed.test.TestBaseImpl; +import org.apache.cassandra.metrics.ClientRequestsMetricsHolder; +import org.apache.cassandra.streaming.StreamException; +import org.apache.cassandra.streaming.StreamResultFuture; +import org.assertj.core.api.Assertions; +import org.awaitility.Awaitility; + +import static net.bytebuddy.matcher.ElementMatchers.named; +import static org.apache.cassandra.distributed.shared.ClusterUtils.replaceHostAndStart; +import static org.apache.cassandra.distributed.shared.ClusterUtils.stopUnchecked; +import static org.apache.cassandra.distributed.test.hostreplacement.HostReplacementTest.setupCluster; + +public class FailedBootstrapTest extends TestBaseImpl +{ + private static final Logger logger = LoggerFactory.getLogger(FailedBootstrapTest.class); + + private static final int NODE_TO_REMOVE = 2; + + @Test + public void roleSetupDoesNotProduceUnavailables() throws IOException + { + Cluster.Builder builder = Cluster.build(3) + .withConfig(c -> c.with(Feature.values())) + .withInstanceInitializer(BB::install); + TokenSupplier even = TokenSupplier.evenlyDistributedTokens(3, builder.getTokenCount()); + builder = builder.withTokenSupplier((TokenSupplier) node -> even.tokens(node == 4 ? NODE_TO_REMOVE : node)); + try (Cluster cluster = builder.start()) + { + List alive = Arrays.asList(cluster.get(1), cluster.get(3)); + IInvokableInstance nodeToRemove = cluster.get(NODE_TO_REMOVE); + + setupCluster(cluster); + + stopUnchecked(nodeToRemove); + + // should fail to join, but should start up! + IInvokableInstance added = replaceHostAndStart(cluster, nodeToRemove, p -> p.setProperty("cassandra.superuser_setup_delay_ms", "1")); + // log gossip for debugging + alive.forEach(i -> { + NodeToolResult result = i.nodetoolResult("gossipinfo"); + result.asserts().success(); + logger.info("gossipinfo for node{}\n{}", i.config().num(), result.getStdout()); + }); + + // CassandraRoleManager attempted to do distributed reads while bootstrap was still going (it failed, so still in bootstrap mode) + // so we need to validate that this is no longer happening and that we are not incrementing org.apache.cassandra.metrics.ClientRequestMetrics.unavailables + // sleep longer than multiple retry attempts... + Awaitility.await() + .atMost(1, TimeUnit.MINUTES) + .until(() -> added.callOnInstance(() -> BB.SETUP_SCHEDULE_COUNTER.get()) >= 42); // why 42?
just need something large enough to make sure multiple attempts happened + + // do any read metrics have unavailables? + added.runOnInstance(() -> { + Assertions.assertThat(ClientRequestsMetricsHolder.readMetrics.unavailables.getCount()).describedAs("read unavailables").isEqualTo(0); + Assertions.assertThat(ClientRequestsMetricsHolder.casReadMetrics.unavailables.getCount()).describedAs("CAS read unavailables").isEqualTo(0); + }); + } + } + + public static class BB + { + public static void install(ClassLoader classLoader, Integer num) + { + if (num != 4) + return; + + new ByteBuddy().rebase(StreamResultFuture.class) + .method(named("maybeComplete")) + .intercept(MethodDelegation.to(BB.class)) + .make() + .load(classLoader, ClassLoadingStrategy.Default.INJECTION); + + new ByteBuddy().rebase(CassandraRoleManager.class) + .method(named("scheduleSetupTask")) + .intercept(MethodDelegation.to(BB.class)) + .make() + .load(classLoader, ClassLoadingStrategy.Default.INJECTION); + } + + public static void maybeComplete(@This StreamResultFuture future) throws NoSuchMethodException, InvocationTargetException, IllegalAccessException + { + Method method = future.getClass().getSuperclass().getSuperclass().getDeclaredMethod("tryFailure", Throwable.class); + method.setAccessible(true); + method.invoke(future, new StreamException(future.getCurrentState(), "Stream failed")); + } + + private static final AtomicInteger SETUP_SCHEDULE_COUNTER = new AtomicInteger(0); + public static void scheduleSetupTask(final Callable setupTask, @SuperCall Runnable fn) + { + SETUP_SCHEDULE_COUNTER.incrementAndGet(); + fn.run(); + } + } +} diff --git a/test/distributed/org/apache/cassandra/distributed/test/hostreplacement/HostReplacementTest.java b/test/distributed/org/apache/cassandra/distributed/test/hostreplacement/HostReplacementTest.java index 3de0bf51d5e8..8219d43ad1f0 100644 --- a/test/distributed/org/apache/cassandra/distributed/test/hostreplacement/HostReplacementTest.java +++ b/test/distributed/org/apache/cassandra/distributed/test/hostreplacement/HostReplacementTest.java @@ -35,6 +35,7 @@ import org.apache.cassandra.distributed.api.SimpleQueryResult; import org.apache.cassandra.distributed.api.TokenSupplier; import org.apache.cassandra.distributed.shared.AssertUtils; +import org.apache.cassandra.distributed.shared.ClusterUtils; import org.apache.cassandra.distributed.test.TestBaseImpl; import org.assertj.core.api.Assertions; @@ -210,6 +211,8 @@ static void setupCluster(Cluster cluster) fixDistributedSchemas(cluster); init(cluster); + ClusterUtils.awaitGossipSchemaMatch(cluster); + populate(cluster); cluster.forEach(i -> i.flush(KEYSPACE)); } From 1f067051537618804ca9d0c66b903be74b942b22 Mon Sep 17 00:00:00 2001 From: David Capwell Date: Tue, 26 Jul 2022 11:36:01 -0700 Subject: [PATCH 027/159] When doing a host replacement, -Dcassandra.broadcast_interval_ms is used to know when to check the ring but checks that the ring wasn't changed in -Dcassandra.ring_delay_ms, changes to ring delay should not depend on when we publish load stats patch by David Capwell; reviewed by Brandon Williams, Caleb Rackliffe for CASSANDRA-17776 --- CHANGES.txt | 1 + .../org/apache/cassandra/service/StorageService.java | 9 ++++++++- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/CHANGES.txt b/CHANGES.txt index 63e8fdd32877..554b6262adc3 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -1,4 +1,5 @@ 4.2 + * When doing a host replacement, -Dcassandra.broadcast_interval_ms is used to know when to check the ring but checks that
the ring wasn't changed in -Dcassandra.ring_delay_ms, changes to ring delay should not depend on when we publish load stats (CASSANDRA-17776) * When bootstrap fails, CassandraRoleManager may attempt to do read queries that fail with "Cannot read from a bootstrapping node", and increments unavailables counters (CASSANDRA-17754) * Add guardrail to disallow DROP KEYSPACE commands (CASSANDRA-17767) * Remove ephemeral snapshot marker file and introduce a flag to SnapshotManifest (CASSANDRA-16911) diff --git a/src/java/org/apache/cassandra/service/StorageService.java b/src/java/org/apache/cassandra/service/StorageService.java index 957daf395a66..ee79b3398c4a 100644 --- a/src/java/org/apache/cassandra/service/StorageService.java +++ b/src/java/org/apache/cassandra/service/StorageService.java @@ -1776,11 +1776,18 @@ public Collection prepareForBootstrap(long schemaTimeoutMill { if (!isReplacingSameAddress()) { + // Historically BROADCAST_INTERVAL was used, but this is unrelated to ring_delay, so using it to know + // how long to sleep only works with the default settings (ring_delay=30s, broadcast=60s). For users + // who are aware of this relationship, this coupling should not be broken, but for most users this + // relationship isn't known and instead we should rely on the ring_delay. + // See CASSANDRA-17776 + long sleepDelayMillis = Math.max(LoadBroadcaster.BROADCAST_INTERVAL, ringTimeoutMillis * 2); try { // Sleep additionally to make sure that the server actually is not alive // and giving it more time to gossip if alive. - Thread.sleep(LoadBroadcaster.BROADCAST_INTERVAL); + logger.info("Sleeping for {}ms waiting to make sure no new gossip updates happen for {}", sleepDelayMillis, DatabaseDescriptor.getReplaceAddress()); + Thread.sleep(sleepDelayMillis); } catch (InterruptedException e) { From 0daf21244fc0187d092616834d38df1a77dcabf0 Mon Sep 17 00:00:00 2001 From: Josh McKenzie Date: Mon, 25 Jul 2022 11:58:42 -0400 Subject: [PATCH 028/159] Add JMX call to getSSTableCountPerTWCSBucket for TWCS Patch by Stefan Podkowinski; reviewed by Caleb Rackliffe and Marcus Eriksson for CASSANDRA-17774 Co-authored-by: Stefan Podkowinski Co-authored-by: Josh McKenzie --- CHANGES.txt | 1 + .../cassandra/db/ColumnFamilyStore.java | 18 ++++++++- .../cassandra/db/ColumnFamilyStoreMBean.java | 6 +++ .../compaction/CompactionStrategyManager.java | 33 +++++++++++++++ .../TimeWindowCompactionStrategy.java | 10 +++++ .../tools/nodetool/stats/StatsTable.java | 1 + .../nodetool/stats/TableStatsHolder.java | 1 + .../CompactionStrategyManagerTest.java | 40 ++++++++++++++++++- 8 files changed, 107 insertions(+), 3 deletions(-) diff --git a/CHANGES.txt b/CHANGES.txt index 7c9137f824fb..bdeef8172dff 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -1,4 +1,5 @@ 4.2 + * Added JMX call to getSSTableCountPerTWCSBucket for TWCS (CASSANDRA-17774) * When doing a host replacement, -Dcassandra.broadcast_interval_ms is used to know when to check the ring but checks that the ring wasn't changed in -Dcassandra.ring_delay_ms, changes to ring delay should not depend on when we publish load stats (CASSANDRA-17776) * When bootstrap fails, CassandraRoleManager may attempt to do read queries that fail with "Cannot read from a bootstrapping node", and increments unavailables counters (CASSANDRA-17754) * Add guardrail to disallow DROP KEYSPACE commands (CASSANDRA-17767) diff --git a/src/java/org/apache/cassandra/db/ColumnFamilyStore.java b/src/java/org/apache/cassandra/db/ColumnFamilyStore.java index 94ca18084dfe..a1a5ce4df003 100644 --- 
a/src/java/org/apache/cassandra/db/ColumnFamilyStore.java +++ b/src/java/org/apache/cassandra/db/ColumnFamilyStore.java @@ -22,7 +22,6 @@ import java.lang.reflect.Constructor; import java.lang.reflect.InvocationTargetException; import java.nio.ByteBuffer; -import java.nio.file.Path; import java.time.Instant; import java.util.ArrayList; import java.util.Arrays; @@ -609,6 +608,7 @@ public Directories getDirectories() return directories; } + @Override public List getDataPaths() throws IOException { List dataPaths = new ArrayList<>(); @@ -1925,12 +1925,14 @@ public List getSSTablesForKey(String key, boolean hexFormat) } } + @Override public void beginLocalSampling(String sampler, int capacity, int durationMillis) { metric.samplers.get(SamplerType.valueOf(sampler)).beginSampling(capacity, durationMillis); } @SuppressWarnings({ "rawtypes", "unchecked" }) + @Override public List finishLocalSampling(String sampler, int count) throws OpenDataException { Sampler samplerImpl = metric.samplers.get(SamplerType.valueOf(sampler)); @@ -1949,11 +1951,13 @@ public List finishLocalSampling(String sampler, int count) throws return result; } + @Override public boolean isCompactionDiskSpaceCheckEnabled() { return compactionSpaceCheck; } + @Override public void compactionDiskSpaceCheck(boolean enable) { compactionSpaceCheck = enable; @@ -2964,21 +2968,31 @@ public List getBuiltIndexes() return indexManager.getBuiltIndexNames(); } + @Override public int getUnleveledSSTables() { return compactionStrategyManager.getUnleveledSSTables(); } + @Override public int[] getSSTableCountPerLevel() { return compactionStrategyManager.getSSTableCountPerLevel(); } + @Override public long[] getPerLevelSizeBytes() { return compactionStrategyManager.getPerLevelSizeBytes(); } + @Override + public int[] getSSTableCountPerTWCSBucket() + { + return compactionStrategyManager.getSSTableCountPerTWCSBucket(); + } + + @Override public int getLevelFanoutSize() { return compactionStrategyManager.getLevelFanoutSize(); @@ -3074,6 +3088,7 @@ public void discardSSTables(long truncatedAt) } } + @Override public double getDroppableTombstoneRatio() { double allDroppable = 0; @@ -3088,6 +3103,7 @@ public double getDroppableTombstoneRatio() return allColumns > 0 ? allDroppable / allColumns : 0; } + @Override public long trueSnapshotsSize() { return getDirectories().trueSnapshotsSize(); diff --git a/src/java/org/apache/cassandra/db/ColumnFamilyStoreMBean.java b/src/java/org/apache/cassandra/db/ColumnFamilyStoreMBean.java index 5b6fd16fe101..d6740112319f 100644 --- a/src/java/org/apache/cassandra/db/ColumnFamilyStoreMBean.java +++ b/src/java/org/apache/cassandra/db/ColumnFamilyStoreMBean.java @@ -217,6 +217,12 @@ public List importNewSSTables(Set srcPaths, */ public long[] getPerLevelSizeBytes(); + /** + * @return sstable count for each bucket in TWCS. null unless time window compaction is used. + * array index corresponds to bucket(int[0] is for most recent, ...). + */ + public int[] getSSTableCountPerTWCSBucket(); + /** * @return sstable fanout size for level compaction strategy. 
*/ diff --git a/src/java/org/apache/cassandra/db/compaction/CompactionStrategyManager.java b/src/java/org/apache/cassandra/db/compaction/CompactionStrategyManager.java index 65359b1d88f5..ca67ddb0ea6f 100644 --- a/src/java/org/apache/cassandra/db/compaction/CompactionStrategyManager.java +++ b/src/java/org/apache/cassandra/db/compaction/CompactionStrategyManager.java @@ -23,14 +23,19 @@ import java.util.Arrays; import java.util.Collection; import java.util.Collections; +import java.util.Comparator; import java.util.ConcurrentModificationException; import java.util.HashSet; import java.util.List; +import java.util.Map; import java.util.Objects; import java.util.Set; +import java.util.TreeMap; import java.util.concurrent.locks.ReentrantReadWriteLock; import java.util.function.Supplier; import java.util.stream.Collectors; +import java.util.stream.Stream; +import java.util.stream.StreamSupport; import com.google.common.annotations.VisibleForTesting; import com.google.common.collect.ImmutableList; @@ -141,6 +146,8 @@ should update the compaction strategy in maybeReload() due to an ALTER. private volatile long maxSSTableSizeBytes; private volatile String name; + public static int TWCS_BUCKET_COUNT_MAX = 128; + public CompactionStrategyManager(ColumnFamilyStore cfs) { this(cfs, cfs::getDiskBoundaries, cfs.getPartitioner().splitter().isPresent()); @@ -610,6 +617,32 @@ public long[] getPerLevelSizeBytes() } } + public int[] getSSTableCountPerTWCSBucket() + { + readLock.lock(); + try + { + List> countsByBucket = Stream.concat( + StreamSupport.stream(repaired.allStrategies().spliterator(), false), + StreamSupport.stream(unrepaired.allStrategies().spliterator(), false)) + .filter((TimeWindowCompactionStrategy.class)::isInstance) + .map(s -> ((TimeWindowCompactionStrategy)s).getSSTableCountByBuckets()) + .collect(Collectors.toList()); + return countsByBucket.isEmpty() ? 
null : sumCountsByBucket(countsByBucket, TWCS_BUCKET_COUNT_MAX); + } + finally + { + readLock.unlock(); + } + } + + static int[] sumCountsByBucket(List> countsByBucket, int max) + { + TreeMap merged = new TreeMap<>(Comparator.reverseOrder()); + countsByBucket.stream().flatMap(e -> e.entrySet().stream()).forEach(e -> merged.merge(e.getKey(), e.getValue(), Integer::sum)); + return merged.values().stream().limit(max).mapToInt(i -> i).toArray(); + } + static int[] sumArrays(int[] a, int[] b) { int[] res = new int[Math.max(a.length, b.length)]; diff --git a/src/java/org/apache/cassandra/db/compaction/TimeWindowCompactionStrategy.java b/src/java/org/apache/cassandra/db/compaction/TimeWindowCompactionStrategy.java index d3b30210d490..9b9b82c82ff3 100644 --- a/src/java/org/apache/cassandra/db/compaction/TimeWindowCompactionStrategy.java +++ b/src/java/org/apache/cassandra/db/compaction/TimeWindowCompactionStrategy.java @@ -30,6 +30,8 @@ import java.util.List; import java.util.Map; import java.util.Set; +import java.util.function.Function; +import java.util.stream.Collectors; import com.google.common.annotations.VisibleForTesting; import com.google.common.collect.*; @@ -57,6 +59,9 @@ public class TimeWindowCompactionStrategy extends AbstractCompactionStrategy private long lastExpiredCheck; private long highestWindowSeen; + // This is accessed in both the threading context of compaction / repair and also JMX + private volatile Map sstableCountByBuckets = Collections.emptyMap(); + public TimeWindowCompactionStrategy(ColumnFamilyStore cfs, Map options) { super(cfs, options); @@ -179,6 +184,7 @@ private List getCompactionCandidates(Iterable cand this.highestWindowSeen); this.estimatedRemainingTasks = mostInteresting.estimatedRemainingTasks; + this.sstableCountByBuckets = buckets.left.keySet().stream().collect(Collectors.toMap(Function.identity(), k -> buckets.left.get(k).size())); if (!mostInteresting.sstables.isEmpty()) return mostInteresting.sstables; return null; @@ -412,6 +418,10 @@ public long getMaxSSTableBytes() return Long.MAX_VALUE; } + public Map getSSTableCountByBuckets() + { + return sstableCountByBuckets; + } public static Map validateOptions(Map options) throws ConfigurationException { diff --git a/src/java/org/apache/cassandra/tools/nodetool/stats/StatsTable.java b/src/java/org/apache/cassandra/tools/nodetool/stats/StatsTable.java index 8b5090bd86ad..48b795b422b8 100644 --- a/src/java/org/apache/cassandra/tools/nodetool/stats/StatsTable.java +++ b/src/java/org/apache/cassandra/tools/nodetool/stats/StatsTable.java @@ -71,6 +71,7 @@ public class StatsTable public String droppedMutations; public List sstablesInEachLevel = new ArrayList<>(); public List sstableBytesInEachLevel = new ArrayList<>(); + public int[] sstableCountPerTWCSBucket = null; public Boolean isInCorrectLocation = null; // null: option not active public double droppableTombstoneRatio; public Map topSizePartitions; diff --git a/src/java/org/apache/cassandra/tools/nodetool/stats/TableStatsHolder.java b/src/java/org/apache/cassandra/tools/nodetool/stats/TableStatsHolder.java index c6b2301c2c0d..fc6ef145d67e 100644 --- a/src/java/org/apache/cassandra/tools/nodetool/stats/TableStatsHolder.java +++ b/src/java/org/apache/cassandra/tools/nodetool/stats/TableStatsHolder.java @@ -241,6 +241,7 @@ private void initializeKeyspaces(NodeProbe probe, boolean ignore, List t statsTable.sstablesInEachLevel.add(count + ((count > maxCount) ? 
"/" + maxCount : "")); } } + statsTable.sstableCountPerTWCSBucket = table.getSSTableCountPerTWCSBucket(); long[] leveledSSTablesBytes = table.getPerLevelSizeBytes(); if (leveledSSTablesBytes != null) diff --git a/test/unit/org/apache/cassandra/db/compaction/CompactionStrategyManagerTest.java b/test/unit/org/apache/cassandra/db/compaction/CompactionStrategyManagerTest.java index 9960e8e0fe0d..8a9f2fb22594 100644 --- a/test/unit/org/apache/cassandra/db/compaction/CompactionStrategyManagerTest.java +++ b/test/unit/org/apache/cassandra/db/compaction/CompactionStrategyManagerTest.java @@ -28,11 +28,14 @@ import java.util.concurrent.atomic.AtomicInteger; import java.util.stream.Collectors; +import com.google.common.collect.ImmutableList; +import com.google.common.collect.ImmutableMap; import com.google.common.collect.Iterables; import com.google.common.collect.Lists; import com.google.common.collect.Sets; import com.google.common.io.Files; import org.junit.AfterClass; +import org.junit.Assert; import org.junit.Before; import org.junit.BeforeClass; import org.junit.Test; @@ -395,6 +398,41 @@ else if (sstable.isPendingRepair()) } } + @Test + public void testCountsByBuckets() + { + Assert.assertArrayEquals( + new int[] {2, 2, 4}, + CompactionStrategyManager.sumCountsByBucket(ImmutableList.of( + ImmutableMap.of(60000L, 1, 0L, 2, 180000L, 1), + ImmutableMap.of(60000L, 1, 0L, 2, 180000L, 1)), CompactionStrategyManager.TWCS_BUCKET_COUNT_MAX)); + Assert.assertArrayEquals( + new int[] {1, 1, 3}, + CompactionStrategyManager.sumCountsByBucket(ImmutableList.of( + ImmutableMap.of(60000L, 1, 0L, 1), + ImmutableMap.of(0L, 2, 180000L, 1)), CompactionStrategyManager.TWCS_BUCKET_COUNT_MAX)); + Assert.assertArrayEquals( + new int[] {1, 1}, + CompactionStrategyManager.sumCountsByBucket(ImmutableList.of( + ImmutableMap.of(60000L, 1, 0L, 1), + ImmutableMap.of()), CompactionStrategyManager.TWCS_BUCKET_COUNT_MAX)); + Assert.assertArrayEquals( + new int[] {8, 4}, + CompactionStrategyManager.sumCountsByBucket(ImmutableList.of( + ImmutableMap.of(60000L, 2, 0L, 1, 180000L, 4), + ImmutableMap.of(60000L, 2, 0L, 1, 180000L, 4)), 2)); + Assert.assertArrayEquals( + new int[] {1, 1, 2}, + CompactionStrategyManager.sumCountsByBucket(ImmutableList.of( + Collections.emptyMap(), + ImmutableMap.of(60000L, 1, 0L, 2, 180000L, 1)), CompactionStrategyManager.TWCS_BUCKET_COUNT_MAX)); + Assert.assertArrayEquals( + new int[] {}, + CompactionStrategyManager.sumCountsByBucket(ImmutableList.of( + Collections.emptyMap(), + Collections.emptyMap()), CompactionStrategyManager.TWCS_BUCKET_COUNT_MAX)); + } + private MockCFS createJBODMockCFS(int disks) { // Create #disks data directories to simulate JBOD @@ -464,8 +502,6 @@ private int getSSTableIndex(Integer[] boundaries, SSTableReader reader) return index; } - - class MockBoundaryManager { private final ColumnFamilyStore cfs; From 652e49a8bb2f8849ce0ebb5161c30e3889c14608 Mon Sep 17 00:00:00 2001 From: Josh McKenzie Date: Wed, 27 Jul 2022 12:57:28 -0400 Subject: [PATCH 029/159] Add separate thread pool for Secondary Index building so it doesn't block compactions Patch by Chris Lohfink; reviewed by Caleb Rackliffe, Josh McKenzie, Sam Tunnicliffe, and Marcus Eriksson for CASSANDRA-17781 Co-authored-by: Chris Lohfink Co-authored-by: Josh McKenzie --- CHANGES.txt | 1 + .../org/apache/cassandra/config/Config.java | 3 +++ .../cassandra/config/DatabaseDescriptor.java | 5 +++++ .../db/compaction/CompactionManager.java | 21 ++++++++++++++++--- 4 files changed, 27 insertions(+), 3 deletions(-) diff --git 
a/CHANGES.txt b/CHANGES.txt index 105522af5b57..4dda88e31467 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -1,4 +1,5 @@ 4.2 + * Add separate thread pool for Secondary Index building so it doesn't block compactions (CASSANDRA-17781) * Added JMX call to getSSTableCountPerTWCSBucket for TWCS (CASSANDRA-17774) * When doing a host replacement, -Dcassandra.broadcast_interval_ms is used to know when to check the ring but checks that the ring wasn't changed in -Dcassandra.ring_delay_ms, changes to ring delay should not depend on when we publish load stats (CASSANDRA-17776) * When bootstrap fails, CassandraRoleManager may attempt to do read queries that fail with "Cannot read from a bootstrapping node", and increments unavailables counters (CASSANDRA-17754) diff --git a/src/java/org/apache/cassandra/config/Config.java b/src/java/org/apache/cassandra/config/Config.java index a17c3591ce60..f7eabff8ded8 100644 --- a/src/java/org/apache/cassandra/config/Config.java +++ b/src/java/org/apache/cassandra/config/Config.java @@ -327,6 +327,9 @@ public MemtableOptions() public volatile int concurrent_materialized_view_builders = 1; public volatile int reject_repair_compaction_threshold = Integer.MAX_VALUE; + // The number of executors to use for building secondary indexes + public int concurrent_index_builders = 2; + /** * @deprecated retry support removed on CASSANDRA-10992 */ diff --git a/src/java/org/apache/cassandra/config/DatabaseDescriptor.java b/src/java/org/apache/cassandra/config/DatabaseDescriptor.java index b60321e13135..0af1ef808fff 100644 --- a/src/java/org/apache/cassandra/config/DatabaseDescriptor.java +++ b/src/java/org/apache/cassandra/config/DatabaseDescriptor.java @@ -2033,6 +2033,11 @@ public static int getConcurrentValidations() return conf.concurrent_validations; } + public static int getConcurrentIndexBuilders() + { + return conf.concurrent_index_builders; + } + public static void setConcurrentValidations(int value) { value = value > 0 ? value : Integer.MAX_VALUE; diff --git a/src/java/org/apache/cassandra/db/compaction/CompactionManager.java b/src/java/org/apache/cassandra/db/compaction/CompactionManager.java index 925d900ada83..49b999e4dbd7 100644 --- a/src/java/org/apache/cassandra/db/compaction/CompactionManager.java +++ b/src/java/org/apache/cassandra/db/compaction/CompactionManager.java @@ -129,7 +129,13 @@ public class CompactionManager implements CompactionManagerMBean private final CompactionExecutor cacheCleanupExecutor = new CacheCleanupExecutor(); private final CompactionExecutor viewBuildExecutor = new ViewBuildExecutor(); - private final CompactionMetrics metrics = new CompactionMetrics(executor, validationExecutor, viewBuildExecutor); + // We can't house 2i builds in SecondaryIndexManagement because it could cause deadlocks with itself, and can cause + // massive to indefinite pauses if prioritized either before or after normal compactions so we instead put it in its + // own pool to prevent either scenario. 
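+ // (Sizing note, an assumption sketched from this patch rather than part of it: the
+ // pool width comes from the new concurrent_index_builders setting, e.g.
+ // "concurrent_index_builders: 2" in cassandra.yaml, so long-running 2i builds no
+ // longer queue behind, or ahead of, regular compaction tasks.)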
+ private final SecondaryIndexExecutor secondaryIndexExecutor = new SecondaryIndexExecutor(); + + private final CompactionMetrics metrics = new CompactionMetrics(executor, validationExecutor, viewBuildExecutor, secondaryIndexExecutor); + @VisibleForTesting final Multiset compactingCF = ConcurrentHashMultiset.create(); @@ -244,6 +250,7 @@ public void forceShutdown() validationExecutor.shutdown(); viewBuildExecutor.shutdown(); cacheCleanupExecutor.shutdown(); + secondaryIndexExecutor.shutdown(); // interrupt compactions and validations for (Holder compactionHolder : active.getCompactions()) @@ -254,7 +261,8 @@ public void forceShutdown() // wait for tasks to terminate // compaction tasks are interrupted above, so it shuold be fairy quick // until not interrupted tasks to complete. - for (ExecutorService exec : Arrays.asList(executor, validationExecutor, viewBuildExecutor, cacheCleanupExecutor)) + for (ExecutorService exec : Arrays.asList(executor, validationExecutor, viewBuildExecutor, + cacheCleanupExecutor, secondaryIndexExecutor)) { try { @@ -1772,7 +1780,7 @@ public void run() } }; - return executor.submitIfRunning(runnable, "index build"); + return secondaryIndexExecutor.submitIfRunning(runnable, "index build"); } /** @@ -2015,6 +2023,13 @@ public void incrementSstablesDropppedFromCompactions(long num) metrics.sstablesDropppedFromCompactions.inc(num); } + private static class SecondaryIndexExecutor extends CompactionExecutor + { + public SecondaryIndexExecutor() + { + super(DatabaseDescriptor.getConcurrentIndexBuilders(), "SecondaryIndexExecutor", Integer.MAX_VALUE); + } + } public List> getCompactions() { From e36aeb49e008568a2f551bb749bbb55aeaa80a72 Mon Sep 17 00:00:00 2001 From: Josh McKenzie Date: Mon, 1 Aug 2022 14:37:26 -0400 Subject: [PATCH 030/159] Log duplicate rows found during nodetool verify and scrub Patch by Marcus Eriksson; reviewed by Josh McKenzie for CASSANDRA-17789 Co-authored-by: Marcus Eriksson Co-authored-by: Josh McKenzie --- CHANGES.txt | 1 + .../cassandra/db/compaction/Scrubber.java | 14 ++++- .../cassandra/db/compaction/Verifier.java | 63 ++++++++++++++++++- 3 files changed, 75 insertions(+), 3 deletions(-) diff --git a/CHANGES.txt b/CHANGES.txt index 4dda88e31467..78456e28ad93 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -1,4 +1,5 @@ 4.2 + * Log duplicate rows sharing a partition key found in verify and scrub (CASSANDRA-17789) * Add separate thread pool for Secondary Index building so it doesn't block compactions (CASSANDRA-17781) * Added JMX call to getSSTableCountPerTWCSBucket for TWCS (CASSANDRA-17774) * When doing a host replacement, -Dcassandra.broadcast_interval_ms is used to know when to check the ring but checks that the ring wasn't changed in -Dcassandra.ring_delay_ms, changes to ring delay should not depend on when we publish load stats (CASSANDRA-17776) diff --git a/src/java/org/apache/cassandra/db/compaction/Scrubber.java b/src/java/org/apache/cassandra/db/compaction/Scrubber.java index 5228d2f43fcd..c8518ce5e875 100644 --- a/src/java/org/apache/cassandra/db/compaction/Scrubber.java +++ b/src/java/org/apache/cassandra/db/compaction/Scrubber.java @@ -381,7 +381,7 @@ private boolean tryAppend(DecoratedKey prevKey, DecoratedKey key, SSTableRewrite @SuppressWarnings("resource") private UnfilteredRowIterator getIterator(DecoratedKey key) { - RowMergingSSTableIterator rowMergingIterator = new RowMergingSSTableIterator(SSTableIdentityIterator.create(sstable, dataFile, key)); + RowMergingSSTableIterator rowMergingIterator = new 
RowMergingSSTableIterator(SSTableIdentityIterator.create(sstable, dataFile, key), outputHandler); return reinsertOverflowedTTLRows ? new FixNegativeLocalDeletionTimeIterator(rowMergingIterator, outputHandler, negativeLocalDeletionInfoMetrics) : rowMergingIterator; @@ -565,10 +565,12 @@ public class NegativeLocalDeletionInfoMetrics private static class RowMergingSSTableIterator extends WrappingUnfilteredRowIterator { Unfiltered nextToOffer = null; + private final OutputHandler output; - RowMergingSSTableIterator(UnfilteredRowIterator source) + RowMergingSSTableIterator(UnfilteredRowIterator source, OutputHandler output) { super(source); + this.output = output; } @Override @@ -584,6 +586,7 @@ public Unfiltered next() if (next.isRow()) { + boolean logged = false; while (wrapped.hasNext()) { Unfiltered peek = wrapped.next(); @@ -595,6 +598,13 @@ public Unfiltered next() // Duplicate row, merge it. next = Rows.merge((Row) next, (Row) peek); + + if (!logged) + { + String partitionKey = metadata().partitionKeyType.getString(partitionKey().getKey()); + output.warn("Duplicate row detected in " + metadata().keyspace + '.' + metadata().name + ": " + partitionKey + " " + next.clustering().toString(metadata())); + logged = true; + } } } diff --git a/src/java/org/apache/cassandra/db/compaction/Verifier.java b/src/java/org/apache/cassandra/db/compaction/Verifier.java index 29eb29951b90..bad050a6cc25 100644 --- a/src/java/org/apache/cassandra/db/compaction/Verifier.java +++ b/src/java/org/apache/cassandra/db/compaction/Verifier.java @@ -22,6 +22,9 @@ import com.google.common.collect.ImmutableSet; import org.apache.cassandra.db.*; +import org.apache.cassandra.db.rows.Cell; +import org.apache.cassandra.db.rows.Row; +import org.apache.cassandra.db.rows.Unfiltered; import org.apache.cassandra.db.rows.UnfilteredRowIterator; import org.apache.cassandra.dht.LocalPartitioner; @@ -57,7 +60,9 @@ import java.io.IOException; import java.nio.ByteBuffer; import java.nio.file.Files; +import java.time.Instant; import java.util.*; +import java.util.concurrent.TimeUnit; import java.util.concurrent.locks.Lock; import java.util.concurrent.locks.ReadWriteLock; import java.util.concurrent.locks.ReentrantReadWriteLock; @@ -317,9 +322,40 @@ public void verify() if (key == null || dataSize > dataFile.length()) markAndThrow(new RuntimeException(String.format("key = %s, dataSize=%d, dataFile.length() = %d", key, dataSize, dataFile.length()))); - //mimic the scrub read path, intentionally unused try (UnfilteredRowIterator iterator = SSTableIdentityIterator.create(sstable, dataFile, key)) { + Row first = null; + int duplicateRows = 0; + long minTimestamp = Long.MAX_VALUE; + long maxTimestamp = Long.MIN_VALUE; + while (iterator.hasNext()) + { + Unfiltered uf = iterator.next(); + if (uf.isRow()) + { + Row row = (Row) uf; + if (first != null && first.clustering().equals(row.clustering())) + { + duplicateRows++; + for (Cell cell : row.cells()) + { + maxTimestamp = Math.max(cell.timestamp(), maxTimestamp); + minTimestamp = Math.min(cell.timestamp(), minTimestamp); + } + } + else + { + if (duplicateRows > 0) + logDuplicates(key, first, duplicateRows, minTimestamp, maxTimestamp); + duplicateRows = 0; + first = row; + maxTimestamp = Long.MIN_VALUE; + minTimestamp = Long.MAX_VALUE; + } + } + } + if (duplicateRows > 0) + logDuplicates(key, first, duplicateRows, minTimestamp, maxTimestamp); } if ( (prevKey != null && prevKey.compareTo(key) > 0) || !key.getKey().equals(currentIndexKey) || dataStart != dataStartFromIndex ) @@ -350,6 +386,31 @@ 
public void verify() outputHandler.output("Verify of " + sstable + " succeeded. All " + goodRows + " rows read successfully"); } + private void logDuplicates(DecoratedKey key, Row first, int duplicateRows, long minTimestamp, long maxTimestamp) + { + String keyString = sstable.metadata().partitionKeyType.getString(key.getKey()); + long firstMaxTs = Long.MIN_VALUE; + long firstMinTs = Long.MAX_VALUE; + for (Cell cell : first.cells()) + { + firstMaxTs = Math.max(firstMaxTs, cell.timestamp()); + firstMinTs = Math.min(firstMinTs, cell.timestamp()); + } + outputHandler.output(String.format("%d duplicate rows found for [%s %s] in %s.%s (%s), timestamps: [first row (%s, %s)], [duplicates (%s, %s, eq:%b)]", + duplicateRows, + keyString, first.clustering().toString(sstable.metadata()), + sstable.metadata().keyspace, + sstable.metadata().name, + sstable, + dateString(firstMinTs), dateString(firstMaxTs), + dateString(minTimestamp), dateString(maxTimestamp), minTimestamp == maxTimestamp)); + } + + private String dateString(long time) + { + return Instant.ofEpochMilli(TimeUnit.MICROSECONDS.toMillis(time)).toString(); + } + /** * Use the fact that check(..) is called with sorted tokens - we keep a pointer in to the normalized ranges * and only bump the pointer if the key given is out of range. This is done to avoid calling .contains(..) many From 037149377224c5d6854fa4a0cacf44139273bce3 Mon Sep 17 00:00:00 2001 From: Josh McKenzie Date: Tue, 26 Jul 2022 15:15:48 -0400 Subject: [PATCH 031/159] Warn on unknown directories found in system keyspace directory rather than kill node during startup checks Patch by Jeff Jirsa; reviewed by Josh McKenzie, Sam Tunnicliffe, and Marcus Eriksson for CASSANDRA-17777 Co-authored-by: Jeff Jirsa Co-authored-by: Josh McKenzie --- CHANGES.txt | 1 + .../apache/cassandra/db/SystemKeyspace.java | 9 +++++ .../cassandra/service/StartupCheck.java | 2 +- .../cassandra/service/StartupChecks.java | 34 ++++++++++++++++++- .../paxos/uncommitted/PaxosStateTracker.java | 2 +- .../cassandra/service/StartupChecksTest.java | 11 ++++-- 6 files changed, 53 insertions(+), 6 deletions(-) diff --git a/CHANGES.txt b/CHANGES.txt index 78456e28ad93..c802607f423f 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -1,4 +1,5 @@ 4.2 + * Warn on unknown directories found in system keyspace directory rather than kill node during startup checks (CASSANDRA-17777) * Log duplicate rows sharing a partition key found in verify and scrub (CASSANDRA-17789) * Add separate thread pool for Secondary Index building so it doesn't block compactions (CASSANDRA-17781) * Added JMX call to getSSTableCountPerTWCSBucket for TWCS (CASSANDRA-17774) diff --git a/src/java/org/apache/cassandra/db/SystemKeyspace.java b/src/java/org/apache/cassandra/db/SystemKeyspace.java index a1013e795515..c8c21b5e2011 100644 --- a/src/java/org/apache/cassandra/db/SystemKeyspace.java +++ b/src/java/org/apache/cassandra/db/SystemKeyspace.java @@ -150,6 +150,7 @@ private SystemKeyspace() public static final String BATCHES = "batches"; public static final String PAXOS = "paxos"; public static final String PAXOS_REPAIR_HISTORY = "paxos_repair_history"; + public static final String PAXOS_REPAIR_STATE = "_paxos_repair_state"; public static final String BUILT_INDEXES = "IndexInfo"; public static final String LOCAL = "local"; public static final String PEERS_V2 = "peers_v2"; @@ -185,6 +186,14 @@ private SystemKeyspace() @Deprecated public static final String LEGACY_SIZE_ESTIMATES = "size_estimates"; @Deprecated public static final String 
LEGACY_SSTABLE_ACTIVITY = "sstable_activity"; + // Names of all tables that could have been a part of a system keyspace. Useful for pre-flight checks. + // For details, see CASSANDRA-17777 + public static final Set ALL_TABLE_NAMES = ImmutableSet.of( + BATCHES, PAXOS, PAXOS_REPAIR_HISTORY, PAXOS_REPAIR_STATE, BUILT_INDEXES, LOCAL, PEERS_V2, PEER_EVENTS_V2, + COMPACTION_HISTORY, SSTABLE_ACTIVITY_V2, TABLE_ESTIMATES, TABLE_ESTIMATES_TYPE_PRIMARY, + TABLE_ESTIMATES_TYPE_LOCAL_PRIMARY, AVAILABLE_RANGES_V2, TRANSFERRED_RANGES_V2, VIEW_BUILDS_IN_PROGRESS, + BUILT_VIEWS, PREPARED_STATEMENTS, REPAIRS, TOP_PARTITIONS, LEGACY_PEERS, LEGACY_PEER_EVENTS, + LEGACY_TRANSFERRED_RANGES, LEGACY_AVAILABLE_RANGES, LEGACY_SIZE_ESTIMATES, LEGACY_SSTABLE_ACTIVITY); public static final TableMetadata Batches = parse(BATCHES, diff --git a/src/java/org/apache/cassandra/service/StartupCheck.java b/src/java/org/apache/cassandra/service/StartupCheck.java index 331b381d2f8f..c3790e8e9931 100644 --- a/src/java/org/apache/cassandra/service/StartupCheck.java +++ b/src/java/org/apache/cassandra/service/StartupCheck.java @@ -31,7 +31,7 @@ * misconfiguration of cluster_name in cassandra.yaml. * * The StartupChecks class manages a collection of these tests, which it executes - * right at the beginning of the server settup process. + * right at the beginning of the server setup process. */ public interface StartupCheck { diff --git a/src/java/org/apache/cassandra/service/StartupChecks.java b/src/java/org/apache/cassandra/service/StartupChecks.java index 2ab5381697e9..c313ac8f0cce 100644 --- a/src/java/org/apache/cassandra/service/StartupChecks.java +++ b/src/java/org/apache/cassandra/service/StartupChecks.java @@ -165,7 +165,7 @@ public StartupChecks withTest(StartupCheck test) /** * Run the configured tests and return a report detailing the results. - * @throws org.apache.cassandra.exceptions.StartupException if any test determines that the + * @throws StartupException if any test determines that the * system is not in an valid state to startup * @param options options to pass to respective checks for their configration */ @@ -571,6 +571,38 @@ public FileVisitResult visitFile(Path path, BasicFileAttributes attrs) public FileVisitResult preVisitDirectory(Path dir, BasicFileAttributes attrs) throws IOException { + String[] nameParts = dir.toFile().getCanonicalPath().split(java.io.File.separator); + if (nameParts.length >= 2) + { + String tablePart = nameParts[nameParts.length - 1]; + String ksPart = nameParts[nameParts.length - 2]; + + if (tablePart.contains("-")) + tablePart = tablePart.split("-")[0]; + + // In very old versions of Cassandra, we wouldn't necessarily delete sstables from dropped system tables + // which were removed in various major version upgrades (e.g system.Versions in 1.2) + if (ksPart.equals(SchemaConstants.SYSTEM_KEYSPACE_NAME) && !SystemKeyspace.ALL_TABLE_NAMES.contains(tablePart)) + { + // We can have snapshots of our system tables or snapshots created with a -t tag of "system" that would trigger + // this potential warning, so we warn more softly in the case that it's probably a snapshot. + if (dir.toFile().getCanonicalPath().contains("snapshot")) + { + logger.info("Found unknown system directory {}.{} at {} that contains the word snapshot. " + + "This may be left over from a previous version of Cassandra or may be normal. 
" + + " Consider removing after inspection if determined to be unnecessary.", + ksPart, tablePart, dir.toFile().getCanonicalPath()); + } + else + { + logger.warn("Found unknown system directory {}.{} at {} - this is likely left over from a previous " + + "version of Cassandra and should be removed after inspection.", + ksPart, tablePart, dir.toFile().getCanonicalPath()); + } + return FileVisitResult.SKIP_SUBTREE; + } + } + String name = dir.getFileName().toString(); return (name.equals(Directories.SNAPSHOT_SUBDIR) || name.equals(Directories.BACKUPS_SUBDIR) diff --git a/src/java/org/apache/cassandra/service/paxos/uncommitted/PaxosStateTracker.java b/src/java/org/apache/cassandra/service/paxos/uncommitted/PaxosStateTracker.java index d3594b397914..c86924935027 100644 --- a/src/java/org/apache/cassandra/service/paxos/uncommitted/PaxosStateTracker.java +++ b/src/java/org/apache/cassandra/service/paxos/uncommitted/PaxosStateTracker.java @@ -83,7 +83,7 @@ private static boolean truncateBallotMetadata() return Boolean.getBoolean(TRUNCATE_BALLOT_METADATA_PROP); } - private static final String DIRECTORY = "system/_paxos_repair_state"; + private static final String DIRECTORY = "system/" + SystemKeyspace.PAXOS_REPAIR_STATE; private final PaxosUncommittedTracker uncommitted; private final PaxosBallotTracker ballots; diff --git a/test/unit/org/apache/cassandra/service/StartupChecksTest.java b/test/unit/org/apache/cassandra/service/StartupChecksTest.java index 4a63fc1eb236..77c72530c4d0 100644 --- a/test/unit/org/apache/cassandra/service/StartupChecksTest.java +++ b/test/unit/org/apache/cassandra/service/StartupChecksTest.java @@ -36,13 +36,10 @@ import org.apache.cassandra.service.DataResurrectionCheck.Heartbeat; import org.apache.cassandra.utils.Clock; -import static java.time.Instant.ofEpochMilli; import static java.util.Collections.singletonList; import static org.apache.cassandra.io.util.FileUtils.createTempFile; -import static org.apache.cassandra.io.util.FileUtils.write; import static org.apache.cassandra.service.DataResurrectionCheck.HEARTBEAT_FILE_CONFIG_PROPERTY; import static org.apache.cassandra.service.StartupChecks.StartupCheckType.check_data_resurrection; -import static org.apache.cassandra.utils.Clock.Global.currentTimeMillis; import static org.junit.Assert.assertNotEquals; import static org.junit.Assert.assertTrue; import static org.junit.Assert.fail; @@ -115,6 +112,14 @@ public void failStartupIfInvalidSSTablesFound() throws Exception Files.createDirectories(backupDir); copyInvalidLegacySSTables(backupDir); startupChecks.verify(options); + + // and in the system directory as of CASSANDRA-17777 + new File(backupDir).deleteRecursive(); + File dataDir = new File(DatabaseDescriptor.getAllDataFileLocations()[0]); + Path systemDir = Paths.get(dataDir.absolutePath(), "system", "InvalidSystemDirectory"); + Files.createDirectories(systemDir); + copyInvalidLegacySSTables(systemDir); + startupChecks.verify(options); } @Test From f4f04fbac0eaa247dd028ddcb5a4e36dd8cd11ba Mon Sep 17 00:00:00 2001 From: Josh McKenzie Date: Fri, 5 Aug 2022 07:29:02 -0400 Subject: [PATCH 032/159] Fix checkstyle failures on JDK8 introduced by CASSANDRA-17777 Patch by Josh McKenzie; reviewed by Marcus Eriksson for CASSANDRA-17777 --- .../org/apache/cassandra/service/StartupChecks.java | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/java/org/apache/cassandra/service/StartupChecks.java b/src/java/org/apache/cassandra/service/StartupChecks.java index c313ac8f0cce..c099045537c6 100644 --- 
a/src/java/org/apache/cassandra/service/StartupChecks.java +++ b/src/java/org/apache/cassandra/service/StartupChecks.java @@ -571,7 +571,7 @@ public FileVisitResult visitFile(Path path, BasicFileAttributes attrs) public FileVisitResult preVisitDirectory(Path dir, BasicFileAttributes attrs) throws IOException { - String[] nameParts = dir.toFile().getCanonicalPath().split(java.io.File.separator); + String[] nameParts = FileUtils.getCanonicalPath(new File(dir)).split(java.io.File.separator); if (nameParts.length >= 2) { String tablePart = nameParts[nameParts.length - 1]; @@ -584,20 +584,22 @@ public FileVisitResult preVisitDirectory(Path dir, BasicFileAttributes attrs) th // which were removed in various major version upgrades (e.g system.Versions in 1.2) if (ksPart.equals(SchemaConstants.SYSTEM_KEYSPACE_NAME) && !SystemKeyspace.ALL_TABLE_NAMES.contains(tablePart)) { + String canonicalPath = FileUtils.getCanonicalPath(new File(dir)); + // We can have snapshots of our system tables or snapshots created with a -t tag of "system" that would trigger // this potential warning, so we warn more softly in the case that it's probably a snapshot. - if (dir.toFile().getCanonicalPath().contains("snapshot")) + if (canonicalPath.contains("snapshot")) { logger.info("Found unknown system directory {}.{} at {} that contains the word snapshot. " + "This may be left over from a previous version of Cassandra or may be normal. " + " Consider removing after inspection if determined to be unnecessary.", - ksPart, tablePart, dir.toFile().getCanonicalPath()); + ksPart, tablePart, canonicalPath); } else { logger.warn("Found unknown system directory {}.{} at {} - this is likely left over from a previous " + "version of Cassandra and should be removed after inspection.", - ksPart, tablePart, dir.toFile().getCanonicalPath()); + ksPart, tablePart, canonicalPath); } return FileVisitResult.SKIP_SUBTREE; } From 45f4f8c1e89e4b221b569ff3bd3e78675eff7747 Mon Sep 17 00:00:00 2001 From: Josh McKenzie Date: Tue, 2 Aug 2022 14:30:06 -0400 Subject: [PATCH 033/159] Users of NativeLibrary should handle lack of JNA appropriately when running in client mode Patch by Doug Rohrer; reviewd by Josh McKenzie and Caleb Rackliffe for CASSANDRA-17794 Co-authored-by: Doug Rohrer Co-authored-by: Josh McKenzie --- CHANGES.txt | 1 + .../cassandra/db/lifecycle/LogReplica.java | 23 +++++++++++++++++-- .../apache/cassandra/hints/HintsCatalog.java | 7 +++++- 3 files changed, 28 insertions(+), 3 deletions(-) diff --git a/CHANGES.txt b/CHANGES.txt index c802607f423f..3342318ea252 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -1,4 +1,5 @@ 4.2 + * Users of NativeLibrary should handle lack of JNA appropriately when running in client mode (CASSANDRA-17794) * Warn on unknown directories found in system keyspace directory rather than kill node during startup checks (CASSANDRA-17777) * Log duplicate rows sharing a partition key found in verify and scrub (CASSANDRA-17789) * Add separate thread pool for Secondary Index building so it doesn't block compactions (CASSANDRA-17781) diff --git a/src/java/org/apache/cassandra/db/lifecycle/LogReplica.java b/src/java/org/apache/cassandra/db/lifecycle/LogReplica.java index 1ea8b832e083..073ac7c61c16 100644 --- a/src/java/org/apache/cassandra/db/lifecycle/LogReplica.java +++ b/src/java/org/apache/cassandra/db/lifecycle/LogReplica.java @@ -23,6 +23,7 @@ import java.util.Map; import java.io.IOException; +import org.apache.cassandra.config.DatabaseDescriptor; import org.apache.cassandra.io.util.File; import org.slf4j.Logger; 
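// (Context, assumed behaviour of NativeLibrary rather than something this patch changes:
// tryOpenDirectory returns -1 when JNA or the native library is unavailable, which the
// client-mode branches below downgrade from a fatal FSReadError to a warning.)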
import org.slf4j.LoggerFactory; @@ -58,7 +59,16 @@ static LogReplica create(File directory, String fileName) { int folderFD = NativeLibrary.tryOpenDirectory(directory.path()); if (folderFD == -1 && REQUIRE_FD) - throw new FSReadError(new IOException(String.format("Invalid folder descriptor trying to create log replica %s", directory.path())), directory.path()); + { + if (DatabaseDescriptor.isClientInitialized()) + { + logger.warn("Invalid folder descriptor trying to create log replica {}. Continuing without Native I/O support.", directory.path()); + } + else + { + throw new FSReadError(new IOException(String.format("Invalid folder descriptor trying to create log replica %s", directory.path())), directory.path()); + } + } return new LogReplica(new File(fileName), folderFD); } @@ -67,7 +77,16 @@ static LogReplica open(File file) { int folderFD = NativeLibrary.tryOpenDirectory(file.parent().path()); if (folderFD == -1) - throw new FSReadError(new IOException(String.format("Invalid folder descriptor trying to create log replica %s", file.parent().path())), file.parent().path()); + { + if (DatabaseDescriptor.isClientInitialized()) + { + logger.warn("Invalid folder descriptor trying to create log replica {}. Continuing without Native I/O support.", file.parentPath()); + } + else + { + throw new FSReadError(new IOException(String.format("Invalid folder descriptor trying to create log replica %s", file.parent().path())), file.parent().path()); + } + } return new LogReplica(file, folderFD); } diff --git a/src/java/org/apache/cassandra/hints/HintsCatalog.java b/src/java/org/apache/cassandra/hints/HintsCatalog.java index 859252f0ca58..ecde896b9019 100644 --- a/src/java/org/apache/cassandra/hints/HintsCatalog.java +++ b/src/java/org/apache/cassandra/hints/HintsCatalog.java @@ -26,10 +26,11 @@ import javax.annotation.Nullable; import com.google.common.collect.ImmutableMap; -import org.apache.cassandra.io.util.File; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import org.apache.cassandra.config.DatabaseDescriptor; +import org.apache.cassandra.io.util.File; import org.apache.cassandra.io.FSError; import org.apache.cassandra.io.FSReadError; import org.apache.cassandra.io.FSWriteError; @@ -161,6 +162,10 @@ void fsyncDirectory() FileUtils.handleFSErrorAndPropagate(e); } } + else if (DatabaseDescriptor.isClientInitialized()) + { + logger.warn("Unable to open hint directory using Native library. 
Skipping sync."); + } else { logger.error("Unable to open directory {}", hintsDirectory.absolutePath()); From 8691d9b6d7a9500585e1582458fe38e71e4a5bd7 Mon Sep 17 00:00:00 2001 From: Josh McKenzie Date: Thu, 4 Aug 2022 14:14:55 -0400 Subject: [PATCH 034/159] Add UUID to nodetool import logging Patch by Marcus Eriksson; reviewed by Josh McKenzie, Jordan West, and Sam Tunnicliffe for CASSANDRA-17800 Co-authored-by: Marcus Eriksson Co-authored-by: Josh McKenzie --- .../apache/cassandra/db/SSTableImporter.java | 43 +++++++++++++++---- 1 file changed, 34 insertions(+), 9 deletions(-) diff --git a/src/java/org/apache/cassandra/db/SSTableImporter.java b/src/java/org/apache/cassandra/db/SSTableImporter.java index 594955910e20..83b4e7bc2104 100644 --- a/src/java/org/apache/cassandra/db/SSTableImporter.java +++ b/src/java/org/apache/cassandra/db/SSTableImporter.java @@ -25,6 +25,7 @@ import java.util.List; import java.util.Map; import java.util.Set; +import java.util.UUID; import com.google.common.annotations.VisibleForTesting; @@ -69,8 +70,8 @@ public SSTableImporter(ColumnFamilyStore cfs) @VisibleForTesting synchronized List importNewSSTables(Options options) { - logger.info("Loading new SSTables for {}/{}: {}", - cfs.keyspace.getName(), cfs.getTableName(), options); + UUID importID = UUID.randomUUID(); + logger.info("[{}] Loading new SSTables for {}/{}: {}", importID, cfs.keyspace.getName(), cfs.getTableName(), options); List> listers = getSSTableListers(options.srcPaths); @@ -99,12 +100,12 @@ synchronized List importNewSSTables(Options options) { if (dir != null) { - logger.error("Failed verifying sstable {} in directory {}", descriptor, dir, t); + logger.error("[{}] Failed verifying sstable {} in directory {}", importID, descriptor, dir, t); failedDirectories.add(dir); } else { - logger.error("Failed verifying sstable {}", descriptor, t); + logger.error("[{}] Failed verifying sstable {}", importID, descriptor, t); throw new RuntimeException("Failed verifying sstable "+descriptor, t); } break; @@ -144,7 +145,7 @@ synchronized List importNewSSTables(Options options) newSSTablesPerDirectory.forEach(s -> s.selfRef().release()); if (dir != null) { - logger.error("Failed importing sstables in directory {}", dir, t); + logger.error("[{}] Failed importing sstables in directory {}", importID, dir, t); failedDirectories.add(dir); if (options.copyData) { @@ -160,7 +161,7 @@ synchronized List importNewSSTables(Options options) } else { - logger.error("Failed importing sstables from data directory - renamed sstables are: {}", movedSSTables); + logger.error("[{}] Failed importing sstables from data directory - renamed sstables are: {}", importID, movedSSTables); throw new RuntimeException("Failed importing sstables", t); } } @@ -170,11 +171,13 @@ synchronized List importNewSSTables(Options options) if (newSSTables.isEmpty()) { - logger.info("No new SSTables were found for {}/{}", cfs.keyspace.getName(), cfs.getTableName()); + logger.info("[{}] No new SSTables were found for {}/{}", importID, cfs.keyspace.getName(), cfs.getTableName()); return failedDirectories; } - logger.info("Loading new SSTables and building secondary indexes for {}/{}: {}", cfs.keyspace.getName(), cfs.getTableName(), newSSTables); + logger.info("[{}] Loading new SSTables and building secondary indexes for {}/{}: {}", importID, cfs.keyspace.getName(), cfs.getTableName(), newSSTables); + if (logger.isTraceEnabled()) + logLeveling(importID, newSSTables); try (Refs refs = Refs.ref(newSSTables)) { @@ -187,10 +190,32 @@ synchronized List 
importNewSSTables(Options options)
         }
 
-        logger.info("Done loading load new SSTables for {}/{}", cfs.keyspace.getName(), cfs.getTableName());
+        logger.info("[{}] Done loading new SSTables for {}/{}", importID, cfs.keyspace.getName(), cfs.getTableName());
         return failedDirectories;
     }
 
+    private void logLeveling(UUID importID, Set newSSTables)
+    {
+        StringBuilder sb = new StringBuilder();
+        for (SSTableReader sstable : cfs.getSSTables(SSTableSet.CANONICAL))
+            sb.append(formatMetadata(sstable));
+        logger.debug("[{}] Current sstables: {}", importID, sb);
+        sb = new StringBuilder();
+        for (SSTableReader sstable : newSSTables)
+            sb.append(formatMetadata(sstable));
+        logger.debug("[{}] New sstables: {}", importID, sb);
+    }
+
+    private static String formatMetadata(SSTableReader sstable)
+    {
+        return String.format("{[%s, %s], %d, %s, %d}",
+                             sstable.first.getToken(),
+                             sstable.last.getToken(),
+                             sstable.getSSTableLevel(),
+                             sstable.isRepaired(),
+                             sstable.onDiskLength());
+    }
+
     /**
      * Opens the sstablereader described by descriptor and figures out the correct directory for it based
      * on the first token

From b9c40e25738f0ca7fc38e51a026743ead3420ab7 Mon Sep 17 00:00:00 2001
From: Blake Eggleston
Date: Tue, 2 Aug 2022 10:51:15 -0700
Subject: [PATCH 035/159] Use seeded crc for PaxosBallotTracker checksum

Patch by Blake Eggleston; Reviewed by Benedict Elliot Smith for
CASSANDRA-17793
---
 src/java/org/apache/cassandra/net/Crc.java            | 2 +-
 .../service/paxos/uncommitted/PaxosBallotTracker.java | 9 ++++-----
 2 files changed, 5 insertions(+), 6 deletions(-)

diff --git a/src/java/org/apache/cassandra/net/Crc.java b/src/java/org/apache/cassandra/net/Crc.java
index 9cd6edd94f53..8f63e51a9353 100644
--- a/src/java/org/apache/cassandra/net/Crc.java
+++ b/src/java/org/apache/cassandra/net/Crc.java
@@ -45,7 +45,7 @@ public InvalidCrc(int read, int computed)
         }
     }
 
-    static CRC32 crc32()
+    public static CRC32 crc32()
     {
         CRC32 crc = crc32.get();
         crc.reset();
diff --git a/src/java/org/apache/cassandra/service/paxos/uncommitted/PaxosBallotTracker.java b/src/java/org/apache/cassandra/service/paxos/uncommitted/PaxosBallotTracker.java
index 7404bffae456..41314a2eafe5 100644
--- a/src/java/org/apache/cassandra/service/paxos/uncommitted/PaxosBallotTracker.java
+++ b/src/java/org/apache/cassandra/service/paxos/uncommitted/PaxosBallotTracker.java
@@ -25,6 +25,7 @@
 import com.google.common.annotations.VisibleForTesting;
 import com.google.common.base.Preconditions;
+
 import org.apache.cassandra.service.ClientState;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
@@ -37,6 +38,7 @@
 import org.apache.cassandra.service.paxos.Commit;
 
 import static org.apache.cassandra.io.util.SequentialWriterOption.FINISH_ON_CLOSE;
+import static org.apache.cassandra.net.Crc.crc32;
 
 /**
  * Tracks the highest paxos ballot we've seen, and the lowest ballot we can accept.
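The motivation for switching to the seeded Crc.crc32() factory (made public above and used in the hunks below) is easy to demonstrate in isolation. A minimal sketch, assuming nothing about Cassandra internals - the seed bytes here are made up; Cassandra's Crc class keeps its own seeded, thread-local instances:

    import java.util.zip.CRC32;

    public class SeededCrcSketch
    {
        // hypothetical seed; any fixed non-zero prefix works for illustration
        private static final byte[] SEED = { (byte) 0xFA, 0x2D, (byte) 0x84, 0x1B };

        public static void main(String[] args)
        {
            CRC32 unseeded = new CRC32();
            // prints 0: indistinguishable from a zeroed checksum field
            System.out.println(unseeded.getValue());

            CRC32 seeded = new CRC32();
            seeded.update(SEED);
            // prints a non-zero value even before any payload bytes are added
            System.out.println(seeded.getValue());
        }
    }

With an unseeded CRC32, a truncated or zero-filled ballot file reads back with a checksum of 0, which matches a zeroed checksum field; seeding removes that blind spot.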
@@ -66,9 +68,6 @@ private PaxosBallotTracker(File directory, Ballot highBound, Ballot lowBound) this.lowBound = lowBound; } - /** - * creates a new crc32 instance seeded with a non-zero value - */ private static void serializeBallot(SequentialWriter writer, CRC32 crc, Ballot ballot) throws IOException { ByteBuffer bytes = ballot.toBytes(); @@ -105,7 +104,7 @@ public static PaxosBallotTracker load(File directory) throws IOException throw new IOException("Unsupported ballot file version: " + version); byte[] bytes = new byte[16]; - CRC32 crc = new CRC32(); + CRC32 crc = crc32(); Ballot highBallot = deserializeBallot(reader, crc, bytes); Ballot lowBallot = deserializeBallot(reader, crc, bytes); int checksum = Integer.reverseBytes(reader.readInt()); @@ -129,7 +128,7 @@ public synchronized void flush() throws IOException try(SequentialWriter writer = new SequentialWriter(file, FINISH_ON_CLOSE)) { - CRC32 crc = new CRC32(); + CRC32 crc = crc32(); writer.writeInt(FILE_VERSION); serializeBallot(writer, crc, getHighBound()); serializeBallot(writer, crc, getLowBound()); From c7d2e97da04e09dd908b1ea238a8498cfe5a3edb Mon Sep 17 00:00:00 2001 From: David Capwell Date: Mon, 8 Aug 2022 10:37:30 -0700 Subject: [PATCH 036/159] NPE bug in streaming checking if SSTable is being repaired patch by David Capwell; reviewed by Marcus Eriksson for CASSANDRA-17801 --- CHANGES.txt | 1 + .../cassandra/db/streaming/CassandraStreamManager.java | 6 +++++- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/CHANGES.txt b/CHANGES.txt index 64ee3e800d27..e33cd45a78e7 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -1,4 +1,5 @@ 4.2 + * NPE bug in streaming checking if SSTable is being repaired (CASSANDRA-17801) * Users of NativeLibrary should handle lack of JNA appropriately when running in client mode (CASSANDRA-17794) * Warn on unknown directories found in system keyspace directory rather than kill node during startup checks (CASSANDRA-17777) * Log duplicate rows sharing a partition key found in verify and scrub (CASSANDRA-17789) diff --git a/src/java/org/apache/cassandra/db/streaming/CassandraStreamManager.java b/src/java/org/apache/cassandra/db/streaming/CassandraStreamManager.java index 46cf253d4d5d..8ca7ac5dacac 100644 --- a/src/java/org/apache/cassandra/db/streaming/CassandraStreamManager.java +++ b/src/java/org/apache/cassandra/db/streaming/CassandraStreamManager.java @@ -30,6 +30,7 @@ import org.apache.cassandra.dht.Range; import org.apache.cassandra.dht.Token; import org.apache.cassandra.io.sstable.format.SSTableReader; +import org.apache.cassandra.io.sstable.metadata.StatsMetadata; import org.apache.cassandra.locator.RangesAtEndpoint; import org.apache.cassandra.locator.Replica; import org.apache.cassandra.service.ActiveRepairService; @@ -105,7 +106,10 @@ else if (pendingRepair == ActiveRepairService.NO_PENDING_REPAIR) } else { - predicate = s -> s.isPendingRepair() && s.getSSTableMetadata().pendingRepair.equals(pendingRepair); + predicate = s -> { + StatsMetadata sstableMetadata = s.getSSTableMetadata(); + return sstableMetadata.pendingRepair != ActiveRepairService.NO_PENDING_REPAIR && sstableMetadata.pendingRepair.equals(pendingRepair); + }; } for (Range keyRange : keyRanges) From 72c2270a80f2acc8ece3eade4d6a2f8e8cb12356 Mon Sep 17 00:00:00 2001 From: Brad Schoening <5796692+bschoening@users.noreply.github.com> Date: Wed, 27 Jul 2022 22:57:52 -0400 Subject: [PATCH 037/159] Resolve pylint issues in pylexotron.py and improve readability Patch by Brad Schoening; reviewed by brandonwilliams and smiklosovic 
for CASSANDRA-17779 --- CHANGES.txt | 1 + pylib/cqlshlib/pylexotron.py | 148 +++++++++++++++++++---------------- 2 files changed, 82 insertions(+), 67 deletions(-) diff --git a/CHANGES.txt b/CHANGES.txt index e33cd45a78e7..d1957a4667f9 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -1,4 +1,5 @@ 4.2 + * Cleanup pylint issues with pylexotron.py (CASSANDRA-17779) * NPE bug in streaming checking if SSTable is being repaired (CASSANDRA-17801) * Users of NativeLibrary should handle lack of JNA appropriately when running in client mode (CASSANDRA-17794) * Warn on unknown directories found in system keyspace directory rather than kill node during startup checks (CASSANDRA-17777) diff --git a/pylib/cqlshlib/pylexotron.py b/pylib/cqlshlib/pylexotron.py index 69f31dced770..c1fd55edbfd6 100644 --- a/pylib/cqlshlib/pylexotron.py +++ b/pylib/cqlshlib/pylexotron.py @@ -14,7 +14,12 @@ # See the License for the specific language governing permissions and # limitations under the License. +"""Pylexotron uses Python's re.Scanner module as a simple regex-based tokenizer for BNF production rules""" + import re +import inspect +import sys +from typing import Union from cqlshlib.saferscanner import SaferScanner @@ -56,8 +61,8 @@ def __repr__(self): return '%s(%r)' % (self.__class__, self.text) -def is_hint(x): - return isinstance(x, Hint) +def is_hint(obj): + return isinstance(obj, Hint) class ParseContext: @@ -115,7 +120,7 @@ def __repr__(self): % (self.__class__.__name__, self.matched, self.remainder, self.productionname, self.bindings) -class matcher: +class Matcher: def __init__(self, arg): self.arg = arg @@ -155,38 +160,38 @@ def __repr__(self): return '%s(%r)' % (self.__class__.__name__, self.arg) -class choice(matcher): +class Choice(Matcher): def match(self, ctxt, completions): foundctxts = [] - for a in self.arg: - subctxts = a.match(ctxt, completions) + for each in self.arg: + subctxts = each.match(ctxt, completions) foundctxts.extend(subctxts) return foundctxts -class one_or_none(matcher): +class OneOrNone(Matcher): def match(self, ctxt, completions): return [ctxt] + list(self.arg.match(ctxt, completions)) -class repeat(matcher): +class Repeat(Matcher): def match(self, ctxt, completions): found = [ctxt] ctxts = [ctxt] while True: new_ctxts = [] - for c in ctxts: - new_ctxts.extend(self.arg.match(c, completions)) + for each in ctxts: + new_ctxts.extend(self.arg.match(each, completions)) if not new_ctxts: return found found.extend(new_ctxts) ctxts = new_ctxts -class rule_reference(matcher): +class RuleReference(Matcher): def match(self, ctxt, completions): prevname = ctxt.productionname @@ -198,24 +203,24 @@ def match(self, ctxt, completions): return [c.with_production_named(prevname) for c in output] -class rule_series(matcher): +class RuleSeries(Matcher): def match(self, ctxt, completions): ctxts = [ctxt] for patpiece in self.arg: new_ctxts = [] - for c in ctxts: - new_ctxts.extend(patpiece.match(c, completions)) + for each in ctxts: + new_ctxts.extend(patpiece.match(each, completions)) if not new_ctxts: return () ctxts = new_ctxts return ctxts -class named_symbol(matcher): +class NamedSymbol(Matcher): def __init__(self, name, arg): - matcher.__init__(self, arg) + Matcher.__init__(self, arg) self.name = name def match(self, ctxt, completions): @@ -224,13 +229,14 @@ def match(self, ctxt, completions): # don't collect other completions under this; use a dummy pass_in_compls = set() results = self.arg.match_with_results(ctxt, pass_in_compls) - return [c.with_binding(self.name, ctxt.extract_orig(matchtoks)) 
for (c, matchtoks) in results] + return [c.with_binding(self.name, ctxt.extract_orig(matchtoks)) + for (c, matchtoks) in results] def __repr__(self): return '%s(%r, %r)' % (self.__class__.__name__, self.name, self.arg) -class named_collector(named_symbol): +class NamedCollector(NamedSymbol): def match(self, ctxt, completions): pass_in_compls = completions @@ -244,18 +250,21 @@ def match(self, ctxt, completions): return output -class terminal_matcher(matcher): +class TerminalMatcher(Matcher): + + def match(self, ctxt, completions): + raise NotImplementedError def pattern(self): raise NotImplementedError -class regex_rule(terminal_matcher): +class RegexRule(TerminalMatcher): def __init__(self, pat): - terminal_matcher.__init__(self, pat) + TerminalMatcher.__init__(self, pat) self.regex = pat - self.re = re.compile(pat + '$', re.I | re.S) + self.re = re.compile(pat + '$', re.IGNORECASE | re.DOTALL) def match(self, ctxt, completions): if ctxt.remainder: @@ -269,12 +278,12 @@ def pattern(self): return self.regex -class text_match(terminal_matcher): +class TextMatch(TerminalMatcher): alpha_re = re.compile(r'[a-zA-Z]') def __init__(self, text): try: - terminal_matcher.__init__(self, eval(text)) + TerminalMatcher.__init__(self, eval(text)) except SyntaxError: print("bad syntax %r" % (text,)) @@ -289,12 +298,13 @@ def match(self, ctxt, completions): def pattern(self): # can't use (?i) here- Scanner component regex flags won't be applied def ignorecaseify(matchobj): - c = matchobj.group(0) - return '[%s%s]' % (c.upper(), c.lower()) + val = matchobj.group(0) + return '[%s%s]' % (val.upper(), val.lower()) + return self.alpha_re.sub(ignorecaseify, re.escape(self.arg)) -class case_match(text_match): +class CaseMatch(TextMatch): def match(self, ctxt, completions): if ctxt.remainder: @@ -308,22 +318,22 @@ def pattern(self): return re.escape(self.arg) -class word_match(text_match): +class WordMatch(TextMatch): def pattern(self): - return r'\b' + text_match.pattern(self) + r'\b' + return r'\b' + TextMatch.pattern(self) + r'\b' -class case_word_match(case_match): +class CaseWordMatch(CaseMatch): def pattern(self): - return r'\b' + case_match.pattern(self) + r'\b' + return r'\b' + CaseMatch.pattern(self) + r'\b' -class terminal_type_matcher(matcher): +class TerminalTypeMatcher(Matcher): def __init__(self, tokentype, submatcher): - matcher.__init__(self, tokentype) + Matcher.__init__(self, tokentype) self.tokentype = tokentype self.submatcher = submatcher @@ -340,18 +350,24 @@ def __repr__(self): class ParsingRuleSet: + """Define the BNF tokenization rules for cql3handling.syntax_rules. Backus-Naur Form consists of + - Production rules in the form: Left-Hand-Side ::= Right-Hand-Side. The LHS is a non-terminal. + - Productions or non-terminal symbols + - Terminal symbols. Every terminal is a single token. 
+ """ + RuleSpecScanner = SaferScanner([ - (r'::=', lambda s, t: t), + (r'::=', lambda s, t: t), # BNF rule definition (r'\[[a-z0-9_]+\]=', lambda s, t: ('named_collector', t[1:-2])), (r'[a-z0-9_]+=', lambda s, t: ('named_symbol', t[:-1])), (r'/(\[\^?.[^]]*\]|[^/]|\\.)*/', lambda s, t: ('regex', t[1:-1].replace(r'\/', '/'))), - (r'"([^"]|\\.)*"', lambda s, t: ('litstring', t)), + (r'"([^"]|\\.)*"', lambda s, t: ('string_literal', t)), (r'<[^>]*>', lambda s, t: ('reference', t[1:-1])), (r'\bJUNK\b', lambda s, t: ('junk', t)), (r'[@()|?*;]', lambda s, t: t), - (r'\s+', None), + (r'\s+', None), # whitespace (r'#[^\n]*', None), - ], re.I | re.S | re.U) + ], re.IGNORECASE | re.DOTALL | re.UNICODE) def __init__(self): self.ruleset = {} @@ -368,7 +384,7 @@ def from_rule_defs(cls, rule_defs): def parse_rules(cls, rulestr): tokens, unmatched = cls.RuleSpecScanner.scan(rulestr) if unmatched: - raise LexingError.from_text(rulestr, unmatched, msg="Syntax rules unparseable") + raise LexingError.from_text(rulestr, unmatched, msg="Syntax rules are unparseable") rules = {} terminals = [] tokeniter = iter(tokens) @@ -379,9 +395,9 @@ def parse_rules(cls, rulestr): raise ValueError('Unexpected token %r; expected "::="' % (assign,)) name = t[1] production = cls.read_rule_tokens_until(';', tokeniter) - if isinstance(production, terminal_matcher): + if isinstance(production, TerminalMatcher): terminals.append((name, production)) - production = terminal_type_matcher(name, production) + production = TerminalTypeMatcher(name, production) rules[name] = production else: raise ValueError('Unexpected token %r; expected name' % (t,)) @@ -392,11 +408,11 @@ def mkrule(pieces): if isinstance(pieces, (tuple, list)): if len(pieces) == 1: return pieces[0] - return rule_series(pieces) + return RuleSeries(pieces) return pieces @classmethod - def read_rule_tokens_until(cls, endtoks, tokeniter): + def read_rule_tokens_until(cls, endtoks: Union[str, int], tokeniter): if isinstance(endtoks, str): endtoks = (endtoks,) counttarget = None @@ -411,32 +427,32 @@ def read_rule_tokens_until(cls, endtoks, tokeniter): if t in endtoks: if len(mybranches) == 1: return cls.mkrule(mybranches[0]) - return choice(list(map(cls.mkrule, mybranches))) + return Choice(list(map(cls.mkrule, mybranches))) if isinstance(t, tuple): if t[0] == 'reference': - t = rule_reference(t[1]) - elif t[0] == 'litstring': + t = RuleReference(t[1]) + elif t[0] == 'string_literal': if t[1][1].isalnum() or t[1][1] == '_': - t = word_match(t[1]) + t = WordMatch(t[1]) else: - t = text_match(t[1]) + t = TextMatch(t[1]) elif t[0] == 'regex': - t = regex_rule(t[1]) + t = RegexRule(t[1]) elif t[0] == 'named_collector': - t = named_collector(t[1], cls.read_rule_tokens_until(1, tokeniter)) + t = NamedCollector(t[1], cls.read_rule_tokens_until(1, tokeniter)) elif t[0] == 'named_symbol': - t = named_symbol(t[1], cls.read_rule_tokens_until(1, tokeniter)) + t = NamedSymbol(t[1], cls.read_rule_tokens_until(1, tokeniter)) elif t == '(': t = cls.read_rule_tokens_until(')', tokeniter) elif t == '?': - t = one_or_none(myrules.pop(-1)) + t = OneOrNone(myrules.pop(-1)) elif t == '*': - t = repeat(myrules.pop(-1)) + t = Repeat(myrules.pop(-1)) elif t == '@': - x = next(tokeniter) - if not isinstance(x, tuple) or x[0] != 'litstring': - raise ValueError("Unexpected token %r following '@'" % (x,)) - t = case_match(x[1]) + val = next(tokeniter) + if not isinstance(val, tuple) or val[0] != 'string_literal': + raise ValueError("Unexpected token %r following '@'" % (val,)) + t = CaseMatch(val[1]) 
elif t == '|': myrules = [] mybranches.append(myrules) @@ -447,7 +463,7 @@ def read_rule_tokens_until(cls, endtoks, tokeniter): if countsofar == counttarget: if len(mybranches) == 1: return cls.mkrule(mybranches[0]) - return choice(list(map(cls.mkrule, mybranches))) + return Choice(list(map(cls.mkrule, mybranches))) raise ValueError('Unexpected end of rule tokens') def append_rules(self, rulestr): @@ -465,8 +481,9 @@ def make_handler(name): if name == 'JUNK': return None return lambda s, t: (name, t, s.match.span()) + regexes = [(p.pattern(), make_handler(name)) for (name, p) in self.terminals] - return SaferScanner(regexes, re.I | re.S | re.U).scan + return SaferScanner(regexes, re.IGNORECASE | re.DOTALL | re.UNICODE).scan def lex(self, text): if self.scanner is None: @@ -487,9 +504,9 @@ def whole_match(self, startsymbol, tokens, srcstr=None): bindings = {} if srcstr is not None: bindings['*SRC*'] = srcstr - for c in self.parse(startsymbol, tokens, init_bindings=bindings): - if not c.remainder: - return c + for val in self.parse(startsymbol, tokens, init_bindings=bindings): + if not val.remainder: + return val def lex_and_parse(self, text, startsymbol='Start'): return self.parse(startsymbol, self.lex(text), init_bindings={'*SRC*': text}) @@ -511,9 +528,6 @@ def complete(self, startsymbol, tokens, init_bindings=None): return completions -import sys - - class Debugotron(set): depth = 10 @@ -525,9 +539,9 @@ def add(self, item): self._note_addition(item) set.add(self, item) - def _note_addition(self, foo): - self.stream.write("\nitem %r added by:\n" % (foo,)) - frame = sys._getframe().f_back.f_back + def _note_addition(self, item): + self.stream.write("\nitem %r added by:\n" % (item,)) + frame = inspect.currentframe().f_back.f_back for i in range(self.depth): name = frame.f_code.co_name filename = frame.f_code.co_filename From 9a4a67782311e4a6ffe3aad0516c33016f187c54 Mon Sep 17 00:00:00 2001 From: Bernardo Botella Corbi Date: Mon, 1 Aug 2022 10:57:04 -0700 Subject: [PATCH 038/159] Removed Python < 2.7 support from formatting.py Patch by Bernardo Botella Corbi, reviewed by Brad Schoening, ycai and brandonwilliams for CASSANDRA-17694 --- CHANGES.txt | 1 + pylib/cqlshlib/formatting.py | 16 +++------------- 2 files changed, 4 insertions(+), 13 deletions(-) diff --git a/CHANGES.txt b/CHANGES.txt index d1957a4667f9..e25c228828f4 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -1,4 +1,5 @@ 4.2 + * Removed Python < 2.7 support from formatting.py (CASSANDRA-17694) * Cleanup pylint issues with pylexotron.py (CASSANDRA-17779) * NPE bug in streaming checking if SSTable is being repaired (CASSANDRA-17801) * Users of NativeLibrary should handle lack of JNA appropriately when running in client mode (CASSANDRA-17794) diff --git a/pylib/cqlshlib/formatting.py b/pylib/cqlshlib/formatting.py index b49a29aebdaf..39bc060485da 100644 --- a/pylib/cqlshlib/formatting.py +++ b/pylib/cqlshlib/formatting.py @@ -326,19 +326,9 @@ def format_integer_type(val, colormap, thousands_sep=None, **_): return colorme(bval, colormap, 'int') -# We can get rid of this in cassandra-2.2 -if sys.version_info >= (2, 7): - def format_integer_with_thousands_sep(val, thousands_sep=','): - return "{:,.0f}".format(val).replace(',', thousands_sep) -else: - def format_integer_with_thousands_sep(val, thousands_sep=','): - if val < 0: - return '-' + format_integer_with_thousands_sep(-val, thousands_sep) - result = '' - while val >= 1000: - val, r = divmod(val, 1000) - result = "%s%03d%s" % (thousands_sep, r, result) - return "%d%s" % (val, result) 
+def format_integer_with_thousands_sep(val, thousands_sep=','):
+    return "{:,.0f}".format(val).replace(',', thousands_sep)
+
 formatter_for('long')(format_integer_type)
 formatter_for('int')(format_integer_type)

From 570732375e4186741388adb81afeab6f155f57b9 Mon Sep 17 00:00:00 2001
From: Jacek Lewandowski
Date: Fri, 10 Jun 2022 11:43:53 +0200
Subject: [PATCH 039/159] Fix a race condition where a keyspace can be opened
 while it is being removed
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

patch by Jacek Lewandowski; reviewed by Andrés de la Peña and Ekaterina
Dimitrova for CASSANDRA-17658
---
 CHANGES.txt                                   |   1 +
 .../org/apache/cassandra/db/Keyspace.java     |  10 +-
 .../schema/DefaultSchemaUpdateHandler.java    |  13 +-
 .../org/apache/cassandra/schema/Schema.java   |  58 ++++---
 .../distributed/test/SchemaTest.java          |  34 ++++
 .../apache/cassandra/cql3/CorruptionTest.java |  21 ++-
 .../org/apache/cassandra/ServerTestUtils.java |  11 ++
 .../cassandra/audit/AuditLoggerAuthTest.java  |   7 +-
 ...ggableScheduledThreadPoolExecutorTest.java |  15 +-
 .../org/apache/cassandra/cql3/BatchTests.java |  17 +-
 .../org/apache/cassandra/cql3/CQLTester.java  |   6 +-
 .../org/apache/cassandra/cql3/PagingTest.java |  11 +-
 .../cassandra/metrics/BatchMetricsTest.java   |  26 ++--
 .../cassandra/metrics/CQLMetricsTest.java     |  21 ++-
 .../metrics/ClientRequestMetricsTest.java     |  23 +--
 .../metrics/KeyspaceMetricsTest.java          |  36 ++---
 .../cassandra/metrics/TableMetricsTest.java   |  23 +--
 .../apache/cassandra/schema/SchemaTest.java   |  41 +++--
 .../tools/nodetool/ClientStatsTest.java       | 147 +++++++++++-------
 .../cassandra/transport/CQLUserAuditTest.java |   7 +-
 20 files changed, 336 insertions(+), 192 deletions(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index ca1db37af3b2..204544b76070 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -28,6 +28,7 @@
  * Add guardrail for ALTER TABLE ADD / DROP / REMOVE column operations (CASSANDRA-17495)
  * Rename DisableFlag class to EnableFlag on guardrails (CASSANDRA-17544)
 Merged from 4.1:
+ * Fix a race condition where a keyspace can be opened while it is being removed (CASSANDRA-17658)
  * DatabaseDescriptor will set the default failure detector during client initialization (CASSANDRA-17782)
  * Avoid initializing schema via SystemKeyspace.getPreferredIP() with the BulkLoader tool (CASSANDRA-17740)
  * Uncomment prepared_statements_cache_size, key_cache_size, counter_cache_size, index_summary_capacity which were
diff --git a/src/java/org/apache/cassandra/db/Keyspace.java b/src/java/org/apache/cassandra/db/Keyspace.java
index 63c02be5dbc1..d6db700519b4 100644
--- a/src/java/org/apache/cassandra/db/Keyspace.java
+++ b/src/java/org/apache/cassandra/db/Keyspace.java
@@ -126,7 +126,10 @@ public static boolean isInitialized()
 
     public static void setInitialized()
     {
-        initialized = true;
+        synchronized (Schema.instance)
+        {
+            initialized = true;
+        }
     }
 
     /**
@@ -137,7 +140,10 @@ public static void setInitialized()
     @VisibleForTesting
     public static void unsetInitialized()
     {
-        initialized = false;
+        synchronized (Schema.instance)
+        {
+            initialized = false;
+        }
     }
 
     public static Keyspace open(String keyspaceName)
diff --git a/src/java/org/apache/cassandra/schema/DefaultSchemaUpdateHandler.java b/src/java/org/apache/cassandra/schema/DefaultSchemaUpdateHandler.java
index 1ccecc60fc83..381b4e5ad9e5 100644
--- a/src/java/org/apache/cassandra/schema/DefaultSchemaUpdateHandler.java
+++ b/src/java/org/apache/cassandra/schema/DefaultSchemaUpdateHandler.java
@@ -24,7 +24,6 @@ import java.util.Set;
 import
java.util.UUID; import java.util.function.BiConsumer; -import java.util.function.Consumer; import java.util.stream.Collectors; import com.google.common.annotations.VisibleForTesting; @@ -47,7 +46,6 @@ import org.apache.cassandra.service.StorageService; import org.apache.cassandra.utils.FBUtilities; import org.apache.cassandra.utils.Pair; -import org.apache.cassandra.utils.concurrent.ImmediateFuture; import static org.apache.cassandra.schema.MigrationCoordinator.MAX_OUTSTANDING_VERSION_REQUESTS; @@ -257,12 +255,11 @@ private synchronized SchemaTransformationResult reload() @Override public SchemaTransformationResult reset(boolean local) { - return local - ? reload() - : migrationCoordinator.pullSchemaFromAnyNode() - .flatMap(mutations -> ImmediateFuture.success(applyMutations(mutations))) - .awaitThrowUncheckedOnInterrupt() - .getNow(); + if (local) + return reload(); + + Collection mutations = migrationCoordinator.pullSchemaFromAnyNode().awaitThrowUncheckedOnInterrupt().getNow(); + return applyMutations(mutations); } @Override diff --git a/src/java/org/apache/cassandra/schema/Schema.java b/src/java/org/apache/cassandra/schema/Schema.java index f89f8d51107a..f23370c54989 100644 --- a/src/java/org/apache/cassandra/schema/Schema.java +++ b/src/java/org/apache/cassandra/schema/Schema.java @@ -19,6 +19,7 @@ import java.time.Duration; import java.util.*; +import java.util.function.Consumer; import java.util.function.Supplier; import com.google.common.annotations.VisibleForTesting; @@ -227,15 +228,15 @@ public Keyspace maybeAddKeyspaceInstance(String keyspaceName, Supplier return keyspaceInstances.blockingLoadIfAbsent(keyspaceName, loadFunction); } - public Keyspace maybeRemoveKeyspaceInstance(String keyspaceName, boolean dropData) + private Keyspace maybeRemoveKeyspaceInstance(String keyspaceName, Consumer unloadFunction) { try { - return keyspaceInstances.blockingUnloadIfPresent(keyspaceName, keyspace -> keyspace.unload(dropData)); + return keyspaceInstances.blockingUnloadIfPresent(keyspaceName, unloadFunction); } catch (LoadingMap.UnloadExecutionException e) { - throw new AssertionError("Failed to unload the keyspace " + keyspaceName); + throw new AssertionError("Failed to unload the keyspace " + keyspaceName, e); } } @@ -532,16 +533,6 @@ private synchronized void updateVersion(UUID version) SchemaDiagnostics.versionUpdated(this); } - /** - * Clear all KS/CF metadata and reset version. - */ - public synchronized void clear() - { - distributedKeyspaces.forEach(this::unload); - updateVersion(SchemaConstants.emptyVersion); - SchemaDiagnostics.schemaCleared(this); - } - /** * When we receive {@link SchemaTransformationResult} in a callback invocation, the transformation result includes * pre-transformation and post-transformation schema metadata and versions, and a diff between them. Basically @@ -617,16 +608,18 @@ public SchemaTransformationResult transform(SchemaTransformation transformation, } /** - * Clear all locally stored schema information and reset schema to initial state. + * Clear all locally stored schema information and fetch schema from another node. * Called by user (via JMX) who wants to get rid of schema disagreement. 
*/ - public void resetLocalSchema() + public synchronized void resetLocalSchema() { logger.debug("Clearing local schema..."); updateHandler.clear(); logger.debug("Clearing local schema keyspace instances..."); - clear(); + distributedKeyspaces.forEach(this::unload); + updateVersion(SchemaConstants.emptyVersion); + SchemaDiagnostics.schemaCleared(this); updateHandler.reset(false); logger.info("Local schema reset is complete."); @@ -692,37 +685,40 @@ private void createKeyspace(KeyspaceMetadata keyspace) // we send mutations to the correct set of bootstrapping nodes. Refer CASSANDRA-15433. if (keyspace.params.replication.klass != LocalStrategy.class && Keyspace.isInitialized()) { - PendingRangeCalculatorService.calculatePendingRanges(Keyspace.open(keyspace.name).getReplicationStrategy(), keyspace.name); + PendingRangeCalculatorService.calculatePendingRanges(Keyspace.open(keyspace.name, this, true).getReplicationStrategy(), keyspace.name); } } - private void dropKeyspace(KeyspaceMetadata keyspace, boolean dropData) + private void dropKeyspace(KeyspaceMetadata keyspaceMetadata, boolean dropData) { - SchemaDiagnostics.keyspaceDropping(this, keyspace); + SchemaDiagnostics.keyspaceDropping(this, keyspaceMetadata); boolean initialized = Keyspace.isInitialized(); - Keyspace ks = initialized ? getKeyspaceInstance(keyspace.name) : null; + Keyspace keyspace = initialized ? Keyspace.open(keyspaceMetadata.name, this, false) : null; if (initialized) { - if (ks == null) + if (keyspace == null) return; - keyspace.views.forEach(v -> dropView(ks, v, dropData)); - keyspace.tables.forEach(t -> dropTable(ks, t, dropData)); + keyspaceMetadata.views.forEach(v -> dropView(keyspace, v, dropData)); + keyspaceMetadata.tables.forEach(t -> dropTable(keyspace, t, dropData)); // remove the keyspace from the static instances - maybeRemoveKeyspaceInstance(keyspace.name, dropData); - } + Keyspace unloadedKeyspace = maybeRemoveKeyspaceInstance(keyspaceMetadata.name, ks -> { + ks.unload(dropData); + unload(keyspaceMetadata); + }); + assert unloadedKeyspace == keyspace; - unload(keyspace); - - if (initialized) - { Keyspace.writeOrder.awaitNewBarrier(); } + else + { + unload(keyspaceMetadata); + } - schemaChangeNotifier.notifyKeyspaceDropped(keyspace, dropData); - SchemaDiagnostics.keyspaceDropped(this, keyspace); + schemaChangeNotifier.notifyKeyspaceDropped(keyspaceMetadata, dropData); + SchemaDiagnostics.keyspaceDropped(this, keyspaceMetadata); } private void dropView(Keyspace keyspace, ViewMetadata metadata, boolean dropData) diff --git a/test/distributed/org/apache/cassandra/distributed/test/SchemaTest.java b/test/distributed/org/apache/cassandra/distributed/test/SchemaTest.java index a2ce32f6f373..7b03105063d6 100644 --- a/test/distributed/org/apache/cassandra/distributed/test/SchemaTest.java +++ b/test/distributed/org/apache/cassandra/distributed/test/SchemaTest.java @@ -18,10 +18,15 @@ package org.apache.cassandra.distributed.test; +import java.time.Duration; + import org.junit.Test; import org.apache.cassandra.distributed.Cluster; import org.apache.cassandra.distributed.api.ConsistencyLevel; +import org.apache.cassandra.distributed.api.Feature; +import org.apache.cassandra.schema.Schema; +import org.awaitility.Awaitility; import static org.junit.Assert.assertTrue; @@ -86,4 +91,33 @@ private void selectSilent(Cluster cluster, String name) assertTrue(causeIsUnknownColumn); } } + + @Test + public void schemaReset() throws Throwable + { + try (Cluster cluster = init(Cluster.build(2).withConfig(cfg -> 
cfg.with(Feature.GOSSIP, Feature.NETWORK)).start()))
+        {
+            cluster.schemaChange("CREATE TABLE " + KEYSPACE + ".tbl (pk INT PRIMARY KEY, v TEXT)");
+
+            assertTrue(cluster.get(1).callOnInstance(() -> Schema.instance.getTableMetadata(KEYSPACE, "tbl") != null));
+            assertTrue(cluster.get(2).callOnInstance(() -> Schema.instance.getTableMetadata(KEYSPACE, "tbl") != null));
+
+            cluster.get(2).shutdown().get();
+
+            // when schema is removed and there is no other node to fetch it from, node 1 should be left with a clean schema
+            cluster.get(1).runOnInstance(() -> Schema.instance.resetLocalSchema());
+            assertTrue(cluster.get(1).callOnInstance(() -> Schema.instance.getTableMetadata(KEYSPACE, "tbl") == null));
+
+            // when the other node is started, schema should be back in sync
+            cluster.get(2).startup();
+            Awaitility.waitAtMost(Duration.ofMinutes(1))
+                      .pollDelay(Duration.ofSeconds(1))
+                      .until(() -> cluster.get(1).callOnInstance(() -> Schema.instance.getTableMetadata(KEYSPACE, "tbl") != null));
+
+            // when schema is removed and there is a node to fetch it from, node 1 should immediately restore the schema
+            cluster.get(1).runOnInstance(() -> Schema.instance.resetLocalSchema());
+            assertTrue(cluster.get(1).callOnInstance(() -> Schema.instance.getTableMetadata(KEYSPACE, "tbl") != null));
+        }
+    }
+
 }
diff --git a/test/long/org/apache/cassandra/cql3/CorruptionTest.java b/test/long/org/apache/cassandra/cql3/CorruptionTest.java
index 78f587158a30..0ef43a0991ed 100644
--- a/test/long/org/apache/cassandra/cql3/CorruptionTest.java
+++ b/test/long/org/apache/cassandra/cql3/CorruptionTest.java
@@ -26,7 +26,10 @@
 import java.util.concurrent.Executors;
 import java.util.concurrent.TimeUnit;
 
+import org.apache.cassandra.ServerTestUtils;
 import org.apache.cassandra.io.util.File;
+
+import org.junit.AfterClass;
 import org.junit.BeforeClass;
 import org.junit.Test;
 
@@ -34,14 +37,12 @@
 import com.datastax.driver.core.policies.LoggingRetryPolicy;
 import com.datastax.driver.core.policies.Policies;
 import com.datastax.driver.core.utils.Bytes;
-import org.apache.cassandra.SchemaLoader;
 import org.apache.cassandra.config.DatabaseDescriptor;
 import org.apache.cassandra.io.util.FileWriter;
-import org.apache.cassandra.schema.Schema;
 import org.apache.cassandra.exceptions.ConfigurationException;
 import org.apache.cassandra.service.EmbeddedCassandraService;
 
-public class CorruptionTest extends SchemaLoader
+public class CorruptionTest
 {
     private static EmbeddedCassandraService cassandra;
 
@@ -59,10 +60,7 @@ public class CorruptionTest extends SchemaLoader
     @BeforeClass()
     public static void setup() throws ConfigurationException, IOException
     {
-        Schema.instance.clear();
-
-        cassandra = new EmbeddedCassandraService();
-        cassandra.start();
+        cassandra = ServerTestUtils.startEmbeddedCassandraService();
 
         cluster = Cluster.builder().addContactPoint("127.0.0.1")
                          .withRetryPolicy(new LoggingRetryPolicy(Policies.defaultRetryPolicy()))
@@ -102,6 +100,15 @@ public static void setup() throws ConfigurationException, IOException
         VALUE = s.toString();
     }
 
+    @AfterClass
+    public static void tearDown()
+    {
+        if (cluster != null)
+            cluster.close();
+        if (cassandra != null)
+            cassandra.stop();
+    }
+
     @Test
     public void runCorruptionTest()
     {
diff --git a/test/unit/org/apache/cassandra/ServerTestUtils.java b/test/unit/org/apache/cassandra/ServerTestUtils.java
index e35c8a6d0b7f..10cb08228594 100644
--- a/test/unit/org/apache/cassandra/ServerTestUtils.java
+++ b/test/unit/org/apache/cassandra/ServerTestUtils.java
@@ -36,6 +36,7 @@ import
org.apache.cassandra.locator.InetAddressAndPort; import org.apache.cassandra.locator.Replica; import org.apache.cassandra.security.ThreadAwareSecurityManager; +import org.apache.cassandra.service.EmbeddedCassandraService; /** * Utility methodes used by SchemaLoader and CQLTester to manage the server and its state. @@ -189,6 +190,16 @@ public static void cleanupSavedCaches() cleanupDirectory(DatabaseDescriptor.getSavedCachesLocation()); } + public static EmbeddedCassandraService startEmbeddedCassandraService() throws IOException + { + DatabaseDescriptor.daemonInitialization(); + mkdirs(); + cleanup(); + EmbeddedCassandraService service = new EmbeddedCassandraService(); + service.start(); + return service; + } + private ServerTestUtils() { } diff --git a/test/unit/org/apache/cassandra/audit/AuditLoggerAuthTest.java b/test/unit/org/apache/cassandra/audit/AuditLoggerAuthTest.java index 71a88e526d12..9a3c605f8e04 100644 --- a/test/unit/org/apache/cassandra/audit/AuditLoggerAuthTest.java +++ b/test/unit/org/apache/cassandra/audit/AuditLoggerAuthTest.java @@ -33,11 +33,10 @@ import com.datastax.driver.core.exceptions.AuthenticationException; import com.datastax.driver.core.exceptions.SyntaxError; import com.datastax.driver.core.exceptions.UnauthorizedException; - +import org.apache.cassandra.ServerTestUtils; import org.apache.cassandra.config.DatabaseDescriptor; import org.apache.cassandra.config.OverrideConfigurationLoader; import org.apache.cassandra.config.ParameterizedClass; -import org.apache.cassandra.cql3.CQLTester; import org.apache.cassandra.cql3.PasswordObfuscator; import org.apache.cassandra.locator.InetAddressAndPort; import org.apache.cassandra.service.EmbeddedCassandraService; @@ -75,11 +74,9 @@ public static void setup() throws Exception config.audit_logging_options.enabled = true; config.audit_logging_options.logger = new ParameterizedClass("InMemoryAuditLogger", null); }); - CQLTester.prepareServer(); System.setProperty("cassandra.superuser_setup_delay_ms", "0"); - embedded = new EmbeddedCassandraService(); - embedded.start(); + embedded = ServerTestUtils.startEmbeddedCassandraService(); executeWithCredentials( Arrays.asList(getCreateRoleCql(TEST_USER, true, false, false), diff --git a/test/unit/org/apache/cassandra/concurrent/DebuggableScheduledThreadPoolExecutorTest.java b/test/unit/org/apache/cassandra/concurrent/DebuggableScheduledThreadPoolExecutorTest.java index b37b014f00de..c719d6bca62a 100644 --- a/test/unit/org/apache/cassandra/concurrent/DebuggableScheduledThreadPoolExecutorTest.java +++ b/test/unit/org/apache/cassandra/concurrent/DebuggableScheduledThreadPoolExecutorTest.java @@ -26,10 +26,13 @@ import java.util.concurrent.TimeoutException; import java.util.concurrent.atomic.AtomicInteger; +import org.junit.AfterClass; import org.junit.BeforeClass; import org.junit.Test; import org.junit.Assert; + +import org.apache.cassandra.ServerTestUtils; import org.apache.cassandra.service.EmbeddedCassandraService; import org.apache.cassandra.service.StorageService; @@ -43,10 +46,14 @@ public class DebuggableScheduledThreadPoolExecutorTest @BeforeClass public static void startup() throws IOException { - //The DSTPE checks for if we are in the service shutdown hook so - //to test it we need to start C* internally. 
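For context on the comment being removed here: per that comment, the executor checks whether it is running inside the service shutdown hook, which is why the test has to start Cassandra internally rather than mock it. A rough sketch of that idea, under assumed semantics (using a JVM shutdown hook as a stand-in for the service hook) and not the actual DebuggableScheduledThreadPoolExecutor implementation:

    import java.util.concurrent.ScheduledThreadPoolExecutor;
    import java.util.concurrent.atomic.AtomicBoolean;

    class ShutdownAwareExecutorSketch extends ScheduledThreadPoolExecutor
    {
        private final AtomicBoolean inShutdownHook = new AtomicBoolean();

        ShutdownAwareExecutorSketch(int corePoolSize)
        {
            super(corePoolSize);
            // flip the flag once the JVM begins running shutdown hooks
            Runtime.getRuntime().addShutdownHook(new Thread(() -> inShutdownHook.set(true)));
        }

        @Override
        public void execute(Runnable task)
        {
            // drop work submitted via execute() quietly during shutdown
            // instead of failing with RejectedExecutionException
            if (inShutdownHook.get())
                return;
            super.execute(task);
        }
    }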
- service = new EmbeddedCassandraService(); - service.start(); + service = ServerTestUtils.startEmbeddedCassandraService(); + } + + @AfterClass + public static void tearDown() + { + if (service != null) + service.stop(); } @Test diff --git a/test/unit/org/apache/cassandra/cql3/BatchTests.java b/test/unit/org/apache/cassandra/cql3/BatchTests.java index 260db4eeed43..f7629e204abc 100644 --- a/test/unit/org/apache/cassandra/cql3/BatchTests.java +++ b/test/unit/org/apache/cassandra/cql3/BatchTests.java @@ -22,15 +22,18 @@ import com.datastax.driver.core.PreparedStatement; import com.datastax.driver.core.Session; import com.datastax.driver.core.exceptions.InvalidQueryException; +import org.apache.cassandra.ServerTestUtils; import org.apache.cassandra.config.DatabaseDescriptor; import org.apache.cassandra.exceptions.ConfigurationException; import org.apache.cassandra.service.EmbeddedCassandraService; + +import org.junit.AfterClass; import org.junit.BeforeClass; import org.junit.Test; import java.io.IOException; -public class BatchTests extends CQLTester +public class BatchTests { private static EmbeddedCassandraService cassandra; @@ -44,8 +47,7 @@ public class BatchTests extends CQLTester @BeforeClass() public static void setup() throws ConfigurationException, IOException { - cassandra = new EmbeddedCassandraService(); - cassandra.start(); + cassandra = ServerTestUtils.startEmbeddedCassandraService(); cluster = Cluster.builder().addContactPoint("127.0.0.1").withPort(DatabaseDescriptor.getNativeTransportPort()).build(); session = cluster.connect(); @@ -75,6 +77,15 @@ public static void setup() throws ConfigurationException, IOException clustering = session.prepare("insert into junit.clustering(id, clustering1, clustering2, clustering3, val) values(?,?,?,?,?)"); } + @AfterClass + public static void tearDown() + { + if (cluster != null) + cluster.close(); + if (cassandra != null) + cassandra.stop(); + } + @Test(expected = InvalidQueryException.class) public void testMixedInCounterBatch() { diff --git a/test/unit/org/apache/cassandra/cql3/CQLTester.java b/test/unit/org/apache/cassandra/cql3/CQLTester.java index 7c2eebb8c56a..f2734891c607 100644 --- a/test/unit/org/apache/cassandra/cql3/CQLTester.java +++ b/test/unit/org/apache/cassandra/cql3/CQLTester.java @@ -429,12 +429,14 @@ public void run() public static List buildNodetoolArgs(List args) { + int port = jmxPort == 0 ? Integer.getInteger("cassandra.jmx.local.port", 7199) : jmxPort; + String host = jmxHost == null ? "127.0.0.1" : jmxHost; List allArgs = new ArrayList<>(); allArgs.add("bin/nodetool"); allArgs.add("-p"); - allArgs.add(Integer.toString(jmxPort)); + allArgs.add(String.valueOf(port)); allArgs.add("-h"); - allArgs.add(jmxHost == null ? 
"127.0.0.1" : jmxHost); + allArgs.add(host); allArgs.addAll(args); return allArgs; } diff --git a/test/unit/org/apache/cassandra/cql3/PagingTest.java b/test/unit/org/apache/cassandra/cql3/PagingTest.java index 9a95e03210bb..a3387c4e20cf 100644 --- a/test/unit/org/apache/cassandra/cql3/PagingTest.java +++ b/test/unit/org/apache/cassandra/cql3/PagingTest.java @@ -29,6 +29,7 @@ import com.datastax.driver.core.Session; import com.datastax.driver.core.SimpleStatement; import com.datastax.driver.core.Statement; +import org.apache.cassandra.ServerTestUtils; import org.apache.cassandra.config.DatabaseDescriptor; import org.apache.cassandra.dht.Murmur3Partitioner.LongToken; @@ -51,13 +52,14 @@ public class PagingTest " WITH REPLICATION = { 'class' : 'SimpleStrategy', 'replication_factor' : 2 };"; private static final String dropKsStatement = "DROP KEYSPACE IF EXISTS " + KEYSPACE; + private static EmbeddedCassandraService cassandra; @BeforeClass public static void setup() throws Exception { System.setProperty("cassandra.config", "cassandra-murmur.yaml"); - EmbeddedCassandraService cassandra = new EmbeddedCassandraService(); - cassandra.start(); + + cassandra = ServerTestUtils.startEmbeddedCassandraService(); // Currently the native server start method return before the server is fully binded to the socket, so we need // to wait slightly before trying to connect to it. We should fix this but in the meantime using a sleep. @@ -75,7 +77,10 @@ public static void setup() throws Exception @AfterClass public static void tearDown() { - cluster.close(); + if (cluster != null) + cluster.close(); + if (cassandra != null) + cassandra.stop(); } /** diff --git a/test/unit/org/apache/cassandra/metrics/BatchMetricsTest.java b/test/unit/org/apache/cassandra/metrics/BatchMetricsTest.java index aa6ad5863659..b90f19a7fc51 100644 --- a/test/unit/org/apache/cassandra/metrics/BatchMetricsTest.java +++ b/test/unit/org/apache/cassandra/metrics/BatchMetricsTest.java @@ -21,6 +21,7 @@ import java.io.IOException; import java.util.concurrent.TimeUnit; +import org.junit.AfterClass; import org.junit.BeforeClass; import org.junit.Test; @@ -28,20 +29,20 @@ import com.datastax.driver.core.Cluster; import com.datastax.driver.core.PreparedStatement; import com.datastax.driver.core.Session; -import org.apache.cassandra.SchemaLoader; +import org.apache.cassandra.ServerTestUtils; import org.apache.cassandra.config.DatabaseDescriptor; import org.apache.cassandra.exceptions.ConfigurationException; -import org.apache.cassandra.schema.Schema; import org.apache.cassandra.service.EmbeddedCassandraService; import static org.apache.cassandra.cql3.statements.BatchStatement.metrics; -import static org.apache.cassandra.metrics.DecayingEstimatedHistogramReservoir.*; +import static org.apache.cassandra.metrics.DecayingEstimatedHistogramReservoir.EstimatedHistogramReservoirSnapshot; +import static org.apache.cassandra.metrics.DecayingEstimatedHistogramReservoir.Range; import static org.junit.Assert.assertEquals; import static org.quicktheories.QuickTheory.qt; import static org.quicktheories.generators.Generate.intArrays; import static org.quicktheories.generators.SourceDSL.integers; -public class BatchMetricsTest extends SchemaLoader +public class BatchMetricsTest { private static final int MAX_ROUNDS_TO_PERFORM = 3; private static final int MAX_DISTINCT_PARTITIONS = 128; @@ -62,13 +63,11 @@ public class BatchMetricsTest extends SchemaLoader @BeforeClass() public static void setup() throws ConfigurationException, IOException { - 
Schema.instance.clear(); - - cassandra = new EmbeddedCassandraService(); - cassandra.start(); - + DatabaseDescriptor.daemonInitialization(); DatabaseDescriptor.setWriteRpcTimeout(TimeUnit.SECONDS.toMillis(10)); + cassandra = ServerTestUtils.startEmbeddedCassandraService(); + cluster = Cluster.builder().addContactPoint("127.0.0.1").withPort(DatabaseDescriptor.getNativeTransportPort()).build(); session = cluster.connect(); @@ -81,6 +80,15 @@ public static void setup() throws ConfigurationException, IOException psCounter = session.prepare("UPDATE " + KEYSPACE + '.' + COUNTER_TABLE + " SET val = val + 1 WHERE id = ?;"); } + @AfterClass + public static void tearDown() + { + if (cluster != null) + cluster.close(); + if (cassandra != null) + cassandra.stop(); + } + private void executeLoggerBatch(BatchStatement.Type batchStatementType, int distinctPartitions, int statementsPerPartition) { BatchStatement batch = new BatchStatement(batchStatementType); diff --git a/test/unit/org/apache/cassandra/metrics/CQLMetricsTest.java b/test/unit/org/apache/cassandra/metrics/CQLMetricsTest.java index e14861ed3a6b..c41ded37db00 100644 --- a/test/unit/org/apache/cassandra/metrics/CQLMetricsTest.java +++ b/test/unit/org/apache/cassandra/metrics/CQLMetricsTest.java @@ -20,6 +20,7 @@ import java.io.IOException; +import org.junit.AfterClass; import org.junit.Assert; import org.junit.BeforeClass; import org.junit.Test; @@ -28,9 +29,8 @@ import com.datastax.driver.core.PreparedStatement; import com.datastax.driver.core.Session; import com.datastax.driver.core.exceptions.InvalidQueryException; -import org.apache.cassandra.SchemaLoader; +import org.apache.cassandra.ServerTestUtils; import org.apache.cassandra.config.DatabaseDescriptor; -import org.apache.cassandra.schema.Schema; import org.apache.cassandra.cql3.QueryProcessor; import org.apache.cassandra.exceptions.ConfigurationException; import org.apache.cassandra.service.EmbeddedCassandraService; @@ -39,18 +39,16 @@ import static org.junit.Assert.assertTrue; import static org.junit.Assert.fail; -public class CQLMetricsTest extends SchemaLoader +public class CQLMetricsTest { private static Cluster cluster; private static Session session; + private static EmbeddedCassandraService cassandra; @BeforeClass() public static void setup() throws ConfigurationException, IOException { - Schema.instance.clear(); - - EmbeddedCassandraService cassandra = new EmbeddedCassandraService(); - cassandra.start(); + cassandra = ServerTestUtils.startEmbeddedCassandraService(); cluster = Cluster.builder().addContactPoint("127.0.0.1").withPort(DatabaseDescriptor.getNativeTransportPort()).build(); session = cluster.connect(); @@ -59,6 +57,15 @@ public static void setup() throws ConfigurationException, IOException session.execute("CREATE TABLE IF NOT EXISTS junit.metricstest (id int PRIMARY KEY, val text);"); } + @AfterClass + public static void tearDown() + { + if (cluster != null) + cluster.close(); + if (cassandra != null) + cassandra.stop(); + } + @Test public void testConnectionWithUseDisabled() { diff --git a/test/unit/org/apache/cassandra/metrics/ClientRequestMetricsTest.java b/test/unit/org/apache/cassandra/metrics/ClientRequestMetricsTest.java index 2982bebec1d3..650bd9563707 100644 --- a/test/unit/org/apache/cassandra/metrics/ClientRequestMetricsTest.java +++ b/test/unit/org/apache/cassandra/metrics/ClientRequestMetricsTest.java @@ -29,16 +29,15 @@ import com.datastax.driver.core.Cluster; import com.datastax.driver.core.PreparedStatement; import com.datastax.driver.core.Session; 
-import org.apache.cassandra.SchemaLoader; +import org.apache.cassandra.ServerTestUtils; import org.apache.cassandra.config.DatabaseDescriptor; import org.apache.cassandra.exceptions.ConfigurationException; -import org.apache.cassandra.schema.Schema; import org.apache.cassandra.service.EmbeddedCassandraService; -import static com.datastax.driver.core.Cluster.*; +import static com.datastax.driver.core.Cluster.builder; import static org.junit.Assert.assertEquals; -public class ClientRequestMetricsTest extends SchemaLoader +public class ClientRequestMetricsTest { private static Cluster cluster; private static Session session; @@ -54,13 +53,12 @@ public class ClientRequestMetricsTest extends SchemaLoader private static final ClientRequestMetrics readMetrics = ClientRequestsMetricsHolder.readMetrics; private static final ClientWriteRequestMetrics writeMetrics = ClientRequestsMetricsHolder.writeMetrics; + private static EmbeddedCassandraService cassandra; + @BeforeClass public static void setup() throws ConfigurationException, IOException { - Schema.instance.clear(); - - EmbeddedCassandraService cassandra = new EmbeddedCassandraService(); - cassandra.start(); + cassandra = ServerTestUtils.startEmbeddedCassandraService(); cluster = builder().addContactPoint("127.0.0.1").withPort(DatabaseDescriptor.getNativeTransportPort()).build(); session = cluster.connect(); @@ -74,11 +72,14 @@ public static void setup() throws ConfigurationException, IOException readPS = session.prepare("SELECT * FROM " + KEYSPACE + '.' + TABLE + " WHERE id=?;"); readRangePS = session.prepare("SELECT * FROM " + KEYSPACE + '.' + TABLE + " WHERE id=? AND ord>=? AND ord <= ?;"); } - + @AfterClass - public static void teardown() + public static void tearDown() { - cluster.close(); + if (cluster != null) + cluster.close(); + if (cassandra != null) + cassandra.stop(); } @Test diff --git a/test/unit/org/apache/cassandra/metrics/KeyspaceMetricsTest.java b/test/unit/org/apache/cassandra/metrics/KeyspaceMetricsTest.java index e941a84b39d1..7c00da581944 100644 --- a/test/unit/org/apache/cassandra/metrics/KeyspaceMetricsTest.java +++ b/test/unit/org/apache/cassandra/metrics/KeyspaceMetricsTest.java @@ -18,40 +18,37 @@ package org.apache.cassandra.metrics; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertTrue; - import java.io.IOException; import java.util.function.Supplier; import java.util.stream.Collectors; import java.util.stream.Stream; -import org.apache.cassandra.SchemaLoader; -import org.apache.cassandra.config.DatabaseDescriptor; -import org.apache.cassandra.exceptions.ConfigurationException; -import org.apache.cassandra.schema.Schema; -import org.apache.cassandra.service.EmbeddedCassandraService; - import org.junit.AfterClass; import org.junit.BeforeClass; import org.junit.Test; import com.datastax.driver.core.Cluster; import com.datastax.driver.core.Session; +import org.apache.cassandra.ServerTestUtils; +import org.apache.cassandra.config.DatabaseDescriptor; +import org.apache.cassandra.exceptions.ConfigurationException; +import org.apache.cassandra.service.EmbeddedCassandraService; -public class KeyspaceMetricsTest extends SchemaLoader +import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertTrue; + +public class KeyspaceMetricsTest { private static Session session; + private static Cluster cluster; + private static EmbeddedCassandraService cassandra; @BeforeClass public static void setup() throws ConfigurationException, IOException { - Schema.instance.clear(); - - 
EmbeddedCassandraService cassandra = new EmbeddedCassandraService(); - cassandra.start(); + cassandra = ServerTestUtils.startEmbeddedCassandraService(); - Cluster cluster = Cluster.builder().addContactPoint("127.0.0.1").withPort(DatabaseDescriptor.getNativeTransportPort()).build(); + cluster = Cluster.builder().addContactPoint("127.0.0.1").withPort(DatabaseDescriptor.getNativeTransportPort()).build(); session = cluster.connect(); } @@ -73,10 +70,13 @@ public void testMetricsCleanupOnDrop() // no metrics after drop assertEquals(metrics.get().collect(Collectors.joining(",")), 0, metrics.get().count()); } - + @AfterClass - public static void teardown() + public static void tearDown() { - session.close(); + if (cluster != null) + cluster.close(); + if (cassandra != null) + cassandra.stop(); } } diff --git a/test/unit/org/apache/cassandra/metrics/TableMetricsTest.java b/test/unit/org/apache/cassandra/metrics/TableMetricsTest.java index 1e8175eb34aa..4c9de7720792 100644 --- a/test/unit/org/apache/cassandra/metrics/TableMetricsTest.java +++ b/test/unit/org/apache/cassandra/metrics/TableMetricsTest.java @@ -31,17 +31,16 @@ import com.datastax.driver.core.Cluster; import com.datastax.driver.core.PreparedStatement; import com.datastax.driver.core.Session; -import org.apache.cassandra.SchemaLoader; +import org.apache.cassandra.ServerTestUtils; import org.apache.cassandra.config.DatabaseDescriptor; import org.apache.cassandra.db.ColumnFamilyStore; import org.apache.cassandra.exceptions.ConfigurationException; -import org.apache.cassandra.schema.Schema; import org.apache.cassandra.service.EmbeddedCassandraService; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertTrue; -public class TableMetricsTest extends SchemaLoader +public class TableMetricsTest { private static Session session; @@ -49,15 +48,15 @@ public class TableMetricsTest extends SchemaLoader private static final String TABLE = "tablemetricstest"; private static final String COUNTER_TABLE = "tablemetricscountertest"; + private static EmbeddedCassandraService cassandra; + private static Cluster cluster; + @BeforeClass public static void setup() throws ConfigurationException, IOException { - Schema.instance.clear(); - - EmbeddedCassandraService cassandra = new EmbeddedCassandraService(); - cassandra.start(); + cassandra = ServerTestUtils.startEmbeddedCassandraService(); - Cluster cluster = Cluster.builder().addContactPoint("127.0.0.1").withPort(DatabaseDescriptor.getNativeTransportPort()).build(); + cluster = Cluster.builder().addContactPoint("127.0.0.1").withPort(DatabaseDescriptor.getNativeTransportPort()).build(); session = cluster.connect(); session.execute(String.format("CREATE KEYSPACE IF NOT EXISTS %s WITH replication = { 'class' : 'SimpleStrategy', 'replication_factor' : 1 };", KEYSPACE)); @@ -276,9 +275,13 @@ public void testViewMetricsCleanupOnDrop() assertEquals(metrics.get().collect(Collectors.joining(",")), 0, metrics.get().count()); } + @AfterClass - public static void teardown() + public static void tearDown() { - session.close(); + if (cluster != null) + cluster.close(); + if (cassandra != null) + cassandra.stop(); } } diff --git a/test/unit/org/apache/cassandra/schema/SchemaTest.java b/test/unit/org/apache/cassandra/schema/SchemaTest.java index eb2cf044c33a..4185536e68da 100644 --- a/test/unit/org/apache/cassandra/schema/SchemaTest.java +++ b/test/unit/org/apache/cassandra/schema/SchemaTest.java @@ -25,37 +25,33 @@ import org.junit.BeforeClass; import org.junit.Test; -import 
org.apache.cassandra.SchemaLoader; +import org.apache.cassandra.ServerTestUtils; import org.apache.cassandra.config.DatabaseDescriptor; import org.apache.cassandra.db.Keyspace; import org.apache.cassandra.db.Mutation; -import org.apache.cassandra.db.commitlog.CommitLog; import org.apache.cassandra.gms.Gossiper; import org.apache.cassandra.utils.FBUtilities; -import static org.junit.Assert.assertNotNull; import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertNotNull; import static org.junit.Assert.assertNull; public class SchemaTest { @BeforeClass - public static void setupDatabaseDescriptor() + public static void setup() { DatabaseDescriptor.daemonInitialization(); + ServerTestUtils.prepareServer(); + Schema.instance.loadFromDisk(); } @Test public void testTransKsMigration() throws IOException { - CommitLog.instance.start(); - SchemaLoader.cleanupAndLeaveDirs(); - Schema.instance.loadFromDisk(); assertEquals(0, Schema.instance.getNonSystemKeyspaces().size()); Gossiper.instance.start((int) (System.currentTimeMillis() / 1000)); - Keyspace.setInitialized(); - try { // add a few. @@ -82,6 +78,33 @@ public void testTransKsMigration() throws IOException } } + @Test + public void testKeyspaceCreationWhenNotInitialized() { + Keyspace.unsetInitialized(); + try + { + SchemaTestUtil.addOrUpdateKeyspace(KeyspaceMetadata.create("test", KeyspaceParams.simple(1)), true); + assertNotNull(Schema.instance.getKeyspaceMetadata("test")); + assertNull(Schema.instance.getKeyspaceInstance("test")); + + SchemaTestUtil.dropKeyspaceIfExist("test", true); + assertNull(Schema.instance.getKeyspaceMetadata("test")); + assertNull(Schema.instance.getKeyspaceInstance("test")); + } + finally + { + Keyspace.setInitialized(); + } + + SchemaTestUtil.addOrUpdateKeyspace(KeyspaceMetadata.create("test", KeyspaceParams.simple(1)), true); + assertNotNull(Schema.instance.getKeyspaceMetadata("test")); + assertNotNull(Schema.instance.getKeyspaceInstance("test")); + + SchemaTestUtil.dropKeyspaceIfExist("test", true); + assertNull(Schema.instance.getKeyspaceMetadata("test")); + assertNull(Schema.instance.getKeyspaceInstance("test")); + } + private void saveKeyspaces() { Collection mutations = Arrays.asList(SchemaKeyspace.makeCreateKeyspaceMutation(KeyspaceMetadata.create("ks0", KeyspaceParams.simple(3)), FBUtilities.timestampMicros()).build(), diff --git a/test/unit/org/apache/cassandra/tools/nodetool/ClientStatsTest.java b/test/unit/org/apache/cassandra/tools/nodetool/ClientStatsTest.java index c8a1ae9d1386..5975f66b03b0 100644 --- a/test/unit/org/apache/cassandra/tools/nodetool/ClientStatsTest.java +++ b/test/unit/org/apache/cassandra/tools/nodetool/ClientStatsTest.java @@ -18,95 +18,124 @@ package org.apache.cassandra.tools.nodetool; +import java.net.InetAddress; + +import org.junit.After; +import org.junit.AfterClass; import org.junit.Before; import org.junit.BeforeClass; import org.junit.Test; - import org.slf4j.LoggerFactory; import ch.qos.logback.classic.Level; import ch.qos.logback.classic.Logger; import ch.qos.logback.classic.spi.ILoggingEvent; import ch.qos.logback.core.read.ListAppender; - +import com.datastax.driver.core.Cluster; import com.datastax.driver.core.ResultSet; +import com.datastax.driver.core.Session; +import org.apache.cassandra.ServerTestUtils; +import org.apache.cassandra.config.DatabaseDescriptor; import org.apache.cassandra.cql3.CQLTester; -import org.apache.cassandra.service.CassandraDaemon; +import org.apache.cassandra.service.EmbeddedCassandraService; import 
org.apache.cassandra.service.StorageService; import org.apache.cassandra.tools.ToolRunner; -import static org.assertj.core.api.Assertions.assertThat; import org.assertj.core.groups.Tuple; -public class ClientStatsTest extends CQLTester +import static org.assertj.core.api.Assertions.assertThat; + +public class ClientStatsTest { + private static Cluster cluster; + private Session session; + + private static EmbeddedCassandraService cassandra; + @BeforeClass public static void setup() throws Throwable { - CassandraDaemon daemon = new CassandraDaemon(); - requireNetwork(); - startJMXServer(); - daemon.activate(); - daemon.startNativeTransport(); - StorageService.instance.registerDaemon(daemon); + // Since we run EmbeddedCassandraServer, we need to manually associate JMX address; otherwise it won't start + int jmxPort = CQLTester.getAutomaticallyAllocatedPort(InetAddress.getLoopbackAddress()); + System.setProperty("cassandra.jmx.local.port", String.valueOf(jmxPort)); + + cassandra = ServerTestUtils.startEmbeddedCassandraService(); + cluster = Cluster.builder().addContactPoint("127.0.0.1").withPort(DatabaseDescriptor.getNativeTransportPort()).build(); } @Before public void config() throws Throwable { - ResultSet result = executeNet("select release_version from system.local"); + session = cluster.connect(); + ResultSet result = session.execute("select release_version from system.local"); } - + + @After + public void afterTest() + { + if (session != null) + session.close(); + } + + @AfterClass + public static void tearDown() + { + if (cluster != null) + cluster.close(); + if (cassandra != null) + cassandra.stop(); + } + @Test public void testClientStatsHelp() { ToolRunner.ToolResult tool = ToolRunner.invokeNodetool("help", "clientstats"); tool.assertOnCleanExit(); - - String help = "NAME\n" + - " nodetool clientstats - Print information about connected clients\n" + - "\n" + - "SYNOPSIS\n" + - " nodetool [(-h | --host )] [(-p | --port )]\n" + - " [(-pp | --print-port)] [(-pw | --password )]\n" + - " [(-pwf | --password-file )]\n" + - " [(-u | --username )] clientstats [--all]\n" + + + String help = "NAME\n" + + " nodetool clientstats - Print information about connected clients\n" + + "\n" + + "SYNOPSIS\n" + + " nodetool [(-h | --host )] [(-p | --port )]\n" + + " [(-pp | --print-port)] [(-pw | --password )]\n" + + " [(-pwf | --password-file )]\n" + + " [(-u | --username )] clientstats [--all]\n" + " [--by-protocol] [--clear-history] [--client-options]\n" + - "\n" + - "OPTIONS\n" + - " --all\n" + - " Lists all connections\n" + - "\n" + - " --by-protocol\n" + - " Lists most recent client connections by protocol version\n" + - "\n" + - " --clear-history\n" + - " Clear the history of connected clients\n" + + "\n" + + "OPTIONS\n" + + " --all\n" + + " Lists all connections\n" + + "\n" + + " --by-protocol\n" + + " Lists most recent client connections by protocol version\n" + + "\n" + + " --clear-history\n" + + " Clear the history of connected clients\n" + "\n" + " --client-options\n" + " Lists all connections and the client options\n" + - "\n" + - " -h , --host \n" + - " Node hostname or ip address\n" + - "\n" + - " -p , --port \n" + - " Remote jmx agent port number\n" + - "\n" + - " -pp, --print-port\n" + - " Operate in 4.0 mode with hosts disambiguated by port number\n" + - "\n" + - " -pw , --password \n" + - " Remote jmx agent password\n" + - "\n" + - " -pwf , --password-file \n" + - " Path to the JMX password file\n" + - "\n" + - " -u , --username \n" + - " Remote jmx agent username\n" + - "\n" + - 
"\n"; + "\n" + + " -h , --host \n" + + " Node hostname or ip address\n" + + "\n" + + " -p , --port \n" + + " Remote jmx agent port number\n" + + "\n" + + " -pp, --print-port\n" + + " Operate in 4.0 mode with hosts disambiguated by port number\n" + + "\n" + + " -pw , --password \n" + + " Remote jmx agent password\n" + + "\n" + + " -pwf , --password-file \n" + + " Path to the JMX password file\n" + + "\n" + + " -u , --username \n" + + " Remote jmx agent username\n" + + "\n" + + "\n"; assertThat(tool.getStdout()).isEqualTo(help); } - + @Test public void testClientStats() { @@ -117,7 +146,7 @@ public void testClientStats() assertThat(stdout).contains("User Connections"); assertThat(stdout).contains("anonymous 2"); } - + @Test public void testClientStatsByProtocol() { @@ -128,7 +157,7 @@ public void testClientStatsByProtocol() assertThat(stdout).contains("Protocol-Version IP-Address Last-Seen"); assertThat(stdout).containsPattern("[0-9]/v[0-9] +/127.0.0.1 [a-zA-Z]{3} [0-9]+, [0-9]{4} [0-9]{2}:[0-9]{2}:[0-9]{2}"); } - + @Test public void testClientStatsAll() { @@ -157,7 +186,7 @@ public void testClientStatsClientOptions() assertThat(stdout).contains("User Connections"); assertThat(stdout).contains("anonymous 2"); } - + @Test public void testClientStatsClearHistory() { @@ -166,13 +195,13 @@ public void testClientStatsClearHistory() ssLogger.addAppender(listAppender); listAppender.start(); - + ToolRunner.ToolResult tool = ToolRunner.invokeNodetool("clientstats", "--clear-history"); tool.assertOnCleanExit(); String stdout = tool.getStdout(); assertThat(stdout).contains("Clearing connection history"); assertThat(listAppender.list) - .extracting(ILoggingEvent::getMessage, ILoggingEvent::getLevel) - .contains(Tuple.tuple("Cleared connection history", Level.INFO)); + .extracting(ILoggingEvent::getMessage, ILoggingEvent::getLevel) + .contains(Tuple.tuple("Cleared connection history", Level.INFO)); } } diff --git a/test/unit/org/apache/cassandra/transport/CQLUserAuditTest.java b/test/unit/org/apache/cassandra/transport/CQLUserAuditTest.java index 1c4e41d77456..120de2a3360c 100644 --- a/test/unit/org/apache/cassandra/transport/CQLUserAuditTest.java +++ b/test/unit/org/apache/cassandra/transport/CQLUserAuditTest.java @@ -38,13 +38,13 @@ import com.datastax.driver.core.PreparedStatement; import com.datastax.driver.core.Session; import com.datastax.driver.core.exceptions.AuthenticationException; +import org.apache.cassandra.ServerTestUtils; import org.apache.cassandra.audit.AuditEvent; import org.apache.cassandra.audit.AuditLogEntryType; import org.apache.cassandra.audit.AuditLogManager; import org.apache.cassandra.config.DatabaseDescriptor; import org.apache.cassandra.config.OverrideConfigurationLoader; import org.apache.cassandra.config.ParameterizedClass; -import org.apache.cassandra.cql3.CQLTester; import org.apache.cassandra.diag.DiagnosticEventService; import org.apache.cassandra.locator.InetAddressAndPort; import org.apache.cassandra.service.EmbeddedCassandraService; @@ -69,11 +69,10 @@ public static void setup() throws Exception config.audit_logging_options.enabled = true; config.audit_logging_options.logger = new ParameterizedClass("DiagnosticEventAuditLogger", null); }); - CQLTester.prepareServer(); System.setProperty("cassandra.superuser_setup_delay_ms", "0"); - embedded = new EmbeddedCassandraService(); - embedded.start(); + + embedded = ServerTestUtils.startEmbeddedCassandraService(); executeAs(Arrays.asList("CREATE ROLE testuser WITH LOGIN = true AND SUPERUSER = false AND PASSWORD = 'foo'", 
"CREATE ROLE testuser_nologin WITH LOGIN = false AND SUPERUSER = false AND PASSWORD = 'foo'", From c8c8635a4c902ef051a46845919a5430f8d71e3f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andr=C3=A9s=20de=20la=20Pe=C3=B1a?= Date: Fri, 17 Jun 2022 11:45:43 +0100 Subject: [PATCH 040/159] Add ability to read the TTLs and write times of the elements of a collection and/or UDT MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit patch by Andrés de la Peña; reviewed by Yifan Cai and Benjamin Lerer for CASSANDRA-8877 --- CHANGES.txt | 1 + NEWS.txt | 6 +- doc/cql3/CQL.textile | 6 +- doc/modules/cassandra/pages/cql/dml.adoc | 10 +- src/antlr/Parser.g | 12 +- .../selection/AggregateFunctionSelector.java | 6 +- .../cql3/selection/ColumnTimestamps.java | 394 +++++++ .../cql3/selection/ElementsSelector.java | 77 +- .../cql3/selection/FieldSelector.java | 20 +- .../cql3/selection/ListSelector.java | 4 +- .../cassandra/cql3/selection/MapSelector.java | 7 +- .../cql3/selection/ResultSetBuilder.java | 34 +- .../cql3/selection/RowTimestamps.java | 105 ++ .../selection/ScalarFunctionSelector.java | 4 +- .../cassandra/cql3/selection/Selectable.java | 34 +- .../cassandra/cql3/selection/Selection.java | 38 +- .../cassandra/cql3/selection/Selector.java | 202 ++-- .../cassandra/cql3/selection/SetSelector.java | 4 +- .../cql3/selection/SimpleSelector.java | 20 +- .../cql3/selection/TermSelector.java | 2 +- .../cql3/selection/TupleSelector.java | 4 +- .../cql3/selection/UserTypeSelector.java | 4 +- .../selection/WritetimeOrTTLSelector.java | 94 +- .../cql3/statements/SelectStatement.java | 42 +- .../aggregation/AggregationSpecification.java | 34 +- .../cassandra/db/aggregation/GroupMaker.java | 20 +- .../apache/cassandra/db/rows/BTreeRow.java | 7 +- .../org/apache/cassandra/db/rows/Row.java | 8 + .../serializers/AbstractMapSerializer.java | 213 ++++ .../serializers/CollectionSerializer.java | 33 + .../cassandra/serializers/ListSerializer.java | 31 +- .../cassandra/serializers/MapSerializer.java | 89 +- .../cassandra/serializers/SetSerializer.java | 88 +- .../upgrade/MixedModeWritetimeOrTTLTest.java | 119 ++ .../selection/SelectorSerializationTest.java | 8 +- .../validation/entities/CollectionsTest.java | 12 - .../entities/WritetimeOrTTLTest.java | 1022 ++++++++++++++++- .../db/aggregation/GroupMakerTest.java | 3 +- .../serializers/MapSerializerTest.java | 125 ++ .../serializers/SetSerializerTest.java | 121 ++ 40 files changed, 2597 insertions(+), 466 deletions(-) create mode 100644 src/java/org/apache/cassandra/cql3/selection/ColumnTimestamps.java create mode 100644 src/java/org/apache/cassandra/cql3/selection/RowTimestamps.java create mode 100644 src/java/org/apache/cassandra/serializers/AbstractMapSerializer.java create mode 100644 test/distributed/org/apache/cassandra/distributed/upgrade/MixedModeWritetimeOrTTLTest.java create mode 100644 test/unit/org/apache/cassandra/serializers/MapSerializerTest.java create mode 100644 test/unit/org/apache/cassandra/serializers/SetSerializerTest.java diff --git a/CHANGES.txt b/CHANGES.txt index 204544b76070..934efbd8ce8c 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -1,4 +1,5 @@ 4.2 + * Add ability to read the TTLs and write times of the elements of a collection and/or UDT (CASSANDRA-8877) * Removed Python < 2.7 support from formatting.py (CASSANDRA-17694) * Cleanup pylint issues with pylexotron.py (CASSANDRA-17779) * NPE bug in streaming checking if SSTable is being repaired (CASSANDRA-17801) diff --git a/NEWS.txt b/NEWS.txt index 
8d078c078530..96ad4b9ac0c7 100644
--- a/NEWS.txt
+++ b/NEWS.txt
@@ -63,9 +63,11 @@ New features
 If this is set to false, streaming will be considerably faster however it's possible that, in extreme situations
 (losing > quorum # nodes in a replica set), you may have data in your SSTables that never makes it to the CDC log.
 The default is true/enabled. The configuration can be altered via JMX.
+ - Added support for reading the write times and TTLs of the elements of collections and UDTs, regardless of being
+ frozen or not. The CQL functions writetime, maxwritetime and ttl can now be applied to entire collections/UDTs,
+ single collection/UDT elements and slices of collection/UDT elements.
 - Added a new CQL function, maxwritetime. It shows the largest unix timestamp that the data was written, similar to
- its sibling CQL function, writetime. Unlike writetime, maxwritetime can be applied to multi-cell data types, e.g.
- non-frozen collections and UDT, and returns the largest timestamp. One should not to use it when upgrading to 4.2.
+ its sibling CQL function, writetime.
 - New Guardrails added:
 - Whether ALTER TABLE commands are allowed to mutate columns
 - Whether SimpleStrategy is allowed on keyspace creation or alteration
diff --git a/doc/cql3/CQL.textile b/doc/cql3/CQL.textile
index 5fef1a9a2745..fde597052606 100644
--- a/doc/cql3/CQL.textile
+++ b/doc/cql3/CQL.textile
@@ -1082,10 +1082,10 @@ bc(syntax).. ::= |
- | WRITETIME '(' ')'
- | MAXWRITETIME '(' ')'
+ | WRITETIME '(' ')'
+ | MAXWRITETIME '(' ')'
 | COUNT '(' '*' ')'
- | TTL '(' ')'
+ | TTL '(' ')'
 | CAST '(' AS ')'
 | '(' ( (',' )*)? ')'
 | '.'
diff --git a/doc/modules/cassandra/pages/cql/dml.adoc b/doc/modules/cassandra/pages/cql/dml.adoc
index af9dbba2d90d..513dc1d1e564 100644
--- a/doc/modules/cassandra/pages/cql/dml.adoc
+++ b/doc/modules/cassandra/pages/cql/dml.adoc
@@ -79,17 +79,17 @@ You must use the orignal column name instead.
 Selection supports three special functions that aren't allowed anywhere
 else: `WRITETIME`, `MAXWRITETIME` and `TTL`.
-All functions take only one argument, a column name.
+All functions take only one argument, a column name. If the column is a collection or UDT, it's possible to add element
+selectors, such as `WRITETIME(phones[2..4])` or `WRITETIME(user.name)`.
 These functions retrieve meta-information that is stored internally for each column:
-* `WRITETIME` stores the timestamp of the value of the column. Note that this function cannot be applied to non-frozen collection
-and UDT.
+* `WRITETIME` stores the timestamp of the value of the column.
 * `MAXWRITETIME` stores the largest timestamp of the value of the column. For non-collection and non-UDT columns, `MAXWRITETIME`
 is equivalent to `WRITETIME`. In the other cases, it returns the largest timestamp of the values in the column.
 * `TTL` stores the remaining time to live (in seconds) for the value of the column if it is set to expire; otherwise the value is `null`.
-The `WRITETIME` and `TTL` functions can't be used on multi-cell columns such as non-frozen
-collections or non-frozen user-defined types.
+The `WRITETIME` and `TTL` functions can be used on multi-cell columns such as non-frozen collections or non-frozen
+user-defined types. In that case, the functions will return the list of timestamps or TTLs for each selected cell.
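+
+As an illustrative sketch of the selectors described above (the `users` table, its `id` key and its non-frozen
+`phones` map column are hypothetical examples, not defined by this patch):
+
+[source,cql]
+----
+-- writetime of a single element of a non-frozen map column
+SELECT WRITETIME(phones[2]) FROM users WHERE id = 1;
+
+-- writetimes of a slice of elements, returned as a list of timestamps
+SELECT WRITETIME(phones[2..4]) FROM users WHERE id = 1;
+
+-- remaining TTLs of every cell of the collection
+SELECT TTL(phones) FROM users WHERE id = 1;
+----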
[[where-clause]] === The `WHERE` clause diff --git a/src/antlr/Parser.g b/src/antlr/Parser.g index 2643e0a6b567..b349e1652755 100644 --- a/src/antlr/Parser.g +++ b/src/antlr/Parser.g @@ -414,12 +414,12 @@ simpleUnaliasedSelector returns [Selectable.Raw s] ; selectionFunction returns [Selectable.Raw s] - : K_COUNT '(' '\*' ')' { $s = Selectable.WithFunction.Raw.newCountRowsFunction(); } - | K_MAXWRITETIME '(' c=sident ')' { $s = new Selectable.WritetimeOrTTL.Raw(c, Selectable.WritetimeOrTTL.Kind.MAX_WRITE_TIME); } - | K_WRITETIME '(' c=sident ')' { $s = new Selectable.WritetimeOrTTL.Raw(c, Selectable.WritetimeOrTTL.Kind.WRITE_TIME); } - | K_TTL '(' c=sident ')' { $s = new Selectable.WritetimeOrTTL.Raw(c, Selectable.WritetimeOrTTL.Kind.TTL); } - | K_CAST '(' sn=unaliasedSelector K_AS t=native_type ')' {$s = new Selectable.WithCast.Raw(sn, t);} - | f=functionName args=selectionFunctionArgs { $s = new Selectable.WithFunction.Raw(f, args); } + : K_COUNT '(' '\*' ')' { $s = Selectable.WithFunction.Raw.newCountRowsFunction(); } + | K_MAXWRITETIME '(' c=sident m=selectorModifier[c] ')' { $s = new Selectable.WritetimeOrTTL.Raw(c, m, Selectable.WritetimeOrTTL.Kind.MAX_WRITE_TIME); } + | K_WRITETIME '(' c=sident m=selectorModifier[c] ')' { $s = new Selectable.WritetimeOrTTL.Raw(c, m, Selectable.WritetimeOrTTL.Kind.WRITE_TIME); } + | K_TTL '(' c=sident m=selectorModifier[c] ')' { $s = new Selectable.WritetimeOrTTL.Raw(c, m, Selectable.WritetimeOrTTL.Kind.TTL); } + | K_CAST '(' sn=unaliasedSelector K_AS t=native_type ')' { $s = new Selectable.WithCast.Raw(sn, t);} + | f=functionName args=selectionFunctionArgs { $s = new Selectable.WithFunction.Raw(f, args); } ; selectionLiteral returns [Term.Raw value] diff --git a/src/java/org/apache/cassandra/cql3/selection/AggregateFunctionSelector.java b/src/java/org/apache/cassandra/cql3/selection/AggregateFunctionSelector.java index 8c4f74567b79..8d21c1e9ef09 100644 --- a/src/java/org/apache/cassandra/cql3/selection/AggregateFunctionSelector.java +++ b/src/java/org/apache/cassandra/cql3/selection/AggregateFunctionSelector.java @@ -43,13 +43,15 @@ public boolean isAggregate() return true; } - public void addInput(ProtocolVersion protocolVersion, InputRow input) + public void addInput(InputRow input) { + ProtocolVersion protocolVersion = input.getProtocolVersion(); + // Aggregation of aggregation is not supported for (int i = 0, m = argSelectors.size(); i < m; i++) { Selector s = argSelectors.get(i); - s.addInput(protocolVersion, input); + s.addInput(input); setArg(i, s.getOutput(protocolVersion)); s.reset(); } diff --git a/src/java/org/apache/cassandra/cql3/selection/ColumnTimestamps.java b/src/java/org/apache/cassandra/cql3/selection/ColumnTimestamps.java new file mode 100644 index 000000000000..6a08076d5b22 --- /dev/null +++ b/src/java/org/apache/cassandra/cql3/selection/ColumnTimestamps.java @@ -0,0 +1,394 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.cql3.selection; + +import java.nio.ByteBuffer; +import java.util.ArrayList; +import java.util.Collections; +import java.util.List; + +import com.google.common.collect.BoundType; +import com.google.common.collect.Range; + +import org.apache.cassandra.db.marshal.AbstractType; +import org.apache.cassandra.db.marshal.UserType; +import org.apache.cassandra.db.rows.Cell; +import org.apache.cassandra.serializers.CollectionSerializer; +import org.apache.cassandra.transport.ProtocolVersion; +import org.apache.cassandra.utils.ByteBufferUtil; + +/** + * Represents a list of timestamps associated to a CQL column. Those timestamps can either be writetimes or TTLs, + * according to {@link TimestampsType}. + */ +abstract class ColumnTimestamps +{ + /** + * The timestamps type. + */ + protected final TimestampsType type; + + protected ColumnTimestamps(TimestampsType type) + { + this.type = type; + } + + /** + * @return the timestamps type + */ + public TimestampsType type() + { + return type; + } + + /** + * Retrieves the timestamps at the specified position. + * + * @param index the timestamps position + * @return the timestamps at the specified position or a {@link #NO_TIMESTAMP} + */ + public abstract ColumnTimestamps get(int index); + + public abstract ColumnTimestamps max(); + + /** + * Returns a view of the portion of the timestamps within the specified range. + * + * @param range the indexes range + * @return a view of the specified range within this {@link ColumnTimestamps} + */ + public abstract ColumnTimestamps slice(Range range); + + /** + * Converts the timestamps into their serialized form. + * + * @param protocolVersion the protocol version to use for the serialization + * @return the serialized timestamps + */ + public abstract ByteBuffer toByteBuffer(ProtocolVersion protocolVersion); + + /** + * Appends an empty timestamp at the end of this list. + */ + public abstract void addNoTimestamp(); + + /** + * Appends the timestamp of the specified cell at the end of this list. + */ + public abstract void addTimestampFrom(Cell cell, int nowInSecond); + + /** + * Creates a new {@link ColumnTimestamps} instance for the specified column type. + * + * @param timestampType the timestamps type + * @param columnType the column type + * @return a {@link ColumnTimestamps} instance for the specified column type + */ + static ColumnTimestamps newTimestamps(TimestampsType timestampType, AbstractType columnType) + { + if (!columnType.isMultiCell()) + return new SingleTimestamps(timestampType); + + // For UserType we know that the size will not change, so we can initialize the array with the proper capacity. + if (columnType instanceof UserType) + return new MultipleTimestamps(timestampType, ((UserType) columnType).size()); + + return new MultipleTimestamps(timestampType, 0); + } + + /** + * The type of represented timestamps. 
+ */
+ public enum TimestampsType
+ {
+ WRITETIMES
+ {
+ @Override
+ long getTimestamp(Cell cell, int nowInSecond)
+ {
+ return cell.timestamp();
+ }
+
+ @Override
+ long defaultValue()
+ {
+ return Long.MIN_VALUE;
+ }
+
+ @Override
+ ByteBuffer toByteBuffer(long timestamp)
+ {
+ return timestamp == defaultValue() ? null : ByteBufferUtil.bytes(timestamp);
+ }
+ },
+ TTLS
+ {
+ @Override
+ long getTimestamp(Cell cell, int nowInSecond)
+ {
+ if (!cell.isExpiring())
+ return defaultValue();
+
+ int remaining = cell.localDeletionTime() - nowInSecond;
+ return remaining >= 0 ? remaining : defaultValue();
+ }
+
+ @Override
+ long defaultValue()
+ {
+ return -1;
+ }
+
+ @Override
+ ByteBuffer toByteBuffer(long timestamp)
+ {
+ return timestamp == defaultValue() ? null : ByteBufferUtil.bytes((int) timestamp);
+ }
+ };
+
+ /**
+ * Extracts the timestamp from the specified cell.
+ *
+ * @param cell the cell
+ * @param nowInSecond the query timestamp in seconds
+ * @return the timestamp corresponding to this type
+ */
+ abstract long getTimestamp(Cell cell, int nowInSecond);
+
+ /**
+ * Returns the value to use when there is no timestamp.
+ *
+ * @return the value to use when there is no timestamp
+ */
+ abstract long defaultValue();
+
+ /**
+ * Serializes the specified timestamp.
+ *
+ * @param timestamp the timestamp to serialize
+ * @return the bytes corresponding to the specified timestamp
+ */
+ abstract ByteBuffer toByteBuffer(long timestamp);
+ }
+
+ /**
+ * A {@link ColumnTimestamps} that doesn't contain any timestamps.
+ */
+ static final ColumnTimestamps NO_TIMESTAMP = new ColumnTimestamps(null)
+ {
+ @Override
+ public ColumnTimestamps get(int index)
+ {
+ return this;
+ }
+
+ @Override
+ public ColumnTimestamps max()
+ {
+ return this;
+ }
+
+ @Override
+ public ColumnTimestamps slice(Range range)
+ {
+ return this;
+ }
+
+ @Override
+ public ByteBuffer toByteBuffer(ProtocolVersion protocolVersion)
+ {
+ return null;
+ }
+
+ @Override
+ public void addNoTimestamp()
+ {
+ throw new UnsupportedOperationException();
+ }
+
+ @Override
+ public void addTimestampFrom(Cell cell, int nowInSecond)
+ {
+ throw new UnsupportedOperationException();
+ }
+
+ @Override
+ public String toString()
+ {
+ return "no timestamp";
+ }
+ };
+
+ /**
+ * A {@link ColumnTimestamps} that can contain a single timestamp (for columns that aren't multicell).
+ */
+ private static class SingleTimestamps extends ColumnTimestamps
+ {
+ protected long timestamp;
+
+ public SingleTimestamps(TimestampsType type)
+ {
+ this(type, type.defaultValue());
+ }
+
+ public SingleTimestamps(TimestampsType type, long timestamp)
+ {
+ super(type);
+ this.timestamp = timestamp;
+ }
+
+ @Override
+ public void addNoTimestamp()
+ {
+ timestamp = type.defaultValue();
+ }
+
+ @Override
+ public void addTimestampFrom(Cell cell, int nowInSecond)
+ {
+ timestamp = type.getTimestamp(cell, nowInSecond);
+ }
+
+ @Override
+ public ColumnTimestamps get(int index)
+ {
+ // If this method is called it means that it is an element selection on a frozen collection/UDT,
+ // so we can safely return this Timestamps as all the elements also share that timestamp
+ return this;
+ }
+
+ @Override
+ public ColumnTimestamps max()
+ {
+ return this;
+ }
+
+ @Override
+ public ColumnTimestamps slice(Range range)
+ {
+ return range.isEmpty() ? NO_TIMESTAMP : this;
+ }
+
+ @Override
+ public ByteBuffer toByteBuffer(ProtocolVersion protocolVersion)
+ {
+ return timestamp == type.defaultValue() ?
null : type.toByteBuffer(timestamp); + } + + @Override + public String toString() + { + return type + ": " + timestamp; + } + } + + /** + * A {@link ColumnTimestamps} that can contain multiple timestamps (for unfrozen collections or UDTs). + */ + private static final class MultipleTimestamps extends ColumnTimestamps + { + private final List timestamps; + + public MultipleTimestamps(TimestampsType type, int initialCapacity) + { + this(type, new ArrayList<>(initialCapacity)); + } + + public MultipleTimestamps(TimestampsType type, List timestamps) + { + super(type); + this.timestamps = timestamps; + } + + @Override + public void addNoTimestamp() + { + timestamps.add(type.defaultValue()); + } + + @Override + public void addTimestampFrom(Cell cell, int nowInSecond) + { + timestamps.add(type.getTimestamp(cell, nowInSecond)); + } + + @Override + public ColumnTimestamps get(int index) + { + return timestamps.isEmpty() + ? NO_TIMESTAMP + : new SingleTimestamps(type, timestamps.get(index)); + } + + @Override + public ColumnTimestamps max() + { + return timestamps.isEmpty() + ? NO_TIMESTAMP + : new SingleTimestamps(type, Collections.max(timestamps)); + } + + @Override + public ColumnTimestamps slice(Range range) + { + if (range.isEmpty()) + return NO_TIMESTAMP; + + // Prepare the "from" argument for the call to List#sublist below. That argument is always specified and + // inclusive, whereas the range lower bound can be open, closed or not specified. + int from = 0; + if (range.hasLowerBound()) + { + from = range.lowerBoundType() == BoundType.CLOSED + ? range.lowerEndpoint() // inclusive range lower bound, inclusive "from" is the same list position + : range.lowerEndpoint() + 1; // exclusive range lower bound, inclusive "from" is the next list position + } + + // Prepare the "to" argument for the call to List#sublist below. That argument is always specified and + // exclusive, whereas the range upper bound can be open, closed or not specified. + int to = timestamps.size(); + if (range.hasUpperBound()) + { + to = range.upperBoundType() == BoundType.CLOSED + ? range.upperEndpoint() + 1 // inclusive range upper bound, exclusive "to" is the next list position + : range.upperEndpoint(); // exclusive range upper bound, exclusive "to" is the same list position + } + + return new MultipleTimestamps(type, timestamps.subList(from, to)); + } + + @Override + public ByteBuffer toByteBuffer(ProtocolVersion protocolVersion) + { + if (timestamps.isEmpty()) + return null; + + List buffers = new ArrayList<>(timestamps.size()); + timestamps.forEach(timestamp -> buffers.add(type.toByteBuffer(timestamp))); + + return CollectionSerializer.pack(buffers, timestamps.size(), protocolVersion); + } + + @Override + public String toString() + { + return type + ": " + timestamps; + } + } +} diff --git a/src/java/org/apache/cassandra/cql3/selection/ElementsSelector.java b/src/java/org/apache/cassandra/cql3/selection/ElementsSelector.java index 851d78574c8c..930fd83f9451 100644 --- a/src/java/org/apache/cassandra/cql3/selection/ElementsSelector.java +++ b/src/java/org/apache/cassandra/cql3/selection/ElementsSelector.java @@ -21,6 +21,7 @@ import java.nio.ByteBuffer; import com.google.common.base.Objects; +import com.google.common.collect.Range; import org.apache.cassandra.cql3.ColumnSpecification; import org.apache.cassandra.cql3.QueryOptions; @@ -43,12 +44,19 @@ */ abstract class ElementsSelector extends Selector { + /** + * An empty collection is composed of an int size of zero. 
+ */ + private static final ByteBuffer EMPTY_FROZEN_COLLECTION = ByteBufferUtil.bytes(0); + protected final Selector selected; + protected final CollectionType type; protected ElementsSelector(Kind kind,Selector selected) { super(kind); this.selected = selected; + this.type = (CollectionType) selected.getType(); } private static boolean isUnset(ByteBuffer bb) @@ -226,9 +234,14 @@ public ByteBuffer getOutput(ProtocolVersion protocolVersion) protected abstract ByteBuffer extractSelection(ByteBuffer collection); - public void addInput(ProtocolVersion protocolVersion, InputRow input) + public void addInput(InputRow input) { - selected.addInput(protocolVersion, input); + selected.addInput(input); + } + + protected Range getIndexRange(ByteBuffer output, ByteBuffer fromKey, ByteBuffer toKey) + { + return type.getSerializer().getIndexesRangeFromSerialized(output, fromKey, toKey, keyType(type)); } public void reset() @@ -255,14 +268,12 @@ protected Selector deserialize(DataInputPlus in, int version, TableMetadata meta } }; - private final CollectionType type; private final ByteBuffer key; private ElementSelector(Selector selected, ByteBuffer key) { super(Kind.ELEMENT_SELECTOR, selected); assert selected.getType() instanceof MapType || selected.getType() instanceof SetType : "this shouldn't have passed validation in Selectable"; - this.type = (CollectionType) selected.getType(); this.key = key; } @@ -284,6 +295,31 @@ protected ByteBuffer extractSelection(ByteBuffer collection) return type.getSerializer().getSerializedValue(collection, key, keyType(type)); } + protected int getElementIndex(ProtocolVersion protocolVersion, ByteBuffer key) + { + ByteBuffer output = selected.getOutput(protocolVersion); + return output == null ? -1 : type.getSerializer().getIndexFromSerialized(output, key, keyType(type)); + } + + @Override + protected ColumnTimestamps getWritetimes(ProtocolVersion protocolVersion) + { + return getElementTimestamps(protocolVersion, selected.getWritetimes(protocolVersion)); + } + + @Override + protected ColumnTimestamps getTTLs(ProtocolVersion protocolVersion) + { + return getElementTimestamps(protocolVersion, selected.getTTLs(protocolVersion)); + } + + private ColumnTimestamps getElementTimestamps(ProtocolVersion protocolVersion, + ColumnTimestamps timestamps) + { + int index = getElementIndex(protocolVersion, key); + return index == -1 ? 
ColumnTimestamps.NO_TIMESTAMP : timestamps.get(index);
+ }
+
 public AbstractType getType()
 {
 return valueType(type);
@@ -348,8 +384,6 @@ protected Selector deserialize(DataInputPlus in, int version, TableMetadata meta
 }
 };
- private final CollectionType type;
-
 // Note that neither from nor to can be null, but they can both be ByteBufferUtil.UNSET_BYTE_BUFFER to represent no particular bound
 private final ByteBuffer from;
 private final ByteBuffer to;
@@ -359,7 +393,6 @@ private SliceSelector(Selector selected, ByteBuffer from, ByteBuffer to)
 super(Kind.SLICE_SELECTOR, selected);
 assert selected.getType() instanceof MapType || selected.getType() instanceof SetType : "this shouldn't have passed validation in Selectable";
 assert from != null && to != null : "We can have unset buffers, but not nulls";
- this.type = (CollectionType) selected.getType();
 this.from = from;
 this.to = to;
 }
@@ -382,6 +415,36 @@ protected ByteBuffer extractSelection(ByteBuffer collection)
 return type.getSerializer().getSliceFromSerialized(collection, from, to, type.nameComparator(), type.isFrozenCollection());
 }
+ @Override
+ protected ColumnTimestamps getWritetimes(ProtocolVersion protocolVersion)
+ {
+ return getTimestampsSlice(protocolVersion, selected.getWritetimes(protocolVersion));
+ }
+
+ @Override
+ protected ColumnTimestamps getTTLs(ProtocolVersion protocolVersion)
+ {
+ return getTimestampsSlice(protocolVersion, selected.getTTLs(protocolVersion));
+ }
+
+ protected ColumnTimestamps getTimestampsSlice(ProtocolVersion protocolVersion, ColumnTimestamps timestamps)
+ {
+ ByteBuffer output = selected.getOutput(protocolVersion);
+ return (output == null || isCollectionEmpty(output))
+ ? ColumnTimestamps.NO_TIMESTAMP
+ : timestamps.slice(getIndexRange(output, from, to));
+ }
+
+ /**
+ * Checks if the collection is empty. Only a frozen collection can be empty.
+ * @param output the serialized collection
+ * @return {@code true} if the collection is empty, {@code false} otherwise.
+ */
+ private boolean isCollectionEmpty(ByteBuffer output)
+ {
+ return EMPTY_FROZEN_COLLECTION.equals(output);
+ }
+
 public AbstractType getType()
 {
 return type;
diff --git a/src/java/org/apache/cassandra/cql3/selection/FieldSelector.java b/src/java/org/apache/cassandra/cql3/selection/FieldSelector.java
index ddcc868cf3a1..043c3ee0c9d2 100644
--- a/src/java/org/apache/cassandra/cql3/selection/FieldSelector.java
+++ b/src/java/org/apache/cassandra/cql3/selection/FieldSelector.java
@@ -99,9 +99,9 @@ public void addFetchedColumns(ColumnFilter.Builder builder)
 selected.addFetchedColumns(builder);
 }
- public void addInput(ProtocolVersion protocolVersion, InputRow input)
+ public void addInput(InputRow input)
 {
- selected.addInput(protocolVersion, input);
+ selected.addInput(input);
 }
 public ByteBuffer getOutput(ProtocolVersion protocolVersion)
@@ -113,6 +113,22 @@ public ByteBuffer getOutput(ProtocolVersion protocolVersion)
 return field < buffers.length ? buffers[field] : null;
 }
+ @Override
+ protected ColumnTimestamps getWritetimes(ProtocolVersion protocolVersion)
+ {
+ return getOutput(protocolVersion) == null
+ ? ColumnTimestamps.NO_TIMESTAMP
+ : selected.getWritetimes(protocolVersion).get(field);
+ }
+
+ @Override
+ protected ColumnTimestamps getTTLs(ProtocolVersion protocolVersion)
+ {
+ return getOutput(protocolVersion) == null
+ ?
ColumnTimestamps.NO_TIMESTAMP + : selected.getTTLs(protocolVersion).get(field); + } + public AbstractType getType() { return type.fieldType(field); diff --git a/src/java/org/apache/cassandra/cql3/selection/ListSelector.java b/src/java/org/apache/cassandra/cql3/selection/ListSelector.java index 9136ab2c5b2f..a99822ec0c52 100644 --- a/src/java/org/apache/cassandra/cql3/selection/ListSelector.java +++ b/src/java/org/apache/cassandra/cql3/selection/ListSelector.java @@ -89,10 +89,10 @@ public void addFetchedColumns(Builder builder) elements.get(i).addFetchedColumns(builder); } - public void addInput(ProtocolVersion protocolVersion, InputRow input) + public void addInput(InputRow input) { for (int i = 0, m = elements.size(); i < m; i++) - elements.get(i).addInput(protocolVersion, input); + elements.get(i).addInput(input); } public ByteBuffer getOutput(ProtocolVersion protocolVersion) diff --git a/src/java/org/apache/cassandra/cql3/selection/MapSelector.java b/src/java/org/apache/cassandra/cql3/selection/MapSelector.java index a43d7bfccbf9..41344a6eae7b 100644 --- a/src/java/org/apache/cassandra/cql3/selection/MapSelector.java +++ b/src/java/org/apache/cassandra/cql3/selection/MapSelector.java @@ -35,7 +35,6 @@ import org.apache.cassandra.db.filter.ColumnFilter.Builder; import org.apache.cassandra.db.marshal.AbstractType; import org.apache.cassandra.db.marshal.MapType; -import org.apache.cassandra.exceptions.InvalidRequestException; import org.apache.cassandra.io.util.DataInputPlus; import org.apache.cassandra.io.util.DataOutputPlus; import org.apache.cassandra.schema.ColumnMetadata; @@ -194,13 +193,13 @@ public void addFetchedColumns(Builder builder) } } - public void addInput(ProtocolVersion protocolVersion, InputRow input) + public void addInput(InputRow input) { for (int i = 0, m = elements.size(); i < m; i++) { Pair pair = elements.get(i); - pair.left.addInput(protocolVersion, input); - pair.right.addInput(protocolVersion, input); + pair.left.addInput(input); + pair.right.addInput(input); } } diff --git a/src/java/org/apache/cassandra/cql3/selection/ResultSetBuilder.java b/src/java/org/apache/cassandra/cql3/selection/ResultSetBuilder.java index 3e652dfeb4ec..37e877261619 100644 --- a/src/java/org/apache/cassandra/cql3/selection/ResultSetBuilder.java +++ b/src/java/org/apache/cassandra/cql3/selection/ResultSetBuilder.java @@ -31,7 +31,10 @@ import org.apache.cassandra.db.DecoratedKey; import org.apache.cassandra.db.aggregation.GroupMaker; import org.apache.cassandra.db.rows.Cell; +import org.apache.cassandra.db.rows.ColumnData; import org.apache.cassandra.db.rows.ComplexColumnData; +import org.apache.cassandra.schema.ColumnMetadata; +import org.apache.cassandra.transport.ProtocolVersion; public final class ResultSetBuilder { @@ -102,30 +105,14 @@ public void add(ByteBuffer v) inputRow.add(v); } - public void add(ComplexColumnData complexColumnData, Function>, ByteBuffer> serializer) + public void add(Cell c, int nowInSec) { - if (complexColumnData == null) - { - inputRow.add(null); - return; - } - - long timestamp = -1L; - if (selectors.collectMaxTimestamps()) - { - Iterator> cells = complexColumnData.iterator(); - while (cells.hasNext()) - { - timestamp = Math.max(timestamp, cells.next().timestamp()); - } - } - - inputRow.add(serializer.apply(complexColumnData.iterator()), timestamp, -1); + inputRow.add(c, nowInSec); } - public void add(Cell c, int nowInSec) + public void add(ColumnData columnData, int nowInSec) { - inputRow.add(c, nowInSec); + inputRow.add(columnData, nowInSec); } /** @@ 
-134,7 +121,7 @@ public void add(Cell c, int nowInSec)
 * @param partitionKey the partition key of the new row
 * @param clustering the clustering of the new row
 */
- public void newRow(DecoratedKey partitionKey, Clustering clustering)
+ public void newRow(ProtocolVersion protocolVersion, DecoratedKey partitionKey, Clustering clustering, List columns)
 {
 // The groupMaker needs to be called for each row
 boolean isNewAggregate = groupMaker == null || groupMaker.isNewGroup(partitionKey, clustering);
@@ -154,7 +141,10 @@ public void newRow(DecoratedKey partitionKey, Clustering clustering)
 }
 else
 {
- inputRow = new Selector.InputRow(selectors.numberOfFetchedColumns(), selectors.collectTimestamps(), selectors.collectTTLs());
+ inputRow = new Selector.InputRow(protocolVersion,
+ columns,
+ selectors.collectWritetimes(),
+ selectors.collectTTLs());
 }
 }
diff --git a/src/java/org/apache/cassandra/cql3/selection/RowTimestamps.java b/src/java/org/apache/cassandra/cql3/selection/RowTimestamps.java
new file mode 100644
index 000000000000..24d23ee1dc7b
--- /dev/null
+++ b/src/java/org/apache/cassandra/cql3/selection/RowTimestamps.java
@@ -0,0 +1,105 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.cql3.selection;
+
+import java.util.List;
+
+import org.apache.cassandra.db.rows.Cell;
+import org.apache.cassandra.schema.ColumnMetadata;
+
+/**
+ * The {@link ColumnTimestamps} associated to the given set of columns of a row.
+ */
+interface RowTimestamps
+{
+ /**
+ * Adds an empty timestamp for the specified column.
+ *
+ * @param index the column index
+ */
+ void addNoTimestamp(int index);
+
+ /**
+ * Adds the timestamp of the specified cell.
+ *
+ * @param index the column index
+ * @param cell the cell to get the timestamp from
+ * @param nowInSec the query timestamp in seconds
+ */
+ void addTimestamp(int index, Cell cell, int nowInSec);
+
+ /**
+ * Returns the timestamp of the specified column.
+ *
+ * @param index the column index
+ * @return the timestamp of the specified column
+ */
+ ColumnTimestamps get(int index);
+
+ /**
+ * A {@code RowTimestamps} that does nothing.
+ */ + RowTimestamps NOOP_ROW_TIMESTAMPS = new RowTimestamps() + { + @Override + public void addNoTimestamp(int index) + { + } + + @Override + public void addTimestamp(int index, Cell cell, int nowInSec) + { + } + + @Override + public ColumnTimestamps get(int index) + { + return ColumnTimestamps.NO_TIMESTAMP; + } + }; + + static RowTimestamps newInstance(ColumnTimestamps.TimestampsType type, List columns) + { + final ColumnTimestamps[] array = new ColumnTimestamps[columns.size()]; + + for (int i = 0, m = columns.size(); i < m; i++) + array[i] = ColumnTimestamps.newTimestamps(type, columns.get(i).type); + + return new RowTimestamps() + { + @Override + public void addNoTimestamp(int index) + { + array[index].addNoTimestamp(); + } + + @Override + public void addTimestamp(int index, Cell cell, int nowInSec) + { + array[index].addTimestampFrom(cell, nowInSec); + } + + @Override + public ColumnTimestamps get(int index) + { + return array[index]; + } + }; + } +} diff --git a/src/java/org/apache/cassandra/cql3/selection/ScalarFunctionSelector.java b/src/java/org/apache/cassandra/cql3/selection/ScalarFunctionSelector.java index ed2a1406be59..5e9711a9e7f9 100644 --- a/src/java/org/apache/cassandra/cql3/selection/ScalarFunctionSelector.java +++ b/src/java/org/apache/cassandra/cql3/selection/ScalarFunctionSelector.java @@ -37,12 +37,12 @@ protected Selector newFunctionSelector(Function function, List argSele } }; - public void addInput(ProtocolVersion protocolVersion, InputRow input) + public void addInput(InputRow input) { for (int i = 0, m = argSelectors.size(); i < m; i++) { Selector s = argSelectors.get(i); - s.addInput(protocolVersion, input); + s.addInput(input); } } diff --git a/src/java/org/apache/cassandra/cql3/selection/Selectable.java b/src/java/org/apache/cassandra/cql3/selection/Selectable.java index 4210f9cf3cf1..88e70f8ce3ac 100644 --- a/src/java/org/apache/cassandra/cql3/selection/Selectable.java +++ b/src/java/org/apache/cassandra/cql3/selection/Selectable.java @@ -243,25 +243,27 @@ public static Kind fromOrdinal(int ordinal) this.returnType = returnType; } - public boolean allowedForMultiCell() + public boolean aggregatesMultiCell() { return this == MAX_WRITE_TIME; } } public final ColumnMetadata column; + public final Selectable selectable; public final Kind kind; - public WritetimeOrTTL(ColumnMetadata column, Kind kind) + public WritetimeOrTTL(ColumnMetadata column, Selectable selectable, Kind kind) { this.column = column; + this.selectable = selectable; this.kind = kind; } @Override public String toString() { - return kind.name + "(" + column.name + ")"; + return kind.name + "(" + selectable + ")"; } public Selector.Factory newSelectorFactory(TableMetadata table, @@ -275,42 +277,42 @@ public Selector.Factory newSelectorFactory(TableMetadata table, kind.name, column.name)); - // only maxwritetime is allowed for multicell types - if (column.type.isMultiCell() && !kind.allowedForMultiCell()) - throw new InvalidRequestException(String.format("Cannot use selection function %s on non-frozen %s %s", - kind.name, - column.type.isCollection() ? 
"collection" : "UDT", - column.name)); + Selector.Factory factory = selectable.newSelectorFactory(table, expectedType, defs, boundNames); + boolean isMultiCell = factory.getColumnSpecification(table).type.isMultiCell(); - return WritetimeOrTTLSelector.newFactory(column, addAndGetIndex(column, defs), kind); + return WritetimeOrTTLSelector.newFactory(factory, addAndGetIndex(column, defs), kind, isMultiCell); } + @Override public AbstractType getExactTypeIfKnown(String keyspace) { - return kind.returnType; + AbstractType type = kind.returnType; + return column.type.isMultiCell() && !kind.aggregatesMultiCell() ? ListType.getInstance(type, false) : type; } @Override public boolean selectColumns(Predicate predicate) { - return predicate.test(column); + return selectable.selectColumns(predicate); } public static class Raw implements Selectable.Raw { - private final Selectable.RawIdentifier id; + private final Selectable.RawIdentifier column; + private final Selectable.Raw selected; private final Kind kind; - public Raw(Selectable.RawIdentifier id, Kind kind) + public Raw(Selectable.RawIdentifier column, Selectable.Raw selected, Kind kind) { - this.id = id; + this.column = column; + this.selected = selected; this.kind = kind; } @Override public WritetimeOrTTL prepare(TableMetadata table) { - return new WritetimeOrTTL(id.prepare(table), kind); + return new WritetimeOrTTL(column.prepare(table), selected.prepare(table), kind); } } } diff --git a/src/java/org/apache/cassandra/cql3/selection/Selection.java b/src/java/org/apache/cassandra/cql3/selection/Selection.java index 2f41192c373e..866497a16287 100644 --- a/src/java/org/apache/cassandra/cql3/selection/Selection.java +++ b/src/java/org/apache/cassandra/cql3/selection/Selection.java @@ -371,16 +371,10 @@ public static interface Selectors public boolean collectTTLs(); /** - * Checks if one of the selectors collect timestamps. - * @return {@code true} if one of the selectors collect timestamps, {@code false} otherwise. + * Checks if one of the selectors collects write timestamps. + * @return {@code true} if one of the selectors collects write timestamps, {@code false} otherwise. */ - public boolean collectTimestamps(); - - /** - * Checks if one of the selectors collects maxTimestamps. - * @return {@code true} if one of the selectors collect maxTimestamps, {@code false} otherwise. - */ - public boolean collectMaxTimestamps(); + public boolean collectWritetimes(); /** * Adds the current row of the specified ResultSetBuilder. 
@@ -507,16 +501,11 @@ public boolean collectTTLs() } @Override - public boolean collectTimestamps() + public boolean collectWritetimes() { return false; } - @Override - public boolean collectMaxTimestamps() { - return false; - } - @Override public ColumnFilter getColumnFilter() { @@ -531,8 +520,8 @@ public ColumnFilter getColumnFilter() private static class SelectionWithProcessing extends Selection { private final SelectorFactories factories; - private final boolean collectTimestamps; - private final boolean collectMaxTimestamps; + private final boolean collectWritetimes; + private final boolean collectMaxWritetimes; private final boolean collectTTLs; public SelectionWithProcessing(TableMetadata table, @@ -552,8 +541,8 @@ public SelectionWithProcessing(TableMetadata table, isJson); this.factories = factories; - this.collectTimestamps = factories.containsWritetimeSelectorFactory(); - this.collectMaxTimestamps = factories.containsMaxWritetimeSelectorFactory(); + this.collectWritetimes = factories.containsWritetimeSelectorFactory(); + this.collectMaxWritetimes = factories.containsMaxWritetimeSelectorFactory(); this.collectTTLs = factories.containsTTLSelectorFactory(); for (ColumnMetadata orderingColumn : orderingColumns) @@ -614,7 +603,7 @@ public List getOutputRow() public void addInputRow(InputRow input) { for (Selector selector : selectors) - selector.addInput(options.getProtocolVersion(), input); + selector.addInput(input); } @Override @@ -630,14 +619,9 @@ public boolean collectTTLs() } @Override - public boolean collectTimestamps() + public boolean collectWritetimes() { - return collectTimestamps || collectMaxTimestamps; - } - - @Override - public boolean collectMaxTimestamps() { - return collectMaxTimestamps; + return collectWritetimes || collectMaxWritetimes; } @Override diff --git a/src/java/org/apache/cassandra/cql3/selection/Selector.java b/src/java/org/apache/cassandra/cql3/selection/Selector.java index 8226c2d5d26a..2d52e569cf7f 100644 --- a/src/java/org/apache/cassandra/cql3/selection/Selector.java +++ b/src/java/org/apache/cassandra/cql3/selection/Selector.java @@ -20,26 +20,30 @@ import java.io.IOException; import java.nio.ByteBuffer; import java.util.Arrays; -import java.util.Iterator; import java.util.List; -import org.apache.cassandra.db.rows.ComplexColumnData; -import org.apache.cassandra.schema.CQLTypeParser; -import org.apache.cassandra.schema.KeyspaceMetadata; -import org.apache.cassandra.schema.Schema; -import org.apache.cassandra.schema.TableMetadata; import org.apache.cassandra.cql3.ColumnIdentifier; import org.apache.cassandra.cql3.ColumnSpecification; import org.apache.cassandra.cql3.QueryOptions; import org.apache.cassandra.cql3.functions.Function; +import org.apache.cassandra.cql3.selection.ColumnTimestamps.TimestampsType; import org.apache.cassandra.db.TypeSizes; import org.apache.cassandra.db.context.CounterContext; import org.apache.cassandra.db.filter.ColumnFilter; import org.apache.cassandra.db.marshal.AbstractType; +import org.apache.cassandra.db.marshal.CollectionType; +import org.apache.cassandra.db.marshal.UserType; import org.apache.cassandra.db.rows.Cell; +import org.apache.cassandra.db.rows.ColumnData; +import org.apache.cassandra.db.rows.ComplexColumnData; import org.apache.cassandra.exceptions.InvalidRequestException; import org.apache.cassandra.io.util.DataInputPlus; import org.apache.cassandra.io.util.DataOutputPlus; +import org.apache.cassandra.schema.CQLTypeParser; +import org.apache.cassandra.schema.ColumnMetadata; +import 
org.apache.cassandra.schema.KeyspaceMetadata; +import org.apache.cassandra.schema.Schema; +import org.apache.cassandra.schema.TableMetadata; import org.apache.cassandra.transport.ProtocolVersion; import org.apache.cassandra.utils.ByteBufferUtil; @@ -178,7 +182,6 @@ public boolean isTTLSelectorFactory() /** * Checks if this factory creates Selectors that simply return a column value. * - * @param index the column index * @return true if this factory creates Selectors that simply return a column value, * false otherwise. */ @@ -301,82 +304,135 @@ protected Selector(Kind kind) */ public static final class InputRow { + private final ProtocolVersion protocolVersion; + private final List columns; + private final boolean collectWritetimes; + private final boolean collectTTLs; + private ByteBuffer[] values; - private final long[] timestamps; - private final int[] ttls; + private RowTimestamps writetimes; + private RowTimestamps ttls; private int index; - public InputRow(int size, boolean collectTimestamps, boolean collectTTLs) + public InputRow(ProtocolVersion protocolVersion, List columns) { - this.values = new ByteBuffer[size]; + this(protocolVersion, columns, false, false); + } - if (collectTimestamps) - { - this.timestamps = new long[size]; - // We use MIN_VALUE to indicate no timestamp - Arrays.fill(timestamps, Long.MIN_VALUE); - } - else - { - timestamps = null; - } + public InputRow(ProtocolVersion protocolVersion, + List columns, + boolean collectWritetimes, + boolean collectTTLs) + { + this.protocolVersion = protocolVersion; + this.columns = columns; + this.collectWritetimes = collectWritetimes; + this.collectTTLs = collectTTLs; + + values = new ByteBuffer[columns.size()]; + writetimes = initTimestamps(TimestampsType.WRITETIMES, collectWritetimes, columns); + ttls = initTimestamps(TimestampsType.TTLS, collectTTLs, columns); + } - if (collectTTLs) - { - this.ttls = new int[size]; - // We use -1 to indicate no ttl - Arrays.fill(ttls, -1); - } - else - { - ttls = null; - } + private RowTimestamps initTimestamps(TimestampsType type, + boolean collectWritetimes, + List columns) + { + return collectWritetimes ? 
RowTimestamps.newInstance(type, columns) + : RowTimestamps.NOOP_ROW_TIMESTAMPS; } - public void add(ByteBuffer v) + public ProtocolVersion getProtocolVersion() { - add(v, Long.MIN_VALUE, -1); + return protocolVersion; } - public void add(ByteBuffer v, long timestamp, int ttl) + public void add(ByteBuffer v) { values[index] = v; - if (timestamps != null) - timestamps[index] = timestamp; - - if (ttls != null) - ttls[index] = ttl; - + if (v != null) + { + writetimes.addNoTimestamp(index); + ttls.addNoTimestamp(index); + } index++; } - public void add(Cell c, int nowInSec) + public void add(ColumnData columnData, int nowInSec) { - if (c == null) + ColumnMetadata column = columns.get(index); + if (columnData == null) { add(null); - return; } + else + { + if (column.isComplex()) + { + add((ComplexColumnData) columnData, nowInSec); + } + else + { + add((Cell) columnData, nowInSec); + } + } + } + private void add(Cell c, int nowInSec) + { values[index] = value(c); - - if (timestamps != null) - timestamps[index] = c.timestamp(); - - if (ttls != null) - ttls[index] = remainingTTL(c, nowInSec); - + writetimes.addTimestamp(index, c, nowInSec); + ttls.addTimestamp(index, c, nowInSec); index++; } - private int remainingTTL(Cell c, int nowInSec) + private void add(ComplexColumnData ccd, int nowInSec) { - if (!c.isExpiring()) - return -1; + AbstractType type = columns.get(index).type; + if (type.isCollection()) + { + values[index] = ((CollectionType) type).serializeForNativeProtocol(ccd.iterator(), protocolVersion); - int remaining = c.localDeletionTime() - nowInSec; - return remaining >= 0 ? remaining : -1; + for (Cell cell : ccd) + { + writetimes.addTimestamp(index, cell, nowInSec); + ttls.addTimestamp(index, cell, nowInSec); + } + } + else + { + UserType udt = (UserType) type; + int size = udt.size(); + + values[index] = udt.serializeForNativeProtocol(ccd.iterator(), protocolVersion); + + short fieldPosition = 0; + for (Cell cell : ccd) + { + // handle null fields that aren't at the end + short fieldPositionOfCell = ByteBufferUtil.toShort(cell.path().get(0)); + while (fieldPosition < fieldPositionOfCell) + { + fieldPosition++; + writetimes.addNoTimestamp(index); + ttls.addNoTimestamp(index); + } + + fieldPosition++; + writetimes.addTimestamp(index, cell, nowInSec); + ttls.addTimestamp(index, cell, nowInSec); + } + + // append nulls for missing cells + while (fieldPosition < size) + { + fieldPosition++; + writetimes.addNoTimestamp(index); + ttls.addNoTimestamp(index); + } + } + index++; } private ByteBuffer value(Cell c) @@ -408,35 +464,38 @@ public ByteBuffer getValue(int index) public void reset(boolean deep) { index = 0; + this.writetimes = initTimestamps(TimestampsType.WRITETIMES, collectWritetimes, columns); + this.ttls = initTimestamps(TimestampsType.TTLS, collectTTLs, columns); + if (deep) values = new ByteBuffer[values.length]; } /** - * Return the timestamp of the column with the specified index. + * Return the timestamp of the column component with the specified indexes. * - * @param index the column index - * @return the timestamp of the column with the specified index + * @return the timestamp of the cell with the specified indexes */ - public long getTimestamp(int index) + ColumnTimestamps getWritetimes(int columnIndex) { - return timestamps[index]; + return writetimes.get(columnIndex); } /** - * Return the ttl of the column with the specified index. + * Return the ttl of the column component with the specified column and cell indexes. 
* - * @param index the column index - * @return the ttl of the column with the specified index + * @param columnIndex the column index + * @return the ttl of the column with the specified indexes */ - public int getTtl(int index) + ColumnTimestamps getTtls(int columnIndex) { - return ttls[index]; + return ttls.get(columnIndex); } /** * Returns the column values as list. *

    The content of the list will be shared with the {@code InputRow} unless a deep reset has been done.

    + * * @return the column values as list. */ public List getValues() @@ -448,11 +507,10 @@ public List getValues() /** * Add the current value from the specified ResultSetBuilder. * - * @param protocolVersion protocol version used for serialization * @param input the input row * @throws InvalidRequestException if a problem occurs while adding the input row */ - public abstract void addInput(ProtocolVersion protocolVersion, InputRow input); + public abstract void addInput(InputRow input); /** * Returns the selector output. @@ -463,6 +521,16 @@ public List getValues() */ public abstract ByteBuffer getOutput(ProtocolVersion protocolVersion) throws InvalidRequestException; + protected ColumnTimestamps getWritetimes(ProtocolVersion protocolVersion) + { + throw new UnsupportedOperationException(); + } + + protected ColumnTimestamps getTTLs(ProtocolVersion protocolVersion) + { + throw new UnsupportedOperationException(); + } + /** * Returns the Selector output type. * diff --git a/src/java/org/apache/cassandra/cql3/selection/SetSelector.java b/src/java/org/apache/cassandra/cql3/selection/SetSelector.java index b54b2d4c4af7..cfe398be7ab8 100644 --- a/src/java/org/apache/cassandra/cql3/selection/SetSelector.java +++ b/src/java/org/apache/cassandra/cql3/selection/SetSelector.java @@ -91,10 +91,10 @@ public void addFetchedColumns(Builder builder) elements.get(i).addFetchedColumns(builder); } - public void addInput(ProtocolVersion protocolVersion, InputRow input) + public void addInput(InputRow input) { for (int i = 0, m = elements.size(); i < m; i++) - elements.get(i).addInput(protocolVersion, input); + elements.get(i).addInput(input); } public ByteBuffer getOutput(ProtocolVersion protocolVersion) diff --git a/src/java/org/apache/cassandra/cql3/selection/SimpleSelector.java b/src/java/org/apache/cassandra/cql3/selection/SimpleSelector.java index a6ae4460483b..b4e8404f3e38 100644 --- a/src/java/org/apache/cassandra/cql3/selection/SimpleSelector.java +++ b/src/java/org/apache/cassandra/cql3/selection/SimpleSelector.java @@ -118,6 +118,8 @@ public ColumnMetadata getColumn() public final ColumnMetadata column; private final int idx; private ByteBuffer current; + private ColumnTimestamps writetimes; + private ColumnTimestamps ttls; private boolean isSet; public static Factory newFactory(final ColumnMetadata def, final int idx) @@ -132,12 +134,14 @@ public void addFetchedColumns(Builder builder) } @Override - public void addInput(ProtocolVersion protocolVersion, InputRow input) throws InvalidRequestException + public void addInput(InputRow input) throws InvalidRequestException { if (!isSet) { isSet = true; current = input.getValue(idx); + writetimes = input.getWritetimes(idx); + ttls = input.getTtls(idx); } } @@ -147,11 +151,25 @@ public ByteBuffer getOutput(ProtocolVersion protocolVersion) return current; } + @Override + protected ColumnTimestamps getWritetimes(ProtocolVersion protocolVersion) + { + return writetimes; + } + + @Override + protected ColumnTimestamps getTTLs(ProtocolVersion protocolVersion) + { + return ttls; + } + @Override public void reset() { isSet = false; current = null; + writetimes = null; + ttls = null; } @Override diff --git a/src/java/org/apache/cassandra/cql3/selection/TermSelector.java b/src/java/org/apache/cassandra/cql3/selection/TermSelector.java index 6f0c844dd495..19a60ac92065 100644 --- a/src/java/org/apache/cassandra/cql3/selection/TermSelector.java +++ b/src/java/org/apache/cassandra/cql3/selection/TermSelector.java @@ -101,7 +101,7 @@ public void 
addFetchedColumns(ColumnFilter.Builder builder) { } - public void addInput(ProtocolVersion protocolVersion, InputRow input) + public void addInput(InputRow input) { } diff --git a/src/java/org/apache/cassandra/cql3/selection/TupleSelector.java b/src/java/org/apache/cassandra/cql3/selection/TupleSelector.java index 0c06bc2e2f3e..f1698724356c 100644 --- a/src/java/org/apache/cassandra/cql3/selection/TupleSelector.java +++ b/src/java/org/apache/cassandra/cql3/selection/TupleSelector.java @@ -89,10 +89,10 @@ public void addFetchedColumns(Builder builder) elements.get(i).addFetchedColumns(builder); } - public void addInput(ProtocolVersion protocolVersion, InputRow input) + public void addInput(InputRow input) { for (int i = 0, m = elements.size(); i < m; i++) - elements.get(i).addInput(protocolVersion, input); + elements.get(i).addInput(input); } public ByteBuffer getOutput(ProtocolVersion protocolVersion) throws InvalidRequestException diff --git a/src/java/org/apache/cassandra/cql3/selection/UserTypeSelector.java b/src/java/org/apache/cassandra/cql3/selection/UserTypeSelector.java index 8007467ec31a..ea9d4c0f1036 100644 --- a/src/java/org/apache/cassandra/cql3/selection/UserTypeSelector.java +++ b/src/java/org/apache/cassandra/cql3/selection/UserTypeSelector.java @@ -182,10 +182,10 @@ public void addFetchedColumns(ColumnFilter.Builder builder) field.addFetchedColumns(builder); } - public void addInput(ProtocolVersion protocolVersion, InputRow input) + public void addInput(InputRow input) { for (Selector field : fields.values()) - field.addInput(protocolVersion, input); + field.addInput(input); } public ByteBuffer getOutput(ProtocolVersion protocolVersion) diff --git a/src/java/org/apache/cassandra/cql3/selection/WritetimeOrTTLSelector.java b/src/java/org/apache/cassandra/cql3/selection/WritetimeOrTTLSelector.java index 29ebfbbdf6ad..9be0b45d6ff4 100644 --- a/src/java/org/apache/cassandra/cql3/selection/WritetimeOrTTLSelector.java +++ b/src/java/org/apache/cassandra/cql3/selection/WritetimeOrTTLSelector.java @@ -22,7 +22,7 @@ import com.google.common.base.Objects; -import org.apache.cassandra.schema.ColumnMetadata; +import org.apache.cassandra.db.marshal.ListType; import org.apache.cassandra.schema.TableMetadata; import org.apache.cassandra.cql3.QueryOptions; import org.apache.cassandra.cql3.ColumnSpecification; @@ -32,51 +32,57 @@ import org.apache.cassandra.io.util.DataInputPlus; import org.apache.cassandra.io.util.DataOutputPlus; import org.apache.cassandra.transport.ProtocolVersion; -import org.apache.cassandra.utils.ByteBufferUtil; final class WritetimeOrTTLSelector extends Selector { - protected static final SelectorDeserializer deserializer = new SelectorDeserializer() + static final SelectorDeserializer deserializer = new SelectorDeserializer() { + @Override protected Selector deserialize(DataInputPlus in, int version, TableMetadata metadata) throws IOException { - ByteBuffer columnName = ByteBufferUtil.readWithVIntLength(in); - ColumnMetadata column = metadata.getColumn(columnName); + Selector selected = serializer.deserialize(in, version, metadata); int idx = in.readInt(); int ordinal = in.readByte(); - Selectable.WritetimeOrTTL.Kind k = Selectable.WritetimeOrTTL.Kind.fromOrdinal(ordinal); - return new WritetimeOrTTLSelector(column, idx, k); + Selectable.WritetimeOrTTL.Kind kind = Selectable.WritetimeOrTTL.Kind.fromOrdinal(ordinal); + boolean isMultiCell = in.readBoolean(); + return new WritetimeOrTTLSelector(selected, idx, kind, isMultiCell); } }; - private final ColumnMetadata 
column; - private final int idx; + private final Selector selected; + private final int columnIndex; private final Selectable.WritetimeOrTTL.Kind kind; private ByteBuffer current; + private final boolean isMultiCell; private boolean isSet; - public static Factory newFactory(final ColumnMetadata def, final int idx, final Selectable.WritetimeOrTTL.Kind kind) + public static Factory newFactory(final Selector.Factory factory, final int columnIndex, final Selectable.WritetimeOrTTL.Kind kind, boolean isMultiCell) { return new Factory() { + @Override protected String getColumnName() { - return String.format("%s(%s)", kind.name, def.name.toString()); + return String.format("%s(%s)", kind.name, factory.getColumnName()); } + @Override protected AbstractType getReturnType() { - return kind.returnType; + AbstractType type = kind.returnType; + return isMultiCell && !kind.aggregatesMultiCell() ? ListType.getInstance(type, false) : type; } + @Override protected void addColumnMapping(SelectionColumnMapping mapping, ColumnSpecification resultsColumn) { - mapping.addMapping(resultsColumn, def); + factory.addColumnMapping(mapping, resultsColumn); } + @Override public Selector newInstance(QueryOptions options) { - return new WritetimeOrTTLSelector(def, idx, kind); + return new WritetimeOrTTLSelector(factory.newInstance(options), columnIndex, kind, isMultiCell); } @Override @@ -97,39 +103,46 @@ public boolean isMaxWritetimeSelectorFactory() return kind == Selectable.WritetimeOrTTL.Kind.MAX_WRITE_TIME; } + @Override public boolean areAllFetchedColumnsKnown() { return true; } + @Override public void addFetchedColumns(ColumnFilter.Builder builder) { - builder.add(def); + factory.addFetchedColumns(builder); } }; } public void addFetchedColumns(ColumnFilter.Builder builder) { - builder.add(column); + selected.addFetchedColumns(builder); } - public void addInput(ProtocolVersion protocolVersion, InputRow input) + public void addInput(InputRow input) { if (isSet) return; isSet = true; - if (kind == Selectable.WritetimeOrTTL.Kind.TTL) - { - int ttl = input.getTtl(idx); - current = ttl > 0 ? ByteBufferUtil.bytes(ttl) : null; - } - else + selected.addInput(input); + ProtocolVersion protocolVersion = input.getProtocolVersion(); + + switch (kind) { - long ts = input.getTimestamp(idx); - current = ts != Long.MIN_VALUE ? ByteBufferUtil.bytes(ts) : null; + case WRITE_TIME: + current = selected.getWritetimes(protocolVersion).toByteBuffer(protocolVersion); + break; + case MAX_WRITE_TIME: + current = selected.getWritetimes(protocolVersion).max().toByteBuffer(protocolVersion); + break; + case TTL: + current = selected.getTTLs(protocolVersion).toByteBuffer(protocolVersion); + break; } } @@ -140,27 +153,30 @@ public ByteBuffer getOutput(ProtocolVersion protocolVersion) public void reset() { + selected.reset(); isSet = false; current = null; } public AbstractType getType() { - return kind.returnType; + AbstractType type = kind.returnType; + return isMultiCell ? 
ListType.getInstance(type, false) : type; } @Override public String toString() { - return column.name.toString(); + return selected.toString(); } - private WritetimeOrTTLSelector(ColumnMetadata column, int idx, Selectable.WritetimeOrTTL.Kind kind) + private WritetimeOrTTLSelector(Selector selected, int idx, Selectable.WritetimeOrTTL.Kind kind, boolean isMultiCell) { super(Kind.WRITETIME_OR_TTL_SELECTOR); - this.column = column; - this.idx = idx; + this.selected = selected; + this.columnIndex = idx; this.kind = kind; + this.isMultiCell = isMultiCell; } @Override @@ -174,30 +190,30 @@ public boolean equals(Object o) WritetimeOrTTLSelector s = (WritetimeOrTTLSelector) o; - return Objects.equal(column, s.column) - && Objects.equal(idx, s.idx) - && kind == s.kind; + return Objects.equal(selected, s.selected) && kind == s.kind; } @Override public int hashCode() { - return Objects.hashCode(column, idx, kind); + return Objects.hashCode(selected, kind); } @Override protected int serializedSize(int version) { - return ByteBufferUtil.serializedSizeWithVIntLength(column.name.bytes) - + TypeSizes.sizeof(idx) - + TypeSizes.sizeofUnsignedVInt(kind.ordinal()); + return serializer.serializedSize(selected, version) + + TypeSizes.sizeof(columnIndex) + + TypeSizes.sizeofUnsignedVInt(kind.ordinal()) + + TypeSizes.sizeof(isMultiCell); } @Override protected void serialize(DataOutputPlus out, int version) throws IOException { - ByteBufferUtil.writeWithVIntLength(column.name.bytes, out); - out.writeInt(idx); + serializer.serialize(selected, out, version); + out.writeInt(columnIndex); out.writeByte(kind.ordinal()); + out.writeBoolean(isMultiCell); } } diff --git a/src/java/org/apache/cassandra/cql3/statements/SelectStatement.java b/src/java/org/apache/cassandra/cql3/statements/SelectStatement.java index 0d43313e532f..1b2e9371899b 100644 --- a/src/java/org/apache/cassandra/cql3/statements/SelectStatement.java +++ b/src/java/org/apache/cassandra/cql3/statements/SelectStatement.java @@ -52,12 +52,9 @@ import org.apache.cassandra.db.aggregation.AggregationSpecification; import org.apache.cassandra.db.aggregation.GroupMaker; import org.apache.cassandra.db.filter.*; -import org.apache.cassandra.db.marshal.CollectionType; import org.apache.cassandra.db.marshal.CompositeType; import org.apache.cassandra.db.marshal.Int32Type; -import org.apache.cassandra.db.marshal.UserType; import org.apache.cassandra.db.partitions.PartitionIterator; -import org.apache.cassandra.db.rows.ComplexColumnData; import org.apache.cassandra.db.rows.Row; import org.apache.cassandra.db.rows.RowIterator; import org.apache.cassandra.db.view.View; @@ -980,7 +977,7 @@ public void processPartition(RowIterator partition, QueryOptions options, Result { if (!staticRow.isEmpty() && restrictions.returnStaticContentOnPartitionWithNoRows()) { - result.newRow(partition.partitionKey(), staticRow.clustering()); + result.newRow(protocolVersion, partition.partitionKey(), staticRow.clustering(), selection.getColumns()); maybeFail(result, options); for (ColumnMetadata def : selection.getColumns()) { @@ -990,7 +987,7 @@ public void processPartition(RowIterator partition, QueryOptions options, Result result.add(keyComponents[def.position()]); break; case STATIC: - addValue(result, def, staticRow, nowInSec, protocolVersion); + result.add(partition.staticRow().getColumnData(def), nowInSec); break; default: result.add((ByteBuffer)null); @@ -1003,7 +1000,7 @@ public void processPartition(RowIterator partition, QueryOptions options, Result while (partition.hasNext()) { Row 
row = partition.next(); - result.newRow( partition.partitionKey(), row.clustering()); + result.newRow(protocolVersion, partition.partitionKey(), row.clustering(), selection.getColumns()); // reads aren't failed as soon the size exceeds the failure threshold, they're failed once the failure // threshold has been exceeded and we start adding more data. We're slightly more permissive to avoid @@ -1023,39 +1020,16 @@ public void processPartition(RowIterator partition, QueryOptions options, Result result.add(row.clustering().bufferAt(def.position())); break; case REGULAR: - addValue(result, def, row, nowInSec, protocolVersion); + result.add(row.getColumnData(def), nowInSec); break; case STATIC: - addValue(result, def, staticRow, nowInSec, protocolVersion); + result.add(staticRow.getColumnData(def), nowInSec); break; } } } } - private static void addValue(ResultSetBuilder result, ColumnMetadata def, Row row, int nowInSec, ProtocolVersion protocolVersion) - { - if (def.isComplex()) - { - assert def.type.isMultiCell(); - ComplexColumnData complexData = row.getComplexColumnData(def); - result.add(complexData, iterator -> { - if (def.type.isCollection()) - { - return ((CollectionType) def.type).serializeForNativeProtocol(iterator, protocolVersion); - } - else - { - return ((UserType) def.type).serializeForNativeProtocol(iterator, protocolVersion); - } - }); - } - else - { - result.add(row.getCell(def), nowInSec); - } - } - private boolean needsPostQueryOrdering() { // We need post-query ordering only for queries with IN on the partition key and an ORDER BY. @@ -1320,6 +1294,7 @@ private AggregationSpecification.Factory getAggregationSpecFactory(TableMetadata int clusteringPrefixSize = 0; Iterator pkColumns = metadata.primaryKeyColumns().iterator(); + List columns = null; Selector.Factory selectorFactory = null; for (Selectable.Raw raw : parameters.groups) { @@ -1331,7 +1306,7 @@ private AggregationSpecification.Factory getAggregationSpecFactory(TableMetadata { WithFunction withFunction = (WithFunction) selectable; validateGroupByFunction(withFunction); - List columns = new ArrayList(); + columns = new ArrayList(); selectorFactory = selectable.newSelectorFactory(metadata, null, columns, boundNames); checkFalse(columns.isEmpty(), "GROUP BY functions must have one clustering column name as parameter"); if (columns.size() > 1) @@ -1379,7 +1354,8 @@ private AggregationSpecification.Factory getAggregationSpecFactory(TableMetadata return selectorFactory == null ? 
AggregationSpecification.aggregatePkPrefixFactory(metadata.comparator, clusteringPrefixSize) : AggregationSpecification.aggregatePkPrefixFactoryWithSelector(metadata.comparator, clusteringPrefixSize, - selectorFactory); + selectorFactory, + columns); } /** diff --git a/src/java/org/apache/cassandra/db/aggregation/AggregationSpecification.java b/src/java/org/apache/cassandra/db/aggregation/AggregationSpecification.java index 0d6c0eef698b..df47e58d43b2 100644 --- a/src/java/org/apache/cassandra/db/aggregation/AggregationSpecification.java +++ b/src/java/org/apache/cassandra/db/aggregation/AggregationSpecification.java @@ -18,6 +18,7 @@ package org.apache.cassandra.db.aggregation; import java.io.IOException; +import java.util.Collections; import java.util.List; import org.apache.cassandra.cql3.QueryOptions; @@ -27,6 +28,7 @@ import org.apache.cassandra.db.TypeSizes; import org.apache.cassandra.io.util.DataInputPlus; import org.apache.cassandra.io.util.DataOutputPlus; +import org.apache.cassandra.schema.ColumnMetadata; import org.apache.cassandra.schema.TableMetadata; /** @@ -61,7 +63,7 @@ public GroupMaker newGroupMaker(GroupingState state) /** * The AggregationSpecification kinds. */ - public static enum Kind + public enum Kind { AGGREGATE_EVERYTHING, AGGREGATE_BY_PK_PREFIX, AGGREGATE_BY_PK_PREFIX_WITH_SELECTOR } @@ -115,7 +117,8 @@ public static AggregationSpecification.Factory aggregatePkPrefixFactory(Clusteri public static AggregationSpecification.Factory aggregatePkPrefixFactoryWithSelector(final ClusteringComparator comparator, final int clusteringPrefixSize, - final Selector.Factory factory) + final Selector.Factory factory, + final List columns) { return new Factory() { @@ -131,8 +134,9 @@ public AggregationSpecification newInstance(QueryOptions options) Selector selector = factory.newInstance(options); selector.validateForGroupBy(); return new AggregateByPkPrefixWithSelector(comparator, - clusteringPrefixSize, - selector); + clusteringPrefixSize, + selector, + columns); } }; } @@ -200,18 +204,25 @@ private static final class AggregateByPkPrefixWithSelector extends AggregateByPk */ private final Selector selector; + /** + * The columns used by the selector. + */ + private final List columns; + public AggregateByPkPrefixWithSelector(ClusteringComparator comparator, int clusteringPrefixSize, - Selector selector) + Selector selector, + List columns) { super(Kind.AGGREGATE_BY_PK_PREFIX_WITH_SELECTOR, comparator, clusteringPrefixSize); this.selector = selector; + this.columns = columns; } @Override public GroupMaker newGroupMaker(GroupingState state) { - return GroupMaker.newSelectorGroupMaker(comparator, clusteringPrefixSize, selector, state); + return GroupMaker.newSelectorGroupMaker(comparator, clusteringPrefixSize, selector, columns, state); } } @@ -231,6 +242,9 @@ public void serialize(AggregationSpecification aggregationSpec, DataOutputPlus o AggregateByPkPrefixWithSelector spec = (AggregateByPkPrefixWithSelector) aggregationSpec; out.writeUnsignedVInt(spec.clusteringPrefixSize); Selector.serializer.serialize(spec.selector, out, version); + // Ideally we should serialize the columns but that will break backward compatibility. + // So for the moment we can rebuild the list from the prefix size as we know that there will be + // only one column and that its index will be clusteringPrefixSize - 1.
break; default: throw new AssertionError("Unknow aggregation kind: " + aggregationSpec.kind()); @@ -249,9 +263,11 @@ public AggregationSpecification deserialize(DataInputPlus in, int version, Table case AGGREGATE_BY_PK_PREFIX_WITH_SELECTOR: int clusteringPrefixSize = (int) in.readUnsignedVInt(); Selector selector = Selector.serializer.deserialize(in, version, metadata); + ColumnMetadata functionArgument = metadata.clusteringColumns().get(clusteringPrefixSize - 1); return new AggregateByPkPrefixWithSelector(metadata.comparator, clusteringPrefixSize, - selector); + selector, + Collections.singletonList(functionArgument)); default: throw new AssertionError("Unknow aggregation kind: " + kind); } @@ -270,9 +286,7 @@ public long serializedSize(AggregationSpecification aggregationSpec, int version case AGGREGATE_BY_PK_PREFIX_WITH_SELECTOR: AggregateByPkPrefixWithSelector spec = (AggregateByPkPrefixWithSelector) aggregationSpec; size += TypeSizes.sizeofUnsignedVInt(spec.clusteringPrefixSize); - size += Selector.serializer.serializedSize(spec.selector, version - - ); + size += Selector.serializer.serializedSize(spec.selector, version); break; default: throw new AssertionError("Unknow aggregation kind: " + aggregationSpec.kind()); diff --git a/src/java/org/apache/cassandra/db/aggregation/GroupMaker.java b/src/java/org/apache/cassandra/db/aggregation/GroupMaker.java index 968219f5cefe..a30bba535468 100644 --- a/src/java/org/apache/cassandra/db/aggregation/GroupMaker.java +++ b/src/java/org/apache/cassandra/db/aggregation/GroupMaker.java @@ -18,11 +18,13 @@ package org.apache.cassandra.db.aggregation; import java.nio.ByteBuffer; +import java.util.List; import org.apache.cassandra.cql3.selection.Selector; import org.apache.cassandra.db.Clustering; import org.apache.cassandra.db.ClusteringComparator; import org.apache.cassandra.db.DecoratedKey; +import org.apache.cassandra.schema.ColumnMetadata; import org.apache.cassandra.transport.ProtocolVersion; /** @@ -61,16 +63,18 @@ public static GroupMaker newPkPrefixGroupMaker(ClusteringComparator comparator, public static GroupMaker newSelectorGroupMaker(ClusteringComparator comparator, int clusteringPrefixSize, Selector selector, + List columns, GroupingState state) { - return new SelectorGroupMaker(comparator, clusteringPrefixSize, selector, state); + return new SelectorGroupMaker(comparator, clusteringPrefixSize, selector, columns, state); } public static GroupMaker newSelectorGroupMaker(ClusteringComparator comparator, int clusteringPrefixSize, - Selector selector) + Selector selector, + List columns) { - return new SelectorGroupMaker(comparator, clusteringPrefixSize, selector); + return new SelectorGroupMaker(comparator, clusteringPrefixSize, selector, columns); } /** @@ -158,25 +162,29 @@ private static class SelectorGroupMaker extends PkPrefixGroupMaker */ private ByteBuffer lastOutput; - private final Selector.InputRow input = new Selector.InputRow(1, false, false); + private final Selector.InputRow input; public SelectorGroupMaker(ClusteringComparator comparator, int clusteringPrefixSize, Selector selector, + List columns, GroupingState state) { super(comparator, clusteringPrefixSize, state); this.selector = selector; + this.input = new Selector.InputRow(ProtocolVersion.CURRENT, columns); this.lastOutput = lastClustering == null ? 
null : executeSelector(lastClustering.bufferAt(clusteringPrefixSize - 1)); } public SelectorGroupMaker(ClusteringComparator comparator, int clusteringPrefixSize, - Selector selector) + Selector selector, + List columns) { super(comparator, clusteringPrefixSize); this.selector = selector; + this.input = new Selector.InputRow(ProtocolVersion.CURRENT, columns); } @Override @@ -217,7 +225,7 @@ private ByteBuffer executeSelector(ByteBuffer argument) input.add(argument); // For computing groups we do not need to use the client protocol version. - selector.addInput(ProtocolVersion.CURRENT, input); + selector.addInput(input); ByteBuffer output = selector.getOutput(ProtocolVersion.CURRENT); selector.reset(); input.reset(false); diff --git a/src/java/org/apache/cassandra/db/rows/BTreeRow.java b/src/java/org/apache/cassandra/db/rows/BTreeRow.java index f07d0760c19c..125932be0f1d 100644 --- a/src/java/org/apache/cassandra/db/rows/BTreeRow.java +++ b/src/java/org/apache/cassandra/db/rows/BTreeRow.java @@ -280,7 +280,12 @@ public Cell getCell(ColumnMetadata c, CellPath path) public ComplexColumnData getComplexColumnData(ColumnMetadata c) { assert c.isComplex(); - return (ComplexColumnData) BTree.find(btree, ColumnMetadata.asymmetricColumnDataComparator, c); + return (ComplexColumnData) getColumnData(c); + } + + public ColumnData getColumnData(ColumnMetadata c) + { + return (ColumnData) BTree.find(btree, ColumnMetadata.asymmetricColumnDataComparator, c); } @Override diff --git a/src/java/org/apache/cassandra/db/rows/Row.java b/src/java/org/apache/cassandra/db/rows/Row.java index 33eb13f90862..9e8276a8d779 100644 --- a/src/java/org/apache/cassandra/db/rows/Row.java +++ b/src/java/org/apache/cassandra/db/rows/Row.java @@ -149,6 +149,14 @@ public interface Row extends Unfiltered, Iterable, IMeasurableMemory */ public ComplexColumnData getComplexColumnData(ColumnMetadata c); + /** + * Returns the {@link ColumnData} for the specified column. + * + * @param c the column for which to fetch the data. + * @return the data for the column or {@code null} if the row has no data for this column. + */ + public ColumnData getColumnData(ColumnMetadata c); + /** * An iterable over the cells of this row. *

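The new AbstractMapSerializer below centralizes the slice and index scans that MapSerializer and SetSerializer previously duplicated. All of these scans walk the same serialized layout: a 4-byte element count followed by length-prefixed elements, with maps interleaving each key with its value. The following standalone sketch illustrates that walk; the class and method names are illustrative only (not Cassandra APIs), and it assumes protocol-v3 framing with fixed 4-byte int elements.

import java.nio.ByteBuffer;

public class SerializedSetScanSketch
{
    // Packs already-sorted int elements using the assumed [count][len][payload]* framing.
    static ByteBuffer pack(int... elements)
    {
        ByteBuffer out = ByteBuffer.allocate(4 + elements.length * 8);
        out.putInt(elements.length);            // 4-byte element count
        for (int e : elements)
        {
            out.putInt(4);                      // 4-byte length prefix
            out.putInt(e);                      // element payload
        }
        out.flip();
        return out;
    }

    // Linear scan with early exit, mirroring the shape of getIndexFromSerialized.
    static int indexOf(ByteBuffer collection, int key)
    {
        ByteBuffer input = collection.duplicate(); // never mutate the caller's buffer
        int n = input.getInt();
        for (int i = 0; i < n; i++)
        {
            input.getInt();                     // skip the length prefix (always 4 here)
            int value = input.getInt();
            if (value == key)
                return i;
            if (value > key)                    // sorted order: the key cannot appear later
                return -1;
        }
        return -1;
    }

    public static void main(String[] args)
    {
        ByteBuffer set = pack(1, 3, 5, 7);
        System.out.println(indexOf(set, 5));    // prints 2
        System.out.println(indexOf(set, 4));    // prints -1, exiting early at element 5
    }
}

Because elements are written in comparator order, the scan can stop as soon as it reads a value greater than the key, which is the early-exit the real getIndexFromSerialized relies on.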
    diff --git a/src/java/org/apache/cassandra/serializers/AbstractMapSerializer.java b/src/java/org/apache/cassandra/serializers/AbstractMapSerializer.java new file mode 100644 index 000000000000..52255a1fdecd --- /dev/null +++ b/src/java/org/apache/cassandra/serializers/AbstractMapSerializer.java @@ -0,0 +1,213 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.serializers; + +import java.nio.BufferUnderflowException; +import java.nio.ByteBuffer; + +import com.google.common.collect.Range; + +import org.apache.cassandra.db.marshal.AbstractType; +import org.apache.cassandra.db.marshal.ByteBufferAccessor; +import org.apache.cassandra.transport.ProtocolVersion; +import org.apache.cassandra.utils.ByteBufferUtil; + +/** + * Common superclass for {@link SetSerializer} and {@link MapSerializer}, considering a set as a map without values. + */ +abstract class AbstractMapSerializer extends CollectionSerializer +{ + private final boolean hasValues; + private final String name; + + protected AbstractMapSerializer(boolean hasValues) + { + this.hasValues = hasValues; + name = hasValues ? "map" : "set"; + } + + @Override + public ByteBuffer getSliceFromSerialized(ByteBuffer collection, + ByteBuffer from, + ByteBuffer to, + AbstractType comparator, + boolean frozen) + { + if (from == ByteBufferUtil.UNSET_BYTE_BUFFER && to == ByteBufferUtil.UNSET_BYTE_BUFFER) + return collection; + + try + { + ByteBuffer input = collection.duplicate(); + int n = readCollectionSize(input, ByteBufferAccessor.instance, ProtocolVersion.V3); + input.position(input.position() + sizeOfCollectionSize(n, ProtocolVersion.V3)); + int startPos = input.position(); + int count = 0; + boolean inSlice = from == ByteBufferUtil.UNSET_BYTE_BUFFER; + + for (int i = 0; i < n; i++) + { + int pos = input.position(); + ByteBuffer key = readValue(input, ByteBufferAccessor.instance, 0, ProtocolVersion.V3); // key + input.position(input.position() + sizeOfValue(key, ByteBufferAccessor.instance, ProtocolVersion.V3)); + + // If we haven't passed the start already, check if we have now + if (!inSlice) + { + int comparison = comparator.compareForCQL(from, key); + if (comparison <= 0) + { + // We're now within the slice + inSlice = true; + startPos = pos; + } + else + { + // We're before the slice, so we know we don't care about this element + skipMapValue(input); + continue; + } + } + + // Now check if we're done + int comparison = to == ByteBufferUtil.UNSET_BYTE_BUFFER ? 
-1 : comparator.compareForCQL(key, to); + if (comparison > 0) + { + // We're done and shouldn't include the key we just read + input.position(pos); + break; + } + + // Otherwise, we'll include that element + skipMapValue(input); // value + ++count; + + // But if we know it was the last of the slice, we break early + if (comparison == 0) + break; + } + + if (count == 0 && !frozen) + return null; + + return copyAsNewCollection(collection, count, startPos, input.position(), ProtocolVersion.V3); + } + catch (BufferUnderflowException | IndexOutOfBoundsException e) + { + throw new MarshalException("Not enough bytes to read a " + name); + } + } + + @Override + public int getIndexFromSerialized(ByteBuffer collection, ByteBuffer key, AbstractType comparator) + { + try + { + ByteBuffer input = collection.duplicate(); + int n = readCollectionSize(input, ByteBufferAccessor.instance, ProtocolVersion.V3); + int offset = sizeOfCollectionSize(n, ProtocolVersion.V3); + for (int i = 0; i < n; i++) + { + ByteBuffer kbb = readValue(input, ByteBufferAccessor.instance, offset, ProtocolVersion.V3); + offset += sizeOfValue(kbb, ByteBufferAccessor.instance, ProtocolVersion.V3); + int comparison = comparator.compareForCQL(kbb, key); + + if (comparison == 0) + return i; + + if (comparison > 0) + // since the set is in sorted order, we know we've gone too far and the element doesn't exist + return -1; + + // comparison < 0 + if (hasValues) + offset += skipValue(input, ByteBufferAccessor.instance, offset, ProtocolVersion.V3); + } + return -1; + } + catch (BufferUnderflowException e) + { + throw new MarshalException("Not enough bytes to read a " + name); + } + } + + @Override + public Range getIndexesRangeFromSerialized(ByteBuffer collection, + ByteBuffer from, + ByteBuffer to, + AbstractType comparator) + { + if (from == ByteBufferUtil.UNSET_BYTE_BUFFER && to == ByteBufferUtil.UNSET_BYTE_BUFFER) + return Range.closed(0, Integer.MAX_VALUE); + + try + { + ByteBuffer input = collection.duplicate(); + int n = readCollectionSize(input, ByteBufferAccessor.instance, ProtocolVersion.V3); + input.position(input.position() + sizeOfCollectionSize(n, ProtocolVersion.V3)); + int start = from == ByteBufferUtil.UNSET_BYTE_BUFFER ? 0 : -1; + int end = to == ByteBufferUtil.UNSET_BYTE_BUFFER ? 
n : -1; + + for (int i = 0; i < n; i++) + { + if (start >= 0 && end >= 0) + break; + else if (i > 0) + skipMapValue(input); + + ByteBuffer key = readValue(input, ByteBufferAccessor.instance, 0, ProtocolVersion.V3); + input.position(input.position() + sizeOfValue(key, ByteBufferAccessor.instance, ProtocolVersion.V3)); + + if (start < 0) + { + int comparison = comparator.compareForCQL(from, key); + if (comparison <= 0) + start = i; + else + continue; + } + + if (end < 0) + { + int comparison = comparator.compareForCQL(key, to); + if (comparison > 0) + end = i; + } + } + + if (start < 0) + return Range.closedOpen(0, 0); + + if (end < 0) + return Range.closedOpen(start, n); + + return Range.closedOpen(start, end); + } + catch (BufferUnderflowException e) + { + throw new MarshalException("Not enough bytes to read a " + name); + } + } + + private void skipMapValue(ByteBuffer input) + { + if (hasValues) + skipValue(input, ProtocolVersion.V3); + } +} diff --git a/src/java/org/apache/cassandra/serializers/CollectionSerializer.java b/src/java/org/apache/cassandra/serializers/CollectionSerializer.java index 204261d46fd7..36e346c3e752 100644 --- a/src/java/org/apache/cassandra/serializers/CollectionSerializer.java +++ b/src/java/org/apache/cassandra/serializers/CollectionSerializer.java @@ -22,6 +22,8 @@ import java.util.Collection; import java.util.List; +import com.google.common.collect.Range; + import org.apache.cassandra.db.TypeSizes; import org.apache.cassandra.db.marshal.ByteBufferAccessor; import org.apache.cassandra.db.marshal.ValueAccessor; @@ -174,6 +176,37 @@ public abstract ByteBuffer getSliceFromSerialized(ByteBuffer collection, AbstractType comparator, boolean frozen); + /** + * Returns the index of an element in a serialized collection. + *

    + * Note that this is only supported by sets and maps, but not by lists. + * + * @param collection The serialized collection. This cannot be {@code null}. + * @param key The key for which the index must be found. This cannot be {@code null} nor + * {@link ByteBufferUtil#UNSET_BYTE_BUFFER}. + * @param comparator The type to use to compare the {@code key} value to those in the collection. + * @return The index of the element associated with {@code key} if one exists, {@code -1} otherwise. + */ + public abstract int getIndexFromSerialized(ByteBuffer collection, ByteBuffer key, AbstractType comparator); + + /** + * Returns the range of indexes corresponding to the specified range of elements in the serialized collection. + *

    + * Note that this is only supported by sets and maps, but not by lists. + * + * @param collection The serialized collection. This cannot be {@code null}. + * @param from The left bound of the slice to extract. This cannot be {@code null} but if this is + * {@link ByteBufferUtil#UNSET_BYTE_BUFFER}, then the returned slice starts at the beginning of the collection. + * @param to The right bound of the slice to extract. This cannot be {@code null} but if this is + * {@link ByteBufferUtil#UNSET_BYTE_BUFFER}, then the returned slice ends at the end of the collection. + * @param comparator The type to use to compare the {@code from} and {@code to} values to those in the collection. + * @return The range of indexes corresponding to the specified range of elements. + */ + public abstract Range getIndexesRangeFromSerialized(ByteBuffer collection, + ByteBuffer from, + ByteBuffer to, + AbstractType comparator); + /** * Creates a new serialized map composed from the data from {@code input} between {@code startPos} * (inclusive) and {@code endPos} (exclusive), assuming that data holds {@code count} elements. diff --git a/src/java/org/apache/cassandra/serializers/ListSerializer.java b/src/java/org/apache/cassandra/serializers/ListSerializer.java index 429be986b78f..ef5a037abcd2 100644 --- a/src/java/org/apache/cassandra/serializers/ListSerializer.java +++ b/src/java/org/apache/cassandra/serializers/ListSerializer.java @@ -20,12 +20,13 @@ import java.nio.BufferUnderflowException; import java.nio.ByteBuffer; -import java.nio.charset.CharacterCodingException; import java.util.*; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ConcurrentMap; import java.util.function.Predicate; +import com.google.common.collect.Range; + import org.apache.cassandra.db.TypeSizes; import org.apache.cassandra.db.marshal.AbstractType; import org.apache.cassandra.db.marshal.ByteBufferAccessor; @@ -35,15 +36,17 @@ public class ListSerializer extends CollectionSerializer> { // interning instances - private static final ConcurrentMap, ListSerializer> instances = new ConcurrentHashMap, ListSerializer>(); + @SuppressWarnings("rawtypes") + private static final ConcurrentMap, ListSerializer> instances = new ConcurrentHashMap<>(); public final TypeSerializer elements; + @SuppressWarnings("unchecked") public static ListSerializer getInstance(TypeSerializer elements) { ListSerializer t = instances.get(elements); if (t == null) - t = instances.computeIfAbsent(elements, k -> new ListSerializer<>(k) ); + t = instances.computeIfAbsent(elements, ListSerializer::new); return t; } @@ -101,7 +104,7 @@ public List deserializeForNativeProtocol(V input, ValueAccessor access // In such a case we do not want to initialize the list with that size as it can result // in an OOM (see CASSANDRA-12618). On the other hand we do not want to have to resize the list // if we can avoid it, so we put a reasonable limit on the initialCapacity. - List l = new ArrayList(Math.min(n, 256)); + List l = new ArrayList<>(Math.min(n, 256)); for (int i = 0; i < n; i++) { // We can have nulls in lists that are used for IN values @@ -214,6 +217,7 @@ public String toString(List value) return sb.toString(); } + @SuppressWarnings({ "rawtypes", "unchecked" }) public Class> getType() { return (Class) List.class; @@ -222,7 +226,7 @@ public Class> getType() @Override public ByteBuffer getSerializedValue(ByteBuffer collection, ByteBuffer key, AbstractType comparator) { - // We don't allow selecting an element of a list so we don't need this.
+ // We don't allow selecting an element of a list, so we don't need this. throw new UnsupportedOperationException(); } @@ -233,7 +237,22 @@ public ByteBuffer getSliceFromSerialized(ByteBuffer collection, AbstractType comparator, boolean frozen) { - // We don't allow slicing of list so we don't need this. + // We don't allow slicing of lists, so we don't need this. + throw new UnsupportedOperationException(); + } + + @Override + public int getIndexFromSerialized(ByteBuffer collection, ByteBuffer key, AbstractType comparator) + { + throw new UnsupportedOperationException(); + } + + @Override + public Range getIndexesRangeFromSerialized(ByteBuffer collection, + ByteBuffer from, + ByteBuffer to, + AbstractType comparator) + { throw new UnsupportedOperationException(); } } diff --git a/src/java/org/apache/cassandra/serializers/MapSerializer.java b/src/java/org/apache/cassandra/serializers/MapSerializer.java index 400a8e7cc5fb..b3e4cba99493 100644 --- a/src/java/org/apache/cassandra/serializers/MapSerializer.java +++ b/src/java/org/apache/cassandra/serializers/MapSerializer.java @@ -26,21 +26,22 @@ import org.apache.cassandra.db.marshal.AbstractType; import org.apache.cassandra.db.marshal.ByteBufferAccessor; -import org.apache.cassandra.db.marshal.ValueComparators; import org.apache.cassandra.db.marshal.ValueAccessor; +import org.apache.cassandra.db.marshal.ValueComparators; import org.apache.cassandra.transport.ProtocolVersion; -import org.apache.cassandra.utils.ByteBufferUtil; import org.apache.cassandra.utils.Pair; -public class MapSerializer extends CollectionSerializer> +public class MapSerializer extends AbstractMapSerializer> { // interning instances + @SuppressWarnings("rawtypes") private static final ConcurrentMap, TypeSerializer>, MapSerializer> instances = new ConcurrentHashMap<>(); public final TypeSerializer keys; public final TypeSerializer values; private final ValueComparators comparators; + @SuppressWarnings("unchecked") public static MapSerializer getInstance(TypeSerializer keys, TypeSerializer values, ValueComparators comparators) { Pair, TypeSerializer> p = Pair.create(keys, values); @@ -52,6 +53,7 @@ public static MapSerializer getInstance(TypeSerializer keys, Typ private MapSerializer(TypeSerializer keys, TypeSerializer values, ValueComparators comparators) { + super(true); this.keys = keys; this.values = values; this.comparators = comparators; @@ -62,7 +64,7 @@ public List serializeValues(Map map) List> pairs = new ArrayList<>(map.size()); for (Map.Entry entry : map.entrySet()) pairs.add(Pair.create(keys.serialize(entry.getKey()), values.serialize(entry.getValue()))); - Collections.sort(pairs, (l, r) -> comparators.buffer.compare(l.left, r.left)); + pairs.sort((l, r) -> comparators.buffer.compare(l.left, r.left)); List buffers = new ArrayList<>(pairs.size() * 2); for (Pair p : pairs) { @@ -83,7 +85,7 @@ public void validateForNativeProtocol(T input, ValueAccessor accessor, Pr { // Empty values are still valid. if (accessor.isEmpty(input)) return; - + int n = readCollectionSize(input, accessor, version); int offset = sizeOfCollectionSize(n, version); for (int i = 0; i < n; i++) @@ -119,7 +121,7 @@ public Map deserializeForNativeProtocol(I input, ValueAccessor acce // In such a case we do not want to initialize the map with that initialCapacity as it can result // in an OOM when put is called (see CASSANDRA-12618). On the other hand we do not want to have to resize // the map if we can avoid it, so we put a reasonable limit on the initialCapacity. 
- Map m = new LinkedHashMap(Math.min(n, 256)); + Map m = new LinkedHashMap<>(Math.min(n, 256)); for (int i = 0; i < n; i++) { I key = readValue(input, accessor, offset, version); @@ -171,78 +173,6 @@ else if (comparison > 0) } } - @Override - public ByteBuffer getSliceFromSerialized(ByteBuffer collection, - ByteBuffer from, - ByteBuffer to, - AbstractType comparator, - boolean frozen) - { - if (from == ByteBufferUtil.UNSET_BYTE_BUFFER && to == ByteBufferUtil.UNSET_BYTE_BUFFER) - return collection; - - try - { - ByteBuffer input = collection.duplicate(); - int n = readCollectionSize(input, ByteBufferAccessor.instance, ProtocolVersion.V3); - input.position(input.position() + sizeOfCollectionSize(n, ProtocolVersion.V3)); - int startPos = input.position(); - int count = 0; - boolean inSlice = from == ByteBufferUtil.UNSET_BYTE_BUFFER; - - for (int i = 0; i < n; i++) - { - int pos = input.position(); - ByteBuffer kbb = readValue(input, ByteBufferAccessor.instance, 0, ProtocolVersion.V3); // key - input.position(input.position() + sizeOfValue(kbb, ByteBufferAccessor.instance, ProtocolVersion.V3)); - - // If we haven't passed the start already, check if we have now - if (!inSlice) - { - int comparison = comparator.compareForCQL(from, kbb); - if (comparison <= 0) - { - // We're now within the slice - inSlice = true; - startPos = pos; - } - else - { - // We're before the slice so we know we don't care about this element - skipValue(input, ProtocolVersion.V3); // value - continue; - } - } - - // Now check if we're done - int comparison = to == ByteBufferUtil.UNSET_BYTE_BUFFER ? -1 : comparator.compareForCQL(kbb, to); - if (comparison > 0) - { - // We're done and shouldn't include the key we just read - input.position(pos); - break; - } - - // Otherwise, we'll include that element - skipValue(input, ProtocolVersion.V3); // value - ++count; - - // But if we know if was the last of the slice, we break early - if (comparison == 0) - break; - } - - if (count == 0 && !frozen) - return null; - - return copyAsNewCollection(collection, count, startPos, input.position(), ProtocolVersion.V3); - } - catch (BufferUnderflowException | IndexOutOfBoundsException e) - { - throw new MarshalException("Not enough bytes to read a map"); - } - } - public String toString(Map value) { StringBuilder sb = new StringBuilder(); @@ -262,8 +192,9 @@ public String toString(Map value) return sb.toString(); } + @SuppressWarnings({ "rawtypes", "unchecked" }) public Class> getType() { - return (Class)Map.class; + return (Class) Map.class; } } diff --git a/src/java/org/apache/cassandra/serializers/SetSerializer.java b/src/java/org/apache/cassandra/serializers/SetSerializer.java index 54b849609682..bbf7911aaf89 100644 --- a/src/java/org/apache/cassandra/serializers/SetSerializer.java +++ b/src/java/org/apache/cassandra/serializers/SetSerializer.java @@ -24,22 +24,22 @@ import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ConcurrentMap; +import org.apache.cassandra.db.marshal.AbstractType; import org.apache.cassandra.db.marshal.ByteBufferAccessor; -import org.apache.cassandra.db.marshal.ValueComparators; import org.apache.cassandra.db.marshal.ValueAccessor; +import org.apache.cassandra.db.marshal.ValueComparators; import org.apache.cassandra.transport.ProtocolVersion; -import org.apache.cassandra.db.marshal.AbstractType; -import org.apache.cassandra.utils.ByteBufferUtil; - -public class SetSerializer extends CollectionSerializer> +public class SetSerializer extends AbstractMapSerializer> { // interning instances - 
private static final ConcurrentMap, SetSerializer> instances = new ConcurrentHashMap, SetSerializer>(); + @SuppressWarnings("rawtypes") + private static final ConcurrentMap, SetSerializer> instances = new ConcurrentHashMap<>(); public final TypeSerializer elements; private final ValueComparators comparators; + @SuppressWarnings("unchecked") public static SetSerializer getInstance(TypeSerializer elements, ValueComparators comparators) { SetSerializer t = instances.get(elements); @@ -50,6 +50,7 @@ public static SetSerializer getInstance(TypeSerializer elements, Value public SetSerializer(TypeSerializer elements, ValueComparators comparators) { + super(false); this.elements = elements; this.comparators = comparators; } @@ -59,7 +60,7 @@ public List serializeValues(Set values) List buffers = new ArrayList<>(values.size()); for (T value : values) buffers.add(elements.serialize(value)); - Collections.sort(buffers, comparators.buffer); + buffers.sort(comparators.buffer); return buffers; } @@ -106,7 +107,7 @@ public Set deserializeForNativeProtocol(V input, ValueAccessor accesso // In such a case we do not want to initialize the set with that initialCapacity as it can result // in an OOM when add is called (see CASSANDRA-12618). On the other hand we do not want to have to resize // the set if we can avoid it, so we put a reasonable limit on the initialCapacity. - Set l = new LinkedHashSet(Math.min(n, 256)); + Set l = new LinkedHashSet<>(Math.min(n, 256)); for (int i = 0; i < n; i++) { @@ -146,6 +147,7 @@ public String toString(Set value) return sb.toString(); } + @SuppressWarnings({ "rawtypes", "unchecked" }) public Class> getType() { return (Class) Set.class; @@ -178,74 +180,4 @@ else if (comparison > 0) throw new MarshalException("Not enough bytes to read a set"); } } - - @Override - public ByteBuffer getSliceFromSerialized(ByteBuffer collection, - ByteBuffer from, - ByteBuffer to, - AbstractType comparator, - boolean frozen) - { - if (from == ByteBufferUtil.UNSET_BYTE_BUFFER && to == ByteBufferUtil.UNSET_BYTE_BUFFER) - return collection; - - try - { - ByteBuffer input = collection.duplicate(); - int n = readCollectionSize(input, ByteBufferAccessor.instance, ProtocolVersion.V3); - input.position(input.position() + sizeOfCollectionSize(n, ProtocolVersion.V3)); - int startPos = input.position(); - int count = 0; - boolean inSlice = from == ByteBufferUtil.UNSET_BYTE_BUFFER; - - for (int i = 0; i < n; i++) - { - int pos = input.position(); - ByteBuffer value = readValue(input, ByteBufferAccessor.instance, 0, ProtocolVersion.V3); - input.position(input.position() + sizeOfValue(value, ByteBufferAccessor.instance, ProtocolVersion.V3)); - - // If we haven't passed the start already, check if we have now - if (!inSlice) - { - int comparison = comparator.compareForCQL(from, value); - if (comparison <= 0) - { - // We're now within the slice - inSlice = true; - startPos = pos; - } - else - { - // We're before the slice so we know we don't care about this value - continue; - } - } - - // Now check if we're done - int comparison = to == ByteBufferUtil.UNSET_BYTE_BUFFER ? 
-1 : comparator.compareForCQL(value, to); - if (comparison > 0) - { - // We're done and shouldn't include the value we just read - input.position(pos); - break; - } - - // Otherwise, we'll include that value - ++count; - - // But if we know if was the last of the slice, we break early - if (comparison == 0) - break; - } - - if (count == 0 && !frozen) - return null; - - return copyAsNewCollection(collection, count, startPos, input.position(), ProtocolVersion.V3); - } - catch (BufferUnderflowException | IndexOutOfBoundsException e) - { - throw new MarshalException("Not enough bytes to read a set"); - } - } } diff --git a/test/distributed/org/apache/cassandra/distributed/upgrade/MixedModeWritetimeOrTTLTest.java b/test/distributed/org/apache/cassandra/distributed/upgrade/MixedModeWritetimeOrTTLTest.java new file mode 100644 index 000000000000..295ffd29877c --- /dev/null +++ b/test/distributed/org/apache/cassandra/distributed/upgrade/MixedModeWritetimeOrTTLTest.java @@ -0,0 +1,119 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.distributed.upgrade; + +import java.util.Arrays; +import java.util.List; + +import org.junit.Test; + +import org.apache.cassandra.distributed.api.ICoordinator; +import org.assertj.core.api.Assertions; + +import static org.apache.cassandra.distributed.api.ConsistencyLevel.ALL; +import static org.apache.cassandra.distributed.shared.AssertUtils.assertRows; +import static org.apache.cassandra.distributed.shared.AssertUtils.row; + +/** + * Tests the CQL functions {@code writetime}, {@code maxwritetime} and {@code ttl} on rolling upgrade. + * + * {@code writetime} and {@code ttl} on single-cell columns is always supported, even in mixed clusters. + * {@code writetime} and {@code ttl} on multi-cell columns is not supported in coordinator nodes < 4.2. + * {@code maxwritetime} is not supported in coordinator nodes < 4.2. 
+ */ +public class MixedModeWritetimeOrTTLTest extends UpgradeTestBase +{ + @Test + public void testWritetimeOrTTLDuringUpgrade() throws Throwable + { + new TestCase() + .nodes(2) + .nodesToUpgradeOrdered(1, 2) + .upgradesToCurrentFrom(v30) + .setup(cluster -> { + + ICoordinator coordinator = cluster.coordinator(1); + cluster.schemaChange(withKeyspace("CREATE TABLE %s.t (k int PRIMARY KEY, v int, s set, fs frozen>)")); + coordinator.execute(withKeyspace("INSERT INTO %s.t (k, v, s, fs) VALUES (0, 0, {0, 1}, {0, 1, 2, 3}) USING TIMESTAMP 1 AND TTL 1000"), ALL); + coordinator.execute(withKeyspace("UPDATE %s.t USING TIMESTAMP 2 AND TTL 2000 SET v = 1, s = s + {2, 3} WHERE k = 0"), ALL); + + assertPre42Behaviour(cluster.coordinator(1)); + assertPre42Behaviour(cluster.coordinator(2)); + }) + .runAfterNodeUpgrade((cluster, node) -> { + if (node == 1) // only node1 is upgraded, and the cluster is in mixed mode + { + assertPost42Behaviour(cluster.coordinator(1)); + assertPre42Behaviour(cluster.coordinator(2)); + } + else // both nodes have been upgraded, and the cluster isn't in mixed mode anymore + { + assertPost42Behaviour(cluster.coordinator(1)); + assertPost42Behaviour(cluster.coordinator(2)); + } + }) + .run(); + } + + private void assertPre42Behaviour(ICoordinator coordinator) + { + // regular column, supported except for maxwritetime + assertRows(coordinator.execute(withKeyspace("SELECT writetime(v) FROM %s.t"), ALL), row(2L)); + Assertions.assertThatThrownBy(() -> coordinator.execute(withKeyspace("SELECT maxwritetime(v) FROM %s.t"), ALL)) + .hasMessageContaining("Unknown function 'maxwritetime'"); + Assertions.assertThat((Integer) coordinator.execute(withKeyspace("SELECT ttl(v) FROM %s.t"), ALL)[0][0]) + .isLessThanOrEqualTo(2000).isGreaterThan(2000 - 300); // margin of error of 5 minutes since TTLs decrease + + // frozen collection, supported except for maxwritetime + assertRows(coordinator.execute(withKeyspace("SELECT writetime(fs) FROM %s.t"), ALL), row(1L)); + Assertions.assertThatThrownBy(() -> coordinator.execute(withKeyspace("SELECT maxwritetime(fs) FROM %s.t"), ALL)) + .hasMessageContaining("Unknown function 'maxwritetime'"); + Assertions.assertThat((Integer) coordinator.execute(withKeyspace("SELECT ttl(fs) FROM %s.t"), ALL)[0][0]) + .isLessThanOrEqualTo(1000).isGreaterThan(1000 - 300); // margin of error of 5 minutes since TTLs decrease + + // not-frozen collection, not supported + Assertions.assertThatThrownBy(() -> coordinator.execute(withKeyspace("SELECT writetime(s) FROM %s.t"), ALL)) + .hasMessageContaining("Cannot use selection function writeTime on non-frozen collection s"); + Assertions.assertThatThrownBy(() -> coordinator.execute(withKeyspace("SELECT maxwritetime(s) FROM %s.t"), ALL)) + .hasMessageContaining("Unknown function 'maxwritetime'"); + Assertions.assertThatThrownBy(() -> coordinator.execute(withKeyspace("SELECT ttl(s) FROM %s.t"), ALL)) + .hasMessageContaining("Cannot use selection function ttl on non-frozen collection s"); + } + + private void assertPost42Behaviour(ICoordinator coordinator) + { + // regular column, fully supported + assertRows(coordinator.execute(withKeyspace("SELECT writetime(v) FROM %s.t"), ALL), row(2L)); + assertRows(coordinator.execute(withKeyspace("SELECT maxwritetime(v) FROM %s.t"), ALL), row(2L)); + Assertions.assertThat((Integer) coordinator.execute(withKeyspace("SELECT ttl(v) FROM %s.t"), ALL)[0][0]) + .isLessThanOrEqualTo(2000).isGreaterThan(2000 - 300); // margin of error of 5 minutes since TTLs decrease + + // frozen collection, 
fully supported + assertRows(coordinator.execute(withKeyspace("SELECT writetime(fs) FROM %s.t"), ALL), row(1L)); + assertRows(coordinator.execute(withKeyspace("SELECT maxwritetime(fs) FROM %s.t"), ALL), row(1L)); + Assertions.assertThat((Integer) coordinator.execute(withKeyspace("SELECT ttl(fs) FROM %s.t"), ALL)[0][0]) + .isLessThanOrEqualTo(1000).isGreaterThan(1000 - 300); // margin of error of 5 minutes since TTLs decrease + + // not-frozen collection, fully supported + assertRows(coordinator.execute(withKeyspace("SELECT writetime(s) FROM %s.t"), ALL), row(Arrays.asList(1L, 1L, 2L, 2L))); + assertRows(coordinator.execute(withKeyspace("SELECT maxwritetime(s) FROM %s.t"), ALL), row(2L)); + Assertions.assertThat(coordinator.execute(withKeyspace("SELECT ttl(s) FROM %s.t"), ALL)[0][0]) + .matches(l -> l instanceof List && ((List) l).size() == 4); + } +} diff --git a/test/unit/org/apache/cassandra/cql3/selection/SelectorSerializationTest.java b/test/unit/org/apache/cassandra/cql3/selection/SelectorSerializationTest.java index 4eadb95d6d26..530cb6761403 100644 --- a/test/unit/org/apache/cassandra/cql3/selection/SelectorSerializationTest.java +++ b/test/unit/org/apache/cassandra/cql3/selection/SelectorSerializationTest.java @@ -36,6 +36,7 @@ import org.apache.cassandra.io.util.DataInputBuffer; import org.apache.cassandra.io.util.DataOutputBuffer; import org.apache.cassandra.net.MessagingService; +import org.apache.cassandra.schema.ColumnMetadata; import org.apache.cassandra.schema.KeyspaceMetadata; import org.apache.cassandra.schema.Schema; import org.apache.cassandra.schema.TableMetadata; @@ -60,9 +61,10 @@ public void testSerDes() throws IOException checkSerialization(table.getColumn(new ColumnIdentifier("c1", false)), table); // Test WritetimeOrTTLSelector serialization - checkSerialization(new Selectable.WritetimeOrTTL(table.getColumn(new ColumnIdentifier("v", false)), Selectable.WritetimeOrTTL.Kind.WRITE_TIME), table); - checkSerialization(new Selectable.WritetimeOrTTL(table.getColumn(new ColumnIdentifier("v", false)), Selectable.WritetimeOrTTL.Kind.TTL), table); - checkSerialization(new Selectable.WritetimeOrTTL(table.getColumn(new ColumnIdentifier("v", false)), Selectable.WritetimeOrTTL.Kind.MAX_WRITE_TIME), table); + ColumnMetadata column = table.getColumn(new ColumnIdentifier("v", false)); + checkSerialization(new Selectable.WritetimeOrTTL(column, column, Selectable.WritetimeOrTTL.Kind.WRITE_TIME), table); + checkSerialization(new Selectable.WritetimeOrTTL(column, column, Selectable.WritetimeOrTTL.Kind.TTL), table); + checkSerialization(new Selectable.WritetimeOrTTL(column, column, Selectable.WritetimeOrTTL.Kind.MAX_WRITE_TIME), table); // Test ListSelector serialization checkSerialization(new Selectable.WithList(asList(table.getColumn(new ColumnIdentifier("v", false)), diff --git a/test/unit/org/apache/cassandra/cql3/validation/entities/CollectionsTest.java b/test/unit/org/apache/cassandra/cql3/validation/entities/CollectionsTest.java index 822ff55ac49b..0b6238fe1c3a 100644 --- a/test/unit/org/apache/cassandra/cql3/validation/entities/CollectionsTest.java +++ b/test/unit/org/apache/cassandra/cql3/validation/entities/CollectionsTest.java @@ -577,18 +577,6 @@ public void testAlterCollections() throws Throwable execute("ALTER TABLE %s ADD alist list"); } - /** - * Migrated from cql_tests.py:TestCQL.collection_function_test() - */ - @Test - public void testFunctionsOnCollections() throws Throwable - { - createTable("CREATE TABLE %s (k int PRIMARY KEY, l set)"); - - assertInvalid("SELECT 
-        assertInvalid("SELECT writetime(l) FROM %s WHERE k = 0");
-    }
-
     @Test
     public void testInRestrictionWithCollection() throws Throwable
     {
diff --git a/test/unit/org/apache/cassandra/cql3/validation/entities/WritetimeOrTTLTest.java b/test/unit/org/apache/cassandra/cql3/validation/entities/WritetimeOrTTLTest.java
index 16bab23986bf..4f956b1f8ddf 100644
--- a/test/unit/org/apache/cassandra/cql3/validation/entities/WritetimeOrTTLTest.java
+++ b/test/unit/org/apache/cassandra/cql3/validation/entities/WritetimeOrTTLTest.java
@@ -18,17 +18,29 @@
 package org.apache.cassandra.cql3.validation.entities;
 
+import java.util.Arrays;
+import java.util.List;
+import java.util.Objects;
+
 import org.junit.Test;
 
 import org.apache.cassandra.cql3.CQLTester;
 import org.apache.cassandra.cql3.UntypedResultSet;
+import org.apache.cassandra.db.marshal.Int32Type;
+import org.apache.cassandra.db.marshal.ListType;
+import org.apache.cassandra.db.marshal.LongType;
 import org.apache.cassandra.exceptions.InvalidRequestException;
 
+import static java.lang.String.format;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertFalse;
 import static org.junit.Assert.assertNotNull;
 import static org.junit.Assert.assertNull;
 import static org.junit.Assert.assertTrue;
-import static java.lang.String.format;
 
+/**
+ * Tests for CQL's {@code WRITETIME}, {@code MAXWRITETIME} and {@code TTL} selection functions.
+ */
 public class WritetimeOrTTLTest extends CQLTester
 {
     private static final long TIMESTAMP_1 = 1;
@@ -70,7 +82,42 @@ public void testSimple() throws Throwable
     public void testList() throws Throwable
     {
         createTable("CREATE TABLE %s (k int PRIMARY KEY, l list<int>)");
-        assertInvalidMultiCellSelection("l", true);
+
+        // Null column
+        execute("INSERT INTO %s (k) VALUES (1) USING TIMESTAMP ? AND TTL ?", TIMESTAMP_1, TTL_1);
+        assertWritetimeAndTTL("l", NO_TIMESTAMP, NO_TTL);
+
+        // Create empty
+        execute("INSERT INTO %s (k, l) VALUES (1, []) USING TIMESTAMP ? AND TTL ?", TIMESTAMP_1, TTL_1);
+        assertWritetimeAndTTL("l", NO_TIMESTAMP, NO_TTL);
+
+        // Create with a single element without TTL
+        execute("INSERT INTO %s (k, l) VALUES (1, [1]) USING TIMESTAMP ?", TIMESTAMP_1);
+        assertWritetimeAndTTL("l", timestamps(TIMESTAMP_1), ttls(NO_TTL));
+
+        // Add a new element to the list with a new timestamp and a TTL
+        execute("UPDATE %s USING TIMESTAMP ? AND TTL ? SET l=l+[2] WHERE k=1", TIMESTAMP_2, TTL_2);
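+        // the two list cells now carry different timestamps and TTLs, so the per-element
+        // selectors below are expected to report one value per cell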
+        assertWritetimeAndTTL("l", timestamps(TIMESTAMP_1, TIMESTAMP_2), ttls(NO_TTL, TTL_2));
+
+        assertInvalidListElementSelection("l[0]", "l");
+        assertInvalidListSliceSelection("l[..0]", "l");
+        assertInvalidListSliceSelection("l[0..]", "l");
+        assertInvalidListSliceSelection("l[1..1]", "l");
+        assertInvalidListSliceSelection("l[1..2]", "l");
+
+        // Read multiple rows to verify selector reset
+        execute("TRUNCATE TABLE %s");
+        execute("INSERT INTO %s (k, l) VALUES (1, [1, 2, 3]) USING TIMESTAMP ?", TIMESTAMP_1);
+        execute("INSERT INTO %s (k, l) VALUES (2, [1, 2]) USING TIMESTAMP ?", TIMESTAMP_2);
+        execute("INSERT INTO %s (k, l) VALUES (3, [1]) USING TIMESTAMP ?", TIMESTAMP_1);
+        execute("INSERT INTO %s (k, l) VALUES (4, []) USING TIMESTAMP ?", TIMESTAMP_2);
+        execute("INSERT INTO %s (k, l) VALUES (5, null) USING TIMESTAMP ?", TIMESTAMP_2);
+        assertRows("SELECT k, WRITETIME(l) FROM %s",
+                   row(5, NO_TIMESTAMP),
+                   row(1, timestamps(TIMESTAMP_1, TIMESTAMP_1, TIMESTAMP_1)),
+                   row(2, timestamps(TIMESTAMP_2, TIMESTAMP_2)),
+                   row(4, NO_TIMESTAMP),
+                   row(3, timestamps(TIMESTAMP_1)));
     }
 
     @Test
@@ -86,6 +133,9 @@ public void testFrozenList() throws Throwable
         execute("INSERT INTO %s (k, v) VALUES (1, []) USING TIMESTAMP ? AND TTL ?", TIMESTAMP_1, TTL_1);
         assertWritetimeAndTTL("v", TIMESTAMP_1, TTL_1);
 
+        // truncate, since previous columns would win on reconciliation because of their TTL (CASSANDRA-14592)
+        execute("TRUNCATE TABLE %s");
+
         // Update with a single element without TTL
         execute("INSERT INTO %s (k, v) VALUES (1, [1]) USING TIMESTAMP ?", TIMESTAMP_1);
         assertWritetimeAndTTL("v", TIMESTAMP_1, NO_TTL);
@@ -93,13 +143,119 @@ public void testFrozenList() throws Throwable
         // Add a new element to the list with a new timestamp and a TTL
         execute("INSERT INTO %s (k, v) VALUES (1, [1, 2, 3]) USING TIMESTAMP ? AND TTL ?", TIMESTAMP_2, TTL_2);
         assertWritetimeAndTTL("v", TIMESTAMP_2, TTL_2);
+
+        assertInvalidListElementSelection("v[1]", "v");
+        assertInvalidListSliceSelection("v[..0]", "v");
+        assertInvalidListSliceSelection("v[0..]", "v");
+        assertInvalidListSliceSelection("v[1..1]", "v");
+        assertInvalidListSliceSelection("v[1..2]", "v");
+
+        // Read multiple rows to verify selector reset
+        execute("TRUNCATE TABLE %s");
+        execute("INSERT INTO %s (k, v) VALUES (1, [1, 2, 3]) USING TIMESTAMP ?", TIMESTAMP_1);
+        execute("INSERT INTO %s (k, v) VALUES (2, [1, 2]) USING TIMESTAMP ?", TIMESTAMP_2);
+        execute("INSERT INTO %s (k, v) VALUES (3, [1]) USING TIMESTAMP ?", TIMESTAMP_1);
+        execute("INSERT INTO %s (k, v) VALUES (4, []) USING TIMESTAMP ?", TIMESTAMP_2);
+        execute("INSERT INTO %s (k, v) VALUES (5, null) USING TIMESTAMP ?", TIMESTAMP_2);
+        assertRows("SELECT k, WRITETIME(v) FROM %s",
+                   row(5, NO_TIMESTAMP),
+                   row(1, TIMESTAMP_1),
+                   row(2, TIMESTAMP_2),
+                   row(4, TIMESTAMP_2),
+                   row(3, TIMESTAMP_1));
     }
 
     @Test
     public void testSet() throws Throwable
     {
         createTable("CREATE TABLE %s (k int PRIMARY KEY, s set<int>)");
-        assertInvalidMultiCellSelection("s", true);
+
+        // Null column
+        execute("INSERT INTO %s (k) VALUES (1) USING TIMESTAMP ? AND TTL ?", TIMESTAMP_1, TTL_1);
+        assertWritetimeAndTTL("s", NO_TIMESTAMP, NO_TTL);
+        assertWritetimeAndTTL("s[0]", NO_TIMESTAMP, NO_TTL);
+        assertWritetimeAndTTL("s[..0]", NO_TIMESTAMP, NO_TTL);
+        assertWritetimeAndTTL("s[0..]", NO_TIMESTAMP, NO_TTL);
+        assertWritetimeAndTTL("s[0..0]", NO_TIMESTAMP, NO_TTL);
+
+        // Create empty
+        execute("INSERT INTO %s (k, s) VALUES (1, {}) USING TIMESTAMP ?
AND TTL ?", TIMESTAMP_1, TTL_1); + assertWritetimeAndTTL("s", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("s[0]", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("s[..0]", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("s[0..]", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("s[0..0]", NO_TIMESTAMP, NO_TTL); + + // Update with a single element without TTL + execute("INSERT INTO %s (k, s) VALUES (1, {1}) USING TIMESTAMP ?", TIMESTAMP_1); + assertWritetimeAndTTL("s", timestamps(TIMESTAMP_1), ttls(NO_TTL)); + assertWritetimeAndTTL("s[0]", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("s[1]", TIMESTAMP_1, NO_TTL); + assertWritetimeAndTTL("s[2]", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("s[..0]", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("s[..1]", timestamps(TIMESTAMP_1), ttls(NO_TTL)); + assertWritetimeAndTTL("s[..2]", timestamps(TIMESTAMP_1), ttls(NO_TTL)); + assertWritetimeAndTTL("s[0..]", timestamps(TIMESTAMP_1), ttls(NO_TTL)); + assertWritetimeAndTTL("s[1..]", timestamps(TIMESTAMP_1), ttls(NO_TTL)); + assertWritetimeAndTTL("s[2..]", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("s[0..0]", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("s[0..1]", timestamps(TIMESTAMP_1), ttls(NO_TTL)); + assertWritetimeAndTTL("s[1..1]", timestamps(TIMESTAMP_1), ttls(NO_TTL)); + assertWritetimeAndTTL("s[1..2]", timestamps(TIMESTAMP_1), ttls(NO_TTL)); + assertWritetimeAndTTL("s[2..2]", NO_TIMESTAMP, NO_TTL); + + // Add a new element to the set with a new timestamp and a TTL + execute("UPDATE %s USING TIMESTAMP ? AND TTL ? SET s=s+{2} WHERE k=1", TIMESTAMP_2, TTL_2); + assertWritetimeAndTTL("s", timestamps(TIMESTAMP_1, TIMESTAMP_2), ttls(NO_TTL, TTL_2)); + assertWritetimeAndTTL("s[0]", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("s[1]", TIMESTAMP_1, NO_TTL); + assertWritetimeAndTTL("s[2]", TIMESTAMP_2, TTL_2); + assertWritetimeAndTTL("s[3]", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("s[..0]", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("s[..1]", timestamps(TIMESTAMP_1), ttls(NO_TTL)); + assertWritetimeAndTTL("s[..2]", timestamps(TIMESTAMP_1, TIMESTAMP_2), ttls(NO_TTL, TTL_2)); + assertWritetimeAndTTL("s[..3]", timestamps(TIMESTAMP_1, TIMESTAMP_2), ttls(NO_TTL, TTL_2)); + assertWritetimeAndTTL("s[0..]", timestamps(TIMESTAMP_1, TIMESTAMP_2), ttls(NO_TTL, TTL_2)); + assertWritetimeAndTTL("s[1..]", timestamps(TIMESTAMP_1, TIMESTAMP_2), ttls(NO_TTL, TTL_2)); + assertWritetimeAndTTL("s[2..]", timestamps(TIMESTAMP_2), ttls(TTL_2)); + assertWritetimeAndTTL("s[3..]", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("s[0..0]", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("s[0..1]", timestamps(TIMESTAMP_1), ttls(NO_TTL)); + assertWritetimeAndTTL("s[0..2]", timestamps(TIMESTAMP_1, TIMESTAMP_2), ttls(NO_TTL, TTL_2)); + assertWritetimeAndTTL("s[0..3]", timestamps(TIMESTAMP_1, TIMESTAMP_2), ttls(NO_TTL, TTL_2)); + assertWritetimeAndTTL("s[1..1]", timestamps(TIMESTAMP_1), ttls(NO_TTL)); + assertWritetimeAndTTL("s[1..2]", timestamps(TIMESTAMP_1, TIMESTAMP_2), ttls(NO_TTL, TTL_2)); + assertWritetimeAndTTL("s[1..3]", timestamps(TIMESTAMP_1, TIMESTAMP_2), ttls(NO_TTL, TTL_2)); + assertWritetimeAndTTL("s[2..2]", timestamps(TIMESTAMP_2), ttls(TTL_2)); + assertWritetimeAndTTL("s[2..3]", timestamps(TIMESTAMP_2), ttls(TTL_2)); + assertWritetimeAndTTL("s[3..3]", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("s[3..4]", NO_TIMESTAMP, NO_TTL); + + // Combine timestamp selection with other selections and orders + assertRows("SELECT k, WRITETIME(s[1]) FROM %s", row(1, TIMESTAMP_1)); + assertRows("SELECT 
WRITETIME(s[1]), k FROM %s", row(TIMESTAMP_1, 1));
+        assertRows("SELECT WRITETIME(s[1]), WRITETIME(s[2]) FROM %s", row(TIMESTAMP_1, TIMESTAMP_2));
+        assertRows("SELECT WRITETIME(s[2]), WRITETIME(s[1]) FROM %s", row(TIMESTAMP_2, TIMESTAMP_1));
+
+        // Read multiple rows to verify selector reset
+        execute("TRUNCATE TABLE %s");
+        execute("INSERT INTO %s (k, s) VALUES (1, {1, 2, 3}) USING TIMESTAMP ?", TIMESTAMP_1);
+        execute("INSERT INTO %s (k, s) VALUES (2, {1, 2}) USING TIMESTAMP ?", TIMESTAMP_2);
+        execute("INSERT INTO %s (k, s) VALUES (3, {1}) USING TIMESTAMP ?", TIMESTAMP_1);
+        execute("INSERT INTO %s (k, s) VALUES (4, {}) USING TIMESTAMP ?", TIMESTAMP_2);
+        execute("INSERT INTO %s (k, s) VALUES (5, null) USING TIMESTAMP ?", TIMESTAMP_2);
+        assertRows("SELECT k, WRITETIME(s) FROM %s",
+                   row(5, NO_TIMESTAMP),
+                   row(1, timestamps(TIMESTAMP_1, TIMESTAMP_1, TIMESTAMP_1)),
+                   row(2, timestamps(TIMESTAMP_2, TIMESTAMP_2)),
+                   row(4, NO_TIMESTAMP),
+                   row(3, timestamps(TIMESTAMP_1)));
+        assertRows("SELECT k, WRITETIME(s[1]) FROM %s",
+                   row(5, NO_TIMESTAMP),
+                   row(1, TIMESTAMP_1),
+                   row(2, TIMESTAMP_2),
+                   row(4, NO_TIMESTAMP),
+                   row(3, TIMESTAMP_1));
     }
 
     @Test
@@ -110,25 +266,182 @@ public void testFrozenSet() throws Throwable
         // Null column
         execute("INSERT INTO %s (k) VALUES (1) USING TIMESTAMP ? AND TTL ?", TIMESTAMP_1, TTL_1);
         assertWritetimeAndTTL("s", NO_TIMESTAMP, NO_TTL);
+        assertWritetimeAndTTL("s[0]", NO_TIMESTAMP, NO_TTL);
+        assertWritetimeAndTTL("s[..0]", NO_TIMESTAMP, NO_TTL);
+        assertWritetimeAndTTL("s[0..]", NO_TIMESTAMP, NO_TTL);
+        assertWritetimeAndTTL("s[0..0]", NO_TIMESTAMP, NO_TTL);
 
         // Create empty
         execute("INSERT INTO %s (k, s) VALUES (1, {}) USING TIMESTAMP ? AND TTL ?", TIMESTAMP_1, TTL_1);
         assertWritetimeAndTTL("s", TIMESTAMP_1, TTL_1);
+        assertWritetimeAndTTL("s[..0]", NO_TIMESTAMP, NO_TTL);
+        assertWritetimeAndTTL("s[0..]", NO_TIMESTAMP, NO_TTL);
+        assertWritetimeAndTTL("s[0]", NO_TIMESTAMP, NO_TTL);
+        assertWritetimeAndTTL("s[0..0]", NO_TIMESTAMP, NO_TTL);
+
+        // truncate, since previous columns would win on reconciliation because of their TTL (CASSANDRA-14592)
+        execute("TRUNCATE TABLE %s");
 
         // Update with a single element without TTL
         execute("INSERT INTO %s (k, s) VALUES (1, {1}) USING TIMESTAMP ?", TIMESTAMP_1);
         assertWritetimeAndTTL("s", TIMESTAMP_1, NO_TTL);
+        assertWritetimeAndTTL("s[0]", NO_TIMESTAMP, NO_TTL);
+        assertWritetimeAndTTL("s[1]", TIMESTAMP_1, NO_TTL);
+        assertWritetimeAndTTL("s[2]", NO_TIMESTAMP, NO_TTL);
+        assertWritetimeAndTTL("s[..0]", NO_TIMESTAMP, NO_TTL);
+        assertWritetimeAndTTL("s[..1]", TIMESTAMP_1, NO_TTL);
+        assertWritetimeAndTTL("s[..2]", TIMESTAMP_1, NO_TTL);
+        assertWritetimeAndTTL("s[0..]", TIMESTAMP_1, NO_TTL);
+        assertWritetimeAndTTL("s[1..]", TIMESTAMP_1, NO_TTL);
+        assertWritetimeAndTTL("s[2..]", NO_TIMESTAMP, NO_TTL);
+        assertWritetimeAndTTL("s[0..0]", NO_TIMESTAMP, NO_TTL);
+        assertWritetimeAndTTL("s[0..1]", TIMESTAMP_1, NO_TTL);
+        assertWritetimeAndTTL("s[0..2]", TIMESTAMP_1, NO_TTL);
+        assertWritetimeAndTTL("s[1..1]", TIMESTAMP_1, NO_TTL);
+        assertWritetimeAndTTL("s[1..2]", TIMESTAMP_1, NO_TTL);
+        assertWritetimeAndTTL("s[2..2]", NO_TIMESTAMP, NO_TTL);
+        assertWritetimeAndTTL("s[2..3]", NO_TIMESTAMP, NO_TTL);
 
         // Add a new element to the set with a new timestamp and a TTL
         execute("INSERT INTO %s (k, s) VALUES (1, {1, 2}) USING TIMESTAMP ? AND TTL ?", TIMESTAMP_2, TTL_2);
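+        // a frozen set is stored as a single cell, so the overwrite replaces the whole value
+        // and every element below is expected to report the latest timestamp and TTL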
AND TTL ?", TIMESTAMP_2, TTL_2); assertWritetimeAndTTL("s", TIMESTAMP_2, TTL_2); + assertWritetimeAndTTL("s[0]", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("s[1]", TIMESTAMP_2, TTL_2); + assertWritetimeAndTTL("s[2]", TIMESTAMP_2, TTL_2); + assertWritetimeAndTTL("s[3]", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("s[..0]", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("s[..1]", TIMESTAMP_2, TTL_2); + assertWritetimeAndTTL("s[..2]", TIMESTAMP_2, TTL_2); + assertWritetimeAndTTL("s[..3]", TIMESTAMP_2, TTL_2); + assertWritetimeAndTTL("s[0..]", TIMESTAMP_2, TTL_2); + assertWritetimeAndTTL("s[1..]", TIMESTAMP_2, TTL_2); + assertWritetimeAndTTL("s[2..]", TIMESTAMP_2, TTL_2); + assertWritetimeAndTTL("s[3..]", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("s[0..0]", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("s[0..1]", TIMESTAMP_2, TTL_2); + assertWritetimeAndTTL("s[0..2]", TIMESTAMP_2, TTL_2); + assertWritetimeAndTTL("s[0..3]", TIMESTAMP_2, TTL_2); + assertWritetimeAndTTL("s[1..1]", TIMESTAMP_2, TTL_2); + assertWritetimeAndTTL("s[1..2]", TIMESTAMP_2, TTL_2); + assertWritetimeAndTTL("s[1..3]", TIMESTAMP_2, TTL_2); + assertWritetimeAndTTL("s[2..2]", TIMESTAMP_2, TTL_2); + assertWritetimeAndTTL("s[2..3]", TIMESTAMP_2, TTL_2); + assertWritetimeAndTTL("s[3..3]", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("s[3..4]", NO_TIMESTAMP, NO_TTL); + + // Read multiple rows to verify selector reset + execute("TRUNCATE TABLE %s"); + execute("INSERT INTO %s (k, s) VALUES (1, {1, 2, 3}) USING TIMESTAMP ?", TIMESTAMP_1); + execute("INSERT INTO %s (k, s) VALUES (2, {1, 2}) USING TIMESTAMP ?", TIMESTAMP_2); + execute("INSERT INTO %s (k, s) VALUES (3, {1}) USING TIMESTAMP ?", TIMESTAMP_1); + execute("INSERT INTO %s (k, s) VALUES (4, {}) USING TIMESTAMP ?", TIMESTAMP_2); + execute("INSERT INTO %s (k, s) VALUES (5, null) USING TIMESTAMP ?", TIMESTAMP_2); + assertRows("SELECT k, WRITETIME(s) FROM %s", + row(5, NO_TIMESTAMP), + row(1, TIMESTAMP_1), + row(2, TIMESTAMP_2), + row(4, TIMESTAMP_2), + row(3, TIMESTAMP_1)); + assertRows("SELECT k, WRITETIME(s[1]) FROM %s", + row(5, NO_TIMESTAMP), + row(1, TIMESTAMP_1), + row(2, TIMESTAMP_2), + row(4, NO_TIMESTAMP), + row(3, TIMESTAMP_1)); } @Test public void testMap() throws Throwable { createTable("CREATE TABLE %s (k int PRIMARY KEY, m map)"); - assertInvalidMultiCellSelection("m", true); + + // Null column + execute("INSERT INTO %s (k) VALUES (1) USING TIMESTAMP ? AND TTL ?", TIMESTAMP_1, TTL_1); + assertWritetimeAndTTL("m", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("m[0]", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("m[..0]", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("m[0..]", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("m[0..0]", NO_TIMESTAMP, NO_TTL); + + // Create empty + execute("INSERT INTO %s (k, m) VALUES (1, {}) USING TIMESTAMP ? 
AND TTL ?", TIMESTAMP_1, TTL_1); + assertWritetimeAndTTL("m", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("m[0]", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("m[..0]", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("m[0..]", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("m[0..0]", NO_TIMESTAMP, NO_TTL); + + // Update with a single element without TTL + execute("INSERT INTO %s (k, m) VALUES (1, {1:10}) USING TIMESTAMP ?", TIMESTAMP_1); + assertWritetimeAndTTL("m", timestamps(TIMESTAMP_1), ttls(NO_TTL)); + assertWritetimeAndTTL("m[1]", TIMESTAMP_1, NO_TTL); + assertWritetimeAndTTL("m[2]", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("m[..0]", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("m[..1]", timestamps(TIMESTAMP_1), ttls(NO_TTL)); + assertWritetimeAndTTL("m[..2]", timestamps(TIMESTAMP_1), ttls(NO_TTL)); + assertWritetimeAndTTL("m[0..]", timestamps(TIMESTAMP_1), ttls(NO_TTL)); + assertWritetimeAndTTL("m[1..]", timestamps(TIMESTAMP_1), ttls(NO_TTL)); + assertWritetimeAndTTL("m[2..]", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("m[0..0]", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("m[0..1]", timestamps(TIMESTAMP_1), ttls(NO_TTL)); + assertWritetimeAndTTL("m[0..2]", timestamps(TIMESTAMP_1), ttls(NO_TTL)); + assertWritetimeAndTTL("m[1..1]", timestamps(TIMESTAMP_1), ttls(NO_TTL)); + assertWritetimeAndTTL("m[1..2]", timestamps(TIMESTAMP_1), ttls(NO_TTL)); + assertWritetimeAndTTL("m[2..2]", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("m[2..3]", NO_TIMESTAMP, NO_TTL); + + // Add a new element to the map with a new timestamp and a TTL + execute("UPDATE %s USING TIMESTAMP ? AND TTL ? SET m=m+{2:20} WHERE k=1", TIMESTAMP_2, TTL_2); + assertWritetimeAndTTL("m", timestamps(TIMESTAMP_1, TIMESTAMP_2), ttls(NO_TTL, TTL_2)); + assertWritetimeAndTTL("m[0]", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("m[1]", TIMESTAMP_1, NO_TTL); + assertWritetimeAndTTL("m[2]", TIMESTAMP_2, TTL_2); + assertWritetimeAndTTL("m[3]", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("m[..0]", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("m[..1]", timestamps(TIMESTAMP_1), ttls(NO_TTL)); + assertWritetimeAndTTL("m[..2]", timestamps(TIMESTAMP_1, TIMESTAMP_2), ttls(NO_TTL, TTL_2)); + assertWritetimeAndTTL("m[..3]", timestamps(TIMESTAMP_1, TIMESTAMP_2), ttls(NO_TTL, TTL_2)); + assertWritetimeAndTTL("m[0..]", timestamps(TIMESTAMP_1, TIMESTAMP_2), ttls(NO_TTL, TTL_2)); + assertWritetimeAndTTL("m[1..]", timestamps(TIMESTAMP_1, TIMESTAMP_2), ttls(NO_TTL, TTL_2)); + assertWritetimeAndTTL("m[2..]", timestamps(TIMESTAMP_2), ttls(TTL_2)); + assertWritetimeAndTTL("m[3..]", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("m[0..0]", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("m[0..1]", timestamps(TIMESTAMP_1), ttls(NO_TTL)); + assertWritetimeAndTTL("m[0..2]", timestamps(TIMESTAMP_1, TIMESTAMP_2), ttls(NO_TTL, TTL_2)); + assertWritetimeAndTTL("m[0..3]", timestamps(TIMESTAMP_1, TIMESTAMP_2), ttls(NO_TTL, TTL_2)); + assertWritetimeAndTTL("m[1..1]", timestamps(TIMESTAMP_1), ttls(NO_TTL)); + assertWritetimeAndTTL("m[1..2]", timestamps(TIMESTAMP_1, TIMESTAMP_2), ttls(NO_TTL, TTL_2)); + assertWritetimeAndTTL("m[1..3]", timestamps(TIMESTAMP_1, TIMESTAMP_2), ttls(NO_TTL, TTL_2)); + assertWritetimeAndTTL("m[2..2]", timestamps(TIMESTAMP_2), ttls(TTL_2)); + assertWritetimeAndTTL("m[2..3]", timestamps(TIMESTAMP_2), ttls(TTL_2)); + assertWritetimeAndTTL("m[3..3]", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("m[3..4]", NO_TIMESTAMP, NO_TTL); + + // Combine timestamp selection with other selections and orders + 
assertRows("SELECT k, WRITETIME(m[1]) FROM %s", row(1, TIMESTAMP_1)); + assertRows("SELECT WRITETIME(m[1]), k FROM %s", row(TIMESTAMP_1, 1)); + assertRows("SELECT WRITETIME(m[1]), WRITETIME(m[2]) FROM %s", row(TIMESTAMP_1, TIMESTAMP_2)); + assertRows("SELECT WRITETIME(m[2]), WRITETIME(m[1]) FROM %s", row(TIMESTAMP_2, TIMESTAMP_1)); + + // Read multiple rows to verify selector reset + execute("TRUNCATE TABLE %s"); + execute("INSERT INTO %s (k, m) VALUES (1, {1:10, 2:20, 3:30}) USING TIMESTAMP ?", TIMESTAMP_1); + execute("INSERT INTO %s (k, m) VALUES (2, {1:10, 2:20}) USING TIMESTAMP ?", TIMESTAMP_2); + execute("INSERT INTO %s (k, m) VALUES (3, {1:10}) USING TIMESTAMP ?", TIMESTAMP_1); + execute("INSERT INTO %s (k, m) VALUES (4, {}) USING TIMESTAMP ?", TIMESTAMP_2); + execute("INSERT INTO %s (k, m) VALUES (5, null) USING TIMESTAMP ?", TIMESTAMP_2); + assertRows("SELECT k, WRITETIME(m) FROM %s", + row(5, NO_TIMESTAMP), + row(1, timestamps(TIMESTAMP_1, TIMESTAMP_1, TIMESTAMP_1)), + row(2, timestamps(TIMESTAMP_2, TIMESTAMP_2)), + row(4, NO_TIMESTAMP), + row(3, timestamps(TIMESTAMP_1))); + assertRows("SELECT k, WRITETIME(m[1]) FROM %s", + row(5, NO_TIMESTAMP), + row(1, TIMESTAMP_1), + row(2, TIMESTAMP_2), + row(4, NO_TIMESTAMP), + row(3, TIMESTAMP_1)); } @Test @@ -139,18 +452,244 @@ public void testFrozenMap() throws Throwable // Null column execute("INSERT INTO %s (k) VALUES (1) USING TIMESTAMP ? AND TTL ?", TIMESTAMP_1, TTL_1); assertWritetimeAndTTL("m", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("m[0]", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("m[..0]", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("m[0..]", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("m[0..0]", NO_TIMESTAMP, NO_TTL); // Create empty execute("INSERT INTO %s (k, m) VALUES (1, {}) USING TIMESTAMP ? AND TTL ?", TIMESTAMP_1, TTL_1); assertWritetimeAndTTL("m", TIMESTAMP_1, TTL_1); + assertWritetimeAndTTL("m[0]", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("m[..0]", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("m[0..]", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("m[0..0]", NO_TIMESTAMP, NO_TTL); + + // truncate, since previous columns would win on reconcilliation because of their TTL (CASSANDRA-14592) + execute("TRUNCATE TABLE %s"); // Create with a single element without TTL execute("INSERT INTO %s (k, m) VALUES (1, {1:10}) USING TIMESTAMP ?", TIMESTAMP_1); assertWritetimeAndTTL("m", TIMESTAMP_1, NO_TTL); + assertWritetimeAndTTL("m[0]", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("m[1]", TIMESTAMP_1, NO_TTL); + assertWritetimeAndTTL("m[2]", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("m[..0]", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("m[..1]", TIMESTAMP_1, NO_TTL); + assertWritetimeAndTTL("m[..2]", TIMESTAMP_1, NO_TTL); + assertWritetimeAndTTL("m[0..]", TIMESTAMP_1, NO_TTL); + assertWritetimeAndTTL("m[1..]", TIMESTAMP_1, NO_TTL); + assertWritetimeAndTTL("m[2..]", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("m[0..0]", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("m[0..1]", TIMESTAMP_1, NO_TTL); + assertWritetimeAndTTL("m[0..2]", TIMESTAMP_1, NO_TTL); + assertWritetimeAndTTL("m[1..1]", TIMESTAMP_1, NO_TTL); + assertWritetimeAndTTL("m[1..2]", TIMESTAMP_1, NO_TTL); + assertWritetimeAndTTL("m[2..2]", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("m[2..3]", NO_TIMESTAMP, NO_TTL); // Add a new element to the map with a new timestamp and a TTL execute("INSERT INTO %s (k, m) VALUES (1, {1:10, 2:20}) USING TIMESTAMP ? 
AND TTL ?", TIMESTAMP_2, TTL_2); assertWritetimeAndTTL("m", TIMESTAMP_2, TTL_2); + assertWritetimeAndTTL("m[0]", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("m[1]", TIMESTAMP_2, TTL_2); + assertWritetimeAndTTL("m[2]", TIMESTAMP_2, TTL_2); + assertWritetimeAndTTL("m[3]", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("m[..0]", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("m[..1]", TIMESTAMP_2, TTL_2); + assertWritetimeAndTTL("m[..2]", TIMESTAMP_2, TTL_2); + assertWritetimeAndTTL("m[..3]", TIMESTAMP_2, TTL_2); + assertWritetimeAndTTL("m[0..]", TIMESTAMP_2, TTL_2); + assertWritetimeAndTTL("m[1..]", TIMESTAMP_2, TTL_2); + assertWritetimeAndTTL("m[2..]", TIMESTAMP_2, TTL_2); + assertWritetimeAndTTL("m[3..]", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("m[0..0]", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("m[0..1]", TIMESTAMP_2, TTL_2); + assertWritetimeAndTTL("m[0..2]", TIMESTAMP_2, TTL_2); + assertWritetimeAndTTL("m[0..3]", TIMESTAMP_2, TTL_2); + assertWritetimeAndTTL("m[1..1]", TIMESTAMP_2, TTL_2); + assertWritetimeAndTTL("m[1..2]", TIMESTAMP_2, TTL_2); + assertWritetimeAndTTL("m[1..3]", TIMESTAMP_2, TTL_2); + assertWritetimeAndTTL("m[2..2]", TIMESTAMP_2, TTL_2); + assertWritetimeAndTTL("m[2..3]", TIMESTAMP_2, TTL_2); + assertWritetimeAndTTL("m[3..3]", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("m[3..4]", NO_TIMESTAMP, NO_TTL); + + // Read multiple rows to verify selector reset + execute("TRUNCATE TABLE %s"); + execute("INSERT INTO %s (k, m) VALUES (1, {1:10, 2:20, 3:30}) USING TIMESTAMP ?", TIMESTAMP_1); + execute("INSERT INTO %s (k, m) VALUES (2, {1:10, 2:20}) USING TIMESTAMP ?", TIMESTAMP_2); + execute("INSERT INTO %s (k, m) VALUES (3, {1:10}) USING TIMESTAMP ?", TIMESTAMP_1); + execute("INSERT INTO %s (k, m) VALUES (4, {}) USING TIMESTAMP ?", TIMESTAMP_2); + execute("INSERT INTO %s (k, m) VALUES (5, null) USING TIMESTAMP ?", TIMESTAMP_2); + assertRows("SELECT k, WRITETIME(m) FROM %s", + row(5, NO_TIMESTAMP), + row(1, TIMESTAMP_1), + row(2, TIMESTAMP_2), + row(4, TIMESTAMP_2), + row(3, TIMESTAMP_1)); + assertRows("SELECT k, WRITETIME(m[1]) FROM %s", + row(5, NO_TIMESTAMP), + row(1, TIMESTAMP_1), + row(2, TIMESTAMP_2), + row(4, NO_TIMESTAMP), + row(3, TIMESTAMP_1)); + } + + @Test + public void testNestedCollections() throws Throwable + { + createTable("CREATE TABLE %s (k int PRIMARY KEY, v map>>)"); + + // Null column + execute("INSERT INTO %s (k) VALUES (1) USING TIMESTAMP ? AND TTL ?", TIMESTAMP_1, TTL_1); + assertWritetimeAndTTL("v", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("v[0]", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("v[0][0]", NO_TIMESTAMP, NO_TTL); + + execute("INSERT INTO %s (k, v) VALUES (1, {1:{1,2}}) USING TIMESTAMP ? AND TTL ?", TIMESTAMP_1, TTL_1); + execute("UPDATE %s USING TIMESTAMP ? AND TTL ? 
SET v=v+{2:{1, 2}} WHERE k=1", TIMESTAMP_2, TTL_2); + + assertWritetimeAndTTL("v", timestamps(TIMESTAMP_1, TIMESTAMP_2), ttls(TTL_1, TTL_2)); + + assertWritetimeAndTTL("v[0]", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("v[1]", TIMESTAMP_1, TTL_1); + assertWritetimeAndTTL("v[2]", TIMESTAMP_2, TTL_2); + assertWritetimeAndTTL("v[3]", NO_TIMESTAMP, NO_TTL); + + assertWritetimeAndTTL("v[0..0]", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("v[0..1]", timestamps(TIMESTAMP_1), ttls(TTL_1)); + assertWritetimeAndTTL("v[0..2]", timestamps(TIMESTAMP_1, TIMESTAMP_2), ttls(TTL_1, TTL_2)); + assertWritetimeAndTTL("v[0..3]", timestamps(TIMESTAMP_1, TIMESTAMP_2), ttls(TTL_1, TTL_2)); + assertWritetimeAndTTL("v[1..3]", timestamps(TIMESTAMP_1, TIMESTAMP_2), ttls(TTL_1, TTL_2)); + assertWritetimeAndTTL("v[2..3]", timestamps(TIMESTAMP_2), ttls(TTL_2)); + assertWritetimeAndTTL("v[3..3]", NO_TIMESTAMP, NO_TTL); + + assertWritetimeAndTTL("v[0][0]", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("v[0][1]", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("v[0][2]", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("v[0][3]", NO_TIMESTAMP, NO_TTL); + + assertWritetimeAndTTL("v[1][0]", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("v[1][1]", TIMESTAMP_1, TTL_1); + assertWritetimeAndTTL("v[1][2]", TIMESTAMP_1, TTL_1); + assertWritetimeAndTTL("v[1][3]", NO_TIMESTAMP, NO_TTL); + + assertWritetimeAndTTL("v[2][0]", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("v[2][1]", TIMESTAMP_2, TTL_2); + assertWritetimeAndTTL("v[2][2]", TIMESTAMP_2, TTL_2); + assertWritetimeAndTTL("v[2][3]", NO_TIMESTAMP, NO_TTL); + + assertWritetimeAndTTL("v[3][0]", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("v[3][1]", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("v[3][2]", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("v[3][3]", NO_TIMESTAMP, NO_TTL); + + assertWritetimeAndTTL("v[0][0..0]", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("v[0][0..1]", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("v[0][1..2]", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("v[0][2..3]", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("v[0][3..4]", NO_TIMESTAMP, NO_TTL); + + assertWritetimeAndTTL("v[1][0..0]", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("v[1][0..1]", TIMESTAMP_1, TTL_1); + assertWritetimeAndTTL("v[1][1..2]", TIMESTAMP_1, TTL_1); + assertWritetimeAndTTL("v[1][2..3]", TIMESTAMP_1, TTL_1); + assertWritetimeAndTTL("v[1][3..4]", NO_TIMESTAMP, NO_TTL); + + assertWritetimeAndTTL("v[2][0..0]", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("v[2][0..1]", TIMESTAMP_2, TTL_2); + assertWritetimeAndTTL("v[2][1..2]", TIMESTAMP_2, TTL_2); + assertWritetimeAndTTL("v[2][2..3]", TIMESTAMP_2, TTL_2); + assertWritetimeAndTTL("v[2][3..4]", NO_TIMESTAMP, NO_TTL); + + assertWritetimeAndTTL("v[3][0..0]", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("v[3][0..1]", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("v[3][1..2]", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("v[3][2..3]", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("v[3][3..4]", NO_TIMESTAMP, NO_TTL); + + assertWritetimeAndTTL("v[0..1][0]", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("v[0..1][1]", TIMESTAMP_1, TTL_1); + assertWritetimeAndTTL("v[0..1][2]", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("v[0..2][0]", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("v[0..2][1]", TIMESTAMP_1, TTL_1); + assertWritetimeAndTTL("v[0..2][2]", TIMESTAMP_2, TTL_2); + assertWritetimeAndTTL("v[0..2][3]", NO_TIMESTAMP, NO_TTL); + } + + @Test + public void testFrozenNestedCollections() throws Throwable + { 
+ createTable("CREATE TABLE %s (k int PRIMARY KEY, v frozen>>>)"); + execute("INSERT INTO %s (k, v) VALUES (1, {1:{1,2}, 2:{1,2}}) USING TIMESTAMP ? AND TTL ?", TIMESTAMP_1, TTL_1); + + assertWritetimeAndTTL("v", TIMESTAMP_1, TTL_1); + + assertWritetimeAndTTL("v[0]", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("v[1]", TIMESTAMP_1, TTL_1); + assertWritetimeAndTTL("v[2]", TIMESTAMP_1, TTL_1); + assertWritetimeAndTTL("v[3]", NO_TIMESTAMP, NO_TTL); + + assertWritetimeAndTTL("v[0][0]", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("v[0][1]", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("v[0][2]", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("v[0][3]", NO_TIMESTAMP, NO_TTL); + + assertWritetimeAndTTL("v[1][0]", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("v[1][1]", TIMESTAMP_1, TTL_1); + assertWritetimeAndTTL("v[1][2]", TIMESTAMP_1, TTL_1); + assertWritetimeAndTTL("v[1][3]", NO_TIMESTAMP, NO_TTL); + + assertWritetimeAndTTL("v[2][0]", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("v[2][1]", TIMESTAMP_1, TTL_1); + assertWritetimeAndTTL("v[2][2]", TIMESTAMP_1, TTL_1); + assertWritetimeAndTTL("v[2][3]", NO_TIMESTAMP, NO_TTL); + + assertWritetimeAndTTL("v[3][0]", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("v[3][1]", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("v[3][2]", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("v[3][3]", NO_TIMESTAMP, NO_TTL); + + assertWritetimeAndTTL("v[0][0..0]", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("v[0][0..1]", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("v[0][1..2]", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("v[0][2..3]", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("v[0][3..4]", NO_TIMESTAMP, NO_TTL); + + assertWritetimeAndTTL("v[1][0..0]", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("v[1][0..1]", TIMESTAMP_1, TTL_1); + assertWritetimeAndTTL("v[1][1..2]", TIMESTAMP_1, TTL_1); + assertWritetimeAndTTL("v[1][2..3]", TIMESTAMP_1, TTL_1); + assertWritetimeAndTTL("v[1][3..4]", NO_TIMESTAMP, NO_TTL); + + assertWritetimeAndTTL("v[2][0..0]", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("v[2][0..1]", TIMESTAMP_1, TTL_1); + assertWritetimeAndTTL("v[2][1..2]", TIMESTAMP_1, TTL_1); + assertWritetimeAndTTL("v[2][2..3]", TIMESTAMP_1, TTL_1); + assertWritetimeAndTTL("v[2][3..4]", NO_TIMESTAMP, NO_TTL); + + assertWritetimeAndTTL("v[3][0..0]", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("v[3][0..1]", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("v[3][1..2]", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("v[3][2..3]", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("v[3][3..4]", NO_TIMESTAMP, NO_TTL); + + assertWritetimeAndTTL("v[0..0]", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("v[0..1]", TIMESTAMP_1, TTL_1); + assertWritetimeAndTTL("v[1..2]", TIMESTAMP_1, TTL_1); + assertWritetimeAndTTL("v[2..3]", TIMESTAMP_1, TTL_1); + assertWritetimeAndTTL("v[3..4]", NO_TIMESTAMP, NO_TTL); + + assertWritetimeAndTTL("v[0..0][0]", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("v[0..0][1]", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("v[0..1][0]", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("v[0..1][1]", TIMESTAMP_1, TTL_1); + assertWritetimeAndTTL("v[0..1][2]", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("v[1..2][0]", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("v[1..2][1]", TIMESTAMP_1, TTL_1); + assertWritetimeAndTTL("v[1..2][2]", TIMESTAMP_1, TTL_1); + assertWritetimeAndTTL("v[1..2][3]", NO_TIMESTAMP, NO_TTL); } @Test @@ -158,7 +697,58 @@ public void testUDT() throws Throwable { String type = createType("CREATE TYPE %s (f1 int, f2 
int)"); createTable("CREATE TABLE %s (k int PRIMARY KEY, t " + type + ')'); - assertInvalidMultiCellSelection("t", false); + + // Null column + execute("INSERT INTO %s (k) VALUES (0) USING TIMESTAMP ? AND TTL ?", TIMESTAMP_1, TTL_1); + assertWritetimeAndTTL("t", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("t.f1", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("t.f2", NO_TIMESTAMP, NO_TTL); + + // Both fields are empty + execute("INSERT INTO %s (k, t) VALUES (0, {f1:null, f2:null}) USING TIMESTAMP ? AND TTL ?", TIMESTAMP_1, TTL_1); + assertWritetimeAndTTL("t", "k=0", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("t.f1", "k=0", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("t.f2", "k=0", NO_TIMESTAMP, NO_TTL); + assertRows("SELECT k, WRITETIME(t.f1), WRITETIME(t.f2) FROM %s WHERE k=0", row(0, NO_TIMESTAMP, NO_TIMESTAMP)); + + // Only the first field is set + execute("INSERT INTO %s (k, t) VALUES (1, {f1:1, f2:null}) USING TIMESTAMP ? AND TTL ?", TIMESTAMP_1, TTL_1); + assertWritetimeAndTTL("t", "k=1", timestamps(TIMESTAMP_1, NO_TIMESTAMP), ttls(TTL_1, NO_TTL)); + assertWritetimeAndTTL("t.f1", "k=1", TIMESTAMP_1, TTL_1); + assertWritetimeAndTTL("t.f2", "k=1", NO_TIMESTAMP, NO_TTL); + assertRows("SELECT k, WRITETIME(t.f1), WRITETIME(t.f2) FROM %s WHERE k=1", row(1, TIMESTAMP_1, NO_TIMESTAMP)); + assertRows("SELECT k, WRITETIME(t.f2), WRITETIME(t.f1) FROM %s WHERE k=1", row(1, NO_TIMESTAMP, TIMESTAMP_1)); + + // Only the second field is set + execute("INSERT INTO %s (k, t) VALUES (2, {f1:null, f2:2}) USING TIMESTAMP ? AND TTL ?", TIMESTAMP_1, TTL_1); + assertWritetimeAndTTL("t", "k=2", timestamps(NO_TIMESTAMP, TIMESTAMP_1), ttls(NO_TTL, TTL_1)); + assertWritetimeAndTTL("t.f1", "k=2", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("t.f2", "k=2", TIMESTAMP_1, TTL_1); + assertRows("SELECT k, WRITETIME(t.f1), WRITETIME(t.f2) FROM %s WHERE k=2", row(2, NO_TIMESTAMP, TIMESTAMP_1)); + assertRows("SELECT k, WRITETIME(t.f2), WRITETIME(t.f1) FROM %s WHERE k=2", row(2, TIMESTAMP_1, NO_TIMESTAMP)); + + // Both fields are set + execute("INSERT INTO %s (k, t) VALUES (3, {f1:1, f2:2}) USING TIMESTAMP ? AND TTL ?", TIMESTAMP_1, TTL_1); + assertWritetimeAndTTL("t", "k=3", timestamps(TIMESTAMP_1, TIMESTAMP_1), ttls(TTL_1, TTL_1)); + assertWritetimeAndTTL("t.f1", "k=3", TIMESTAMP_1, TTL_1); + assertWritetimeAndTTL("t.f2", "k=3", TIMESTAMP_1, TTL_1); + assertRows("SELECT k, WRITETIME(t.f1), WRITETIME(t.f2) FROM %s WHERE k=3", row(3, TIMESTAMP_1, TIMESTAMP_1)); + + // Having only the first field set, update the second field + execute("UPDATE %s USING TIMESTAMP ? AND TTL ? SET t.f2=2 WHERE k=1", TIMESTAMP_2, TTL_2); + assertWritetimeAndTTL("t", "k=1", timestamps(TIMESTAMP_1, TIMESTAMP_2), ttls(TTL_1, TTL_2)); + assertWritetimeAndTTL("t.f1", "k=1", TIMESTAMP_1, TTL_1); + assertWritetimeAndTTL("t.f2", "k=1", TIMESTAMP_2, TTL_2); + assertRows("SELECT k, WRITETIME(t.f1), WRITETIME(t.f2) FROM %s WHERE k=1", row(1, TIMESTAMP_1, TIMESTAMP_2)); + assertRows("SELECT k, WRITETIME(t.f2), WRITETIME(t.f1) FROM %s WHERE k=1", row(1, TIMESTAMP_2, TIMESTAMP_1)); + + // Having only the second field set, update the second field + execute("UPDATE %s USING TIMESTAMP ? AND TTL ? 
SET t.f1=1 WHERE k=2", TIMESTAMP_2, TTL_2);
+        assertWritetimeAndTTL("t", "k=2", timestamps(TIMESTAMP_2, TIMESTAMP_1), ttls(TTL_2, TTL_1));
+        assertWritetimeAndTTL("t.f1", "k=2", TIMESTAMP_2, TTL_2);
+        assertWritetimeAndTTL("t.f2", "k=2", TIMESTAMP_1, TTL_1);
+        assertRows("SELECT k, WRITETIME(t.f1), WRITETIME(t.f2) FROM %s WHERE k=2", row(2, TIMESTAMP_2, TIMESTAMP_1));
+        assertRows("SELECT k, WRITETIME(t.f2), WRITETIME(t.f1) FROM %s WHERE k=2", row(2, TIMESTAMP_1, TIMESTAMP_2));
     }
 
     @Test
@@ -170,22 +760,347 @@ public void testFrozenUDT() throws Throwable
         // Null column
         execute("INSERT INTO %s (k) VALUES (0) USING TIMESTAMP ? AND TTL ?", TIMESTAMP_1, TTL_1);
         assertWritetimeAndTTL("t", NO_TIMESTAMP, NO_TTL);
+        assertWritetimeAndTTL("t.f1", NO_TIMESTAMP, NO_TTL);
+        assertWritetimeAndTTL("t.f2", NO_TIMESTAMP, NO_TTL);
 
         // Both fields are empty
         execute("INSERT INTO %s (k, t) VALUES (0, {f1:null, f2:null}) USING TIMESTAMP ? AND TTL ?", TIMESTAMP_1, TTL_1);
         assertWritetimeAndTTL("t", "k=0", TIMESTAMP_1, TTL_1);
+        assertWritetimeAndTTL("t.f1", "k=0", NO_TIMESTAMP, NO_TTL);
+        assertWritetimeAndTTL("t.f2", "k=0", NO_TIMESTAMP, NO_TTL);
+        assertRows("SELECT k, WRITETIME(t.f1), WRITETIME(t.f2) FROM %s WHERE k=0", row(0, NO_TIMESTAMP, NO_TIMESTAMP));
 
         // Only the first field is set
         execute("INSERT INTO %s (k, t) VALUES (1, {f1:1, f2:null}) USING TIMESTAMP ? AND TTL ?", TIMESTAMP_1, TTL_1);
         assertWritetimeAndTTL("t", "k=1", TIMESTAMP_1, TTL_1);
+        assertWritetimeAndTTL("t.f1", "k=1", TIMESTAMP_1, TTL_1);
+        assertWritetimeAndTTL("t.f2", "k=1", NO_TIMESTAMP, NO_TTL);
+        assertRows("SELECT k, WRITETIME(t.f1), WRITETIME(t.f2) FROM %s WHERE k=1", row(1, TIMESTAMP_1, NO_TIMESTAMP));
+        assertRows("SELECT k, WRITETIME(t.f2), WRITETIME(t.f1) FROM %s WHERE k=1", row(1, NO_TIMESTAMP, TIMESTAMP_1));
 
         // Only the second field is set
         execute("INSERT INTO %s (k, t) VALUES (2, {f1:null, f2:2}) USING TIMESTAMP ? AND TTL ?", TIMESTAMP_1, TTL_1);
         assertWritetimeAndTTL("t", "k=2", TIMESTAMP_1, TTL_1);
+        assertWritetimeAndTTL("t.f1", "k=2", NO_TIMESTAMP, NO_TTL);
+        assertWritetimeAndTTL("t.f2", "k=2", TIMESTAMP_1, TTL_1);
+        assertRows("SELECT k, WRITETIME(t.f1), WRITETIME(t.f2) FROM %s WHERE k=2", row(2, NO_TIMESTAMP, TIMESTAMP_1));
+        assertRows("SELECT k, WRITETIME(t.f2), WRITETIME(t.f1) FROM %s WHERE k=2", row(2, TIMESTAMP_1, NO_TIMESTAMP));
 
         // Both fields are set
         execute("INSERT INTO %s (k, t) VALUES (3, {f1:1, f2:2}) USING TIMESTAMP ? AND TTL ?", TIMESTAMP_1, TTL_1);
         assertWritetimeAndTTL("t", "k=3", TIMESTAMP_1, TTL_1);
+        assertWritetimeAndTTL("t.f1", "k=3", TIMESTAMP_1, TTL_1);
+        assertWritetimeAndTTL("t.f2", "k=3", TIMESTAMP_1, TTL_1);
+        assertRows("SELECT k, WRITETIME(t.f1), WRITETIME(t.f2) FROM %s WHERE k=3", row(3, TIMESTAMP_1, TIMESTAMP_1));
+    }
+
+    @Test
+    public void testNestedUDTs() throws Throwable
+    {
+        String nestedType = createType("CREATE TYPE %s (f1 int, f2 int)");
+        String type = createType(format("CREATE TYPE %%s (f1 frozen<%s>, f2 frozen<%s>)", nestedType, nestedType));
+        createTable("CREATE TABLE %s (k int PRIMARY KEY, t " + type + ')');
+
+        // Both fields are empty
+        execute("INSERT INTO %s (k, t) VALUES (1, {f1:null, f2:null}) USING TIMESTAMP ? AND TTL ?", TIMESTAMP_1, TTL_1);
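+        // with every field null there are no live cells, so writetime/ttl of t and of each
+        // nested field should all resolve to NO_TIMESTAMP/NO_TTL below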
AND TTL ?", TIMESTAMP_1, TTL_1); + assertWritetimeAndTTL("t", "k=1", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("t.f1", "k=1", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("t.f1.f1", "k=1", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("t.f1.f2", "k=1", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("t.f2", "k=1", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("t.f2.f1", "k=1", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("t.f2.f2", "k=1", NO_TIMESTAMP, NO_TTL); + + // Only the first field is set, no nested field is set + execute("INSERT INTO %s (k, t) VALUES (2, {f1:{f1:null,f2:null}, f2:null}) USING TIMESTAMP ? AND TTL ?", TIMESTAMP_1, TTL_1); + assertWritetimeAndTTL("t", "k=2", timestamps(TIMESTAMP_1, NO_TIMESTAMP), ttls(TTL_1, NO_TTL)); + assertWritetimeAndTTL("t.f1", "k=2", TIMESTAMP_1, TTL_1); + assertWritetimeAndTTL("t.f1.f1", "k=2", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("t.f1.f2", "k=2", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("t.f2", "k=2", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("t.f2.f1", "k=2", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("t.f2.f2", "k=2", NO_TIMESTAMP, NO_TTL); + + // Only the first field is set, only the first nested field is set + execute("INSERT INTO %s (k, t) VALUES (3, {f1:{f1:1,f2:null}, f2:null}) USING TIMESTAMP ? AND TTL ?", TIMESTAMP_1, TTL_1); + assertWritetimeAndTTL("t", "k=2", timestamps(TIMESTAMP_1, NO_TIMESTAMP), ttls(TTL_1, NO_TTL)); + assertWritetimeAndTTL("t.f1", "k=3", TIMESTAMP_1, TTL_1); + assertWritetimeAndTTL("t.f1.f1", "k=3", TIMESTAMP_1, TTL_1); + assertWritetimeAndTTL("t.f1.f2", "k=3", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("t.f2", "k=3", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("t.f2.f1", "k=3", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("t.f2.f2", "k=3", NO_TIMESTAMP, NO_TTL); + + // Only the first field is set, only the second nested field is set + execute("INSERT INTO %s (k, t) VALUES (4, {f1:{f1:null,f2:2}, f2:null}) USING TIMESTAMP ? AND TTL ?", TIMESTAMP_1, TTL_1); + assertWritetimeAndTTL("t", "k=4", timestamps(TIMESTAMP_1, NO_TIMESTAMP), ttls(TTL_1, NO_TTL)); + assertWritetimeAndTTL("t.f1", "k=4", TIMESTAMP_1, TTL_1); + assertWritetimeAndTTL("t.f1.f1", "k=4", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("t.f1.f2", "k=4", TIMESTAMP_1, TTL_1); + assertWritetimeAndTTL("t.f2", "k=4", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("t.f2.f1", "k=4", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("t.f2.f2", "k=4", NO_TIMESTAMP, NO_TTL); + + // Only the first field is set, both nested field are set + execute("INSERT INTO %s (k, t) VALUES (5, {f1:{f1:1,f2:2}, f2:null}) USING TIMESTAMP ? AND TTL ?", TIMESTAMP_1, TTL_1); + assertWritetimeAndTTL("t", "k=5", timestamps(TIMESTAMP_1, NO_TIMESTAMP), ttls(TTL_1, NO_TTL)); + assertWritetimeAndTTL("t.f1", "k=5", TIMESTAMP_1, TTL_1); + assertWritetimeAndTTL("t.f1.f1", "k=5", TIMESTAMP_1, TTL_1); + assertWritetimeAndTTL("t.f1.f2", "k=5", TIMESTAMP_1, TTL_1); + assertWritetimeAndTTL("t.f2", "k=5", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("t.f2.f1", "k=5", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("t.f2.f2", "k=5", NO_TIMESTAMP, NO_TTL); + + // Only the second field is set, no nested field is set + execute("INSERT INTO %s (k, t) VALUES (6, {f1:null, f2:{f1:null,f2:null}}) USING TIMESTAMP ? 
AND TTL ?", TIMESTAMP_1, TTL_1); + assertWritetimeAndTTL("t", "k=6", timestamps(NO_TIMESTAMP, TIMESTAMP_1), ttls(NO_TTL, TTL_1)); + assertWritetimeAndTTL("t.f1", "k=6", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("t.f1.f1", "k=6", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("t.f1.f2", "k=6", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("t.f2", "k=6", TIMESTAMP_1, TTL_1); + assertWritetimeAndTTL("t.f2.f1", "k=6", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("t.f2.f2", "k=6", NO_TIMESTAMP, NO_TTL); + + // Only the second field is set, only the first nested field is set + execute("INSERT INTO %s (k, t) VALUES (7, {f1:null, f2:{f1:1,f2:null}}) USING TIMESTAMP ? AND TTL ?", TIMESTAMP_1, TTL_1); + assertWritetimeAndTTL("t", "k=7", timestamps(NO_TIMESTAMP, TIMESTAMP_1), ttls(NO_TTL, TTL_1)); + assertWritetimeAndTTL("t.f1", "k=7", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("t.f1.f1", "k=7", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("t.f1.f2", "k=7", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("t.f2", "k=7", TIMESTAMP_1, TTL_1); + assertWritetimeAndTTL("t.f2.f1", "k=7", TIMESTAMP_1, TTL_1); + assertWritetimeAndTTL("t.f2.f2", "k=7", NO_TIMESTAMP, NO_TTL); + + // Only the second field is set, only the second nested field is set + execute("INSERT INTO %s (k, t) VALUES (8, {f1:null, f2:{f1:null,f2:2}}) USING TIMESTAMP ? AND TTL ?", TIMESTAMP_1, TTL_1); + assertWritetimeAndTTL("t", "k=8", timestamps(NO_TIMESTAMP, TIMESTAMP_1), ttls(NO_TTL, TTL_1)); + assertWritetimeAndTTL("t.f1", "k=8", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("t.f1.f1", "k=8", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("t.f1.f2", "k=8", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("t.f2", "k=8", TIMESTAMP_1, TTL_1); + assertWritetimeAndTTL("t.f2.f1", "k=8", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("t.f2.f2", "k=8", TIMESTAMP_1, TTL_1); + + // Only the second field is set, both nested field are set + execute("INSERT INTO %s (k, t) VALUES (9, {f1:null, f2:{f1:1,f2:2}}) USING TIMESTAMP ? AND TTL ?", TIMESTAMP_1, TTL_1); + assertWritetimeAndTTL("t", "k=9", timestamps(NO_TIMESTAMP, TIMESTAMP_1), ttls(NO_TTL, TTL_1)); + assertWritetimeAndTTL("t.f1", "k=9", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("t.f1.f1", "k=9", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("t.f1.f2", "k=9", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("t.f2", "k=9", TIMESTAMP_1, TTL_1); + assertWritetimeAndTTL("t.f2.f1", "k=9", TIMESTAMP_1, TTL_1); + assertWritetimeAndTTL("t.f2.f2", "k=9", TIMESTAMP_1, TTL_1); + + // Both fields are set, alternate fields are set + execute("INSERT INTO %s (k, t) VALUES (10, {f1:{f1:1}}) USING TIMESTAMP ? AND TTL ?", TIMESTAMP_1, TTL_1); + execute("UPDATE %s USING TIMESTAMP ? AND TTL ? SET t.f2={f2:2} WHERE k=10", TIMESTAMP_2, TTL_2); + assertWritetimeAndTTL("t", "k=10", timestamps(TIMESTAMP_1, TIMESTAMP_2), ttls(TTL_1, TTL_2)); + assertWritetimeAndTTL("t.f1", "k=10", TIMESTAMP_1, TTL_1); + assertWritetimeAndTTL("t.f1.f1", "k=10", TIMESTAMP_1, TTL_1); + assertWritetimeAndTTL("t.f1.f2", "k=10", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("t.f2", "k=10", TIMESTAMP_2, TTL_2); + assertWritetimeAndTTL("t.f2.f1", "k=10", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("t.f2.f2", "k=10", TIMESTAMP_2, TTL_2); + + // Both fields are set, alternate fields are set + execute("INSERT INTO %s (k, t) VALUES (11, {f1:{f2:2}}) USING TIMESTAMP ? AND TTL ?", TIMESTAMP_1, TTL_1); + execute("UPDATE %s USING TIMESTAMP ? AND TTL ? 
SET t.f2={f1:2} WHERE k=11", TIMESTAMP_2, TTL_2);
+        assertWritetimeAndTTL("t", "k=11", timestamps(TIMESTAMP_1, TIMESTAMP_2), ttls(TTL_1, TTL_2));
+        assertWritetimeAndTTL("t.f1", "k=11", TIMESTAMP_1, TTL_1);
+        assertWritetimeAndTTL("t.f1.f1", "k=11", NO_TIMESTAMP, NO_TTL);
+        assertWritetimeAndTTL("t.f1.f2", "k=11", TIMESTAMP_1, TTL_1);
+        assertWritetimeAndTTL("t.f2", "k=11", TIMESTAMP_2, TTL_2);
+        assertWritetimeAndTTL("t.f2.f1", "k=11", TIMESTAMP_2, TTL_2);
+        assertWritetimeAndTTL("t.f2.f2", "k=11", NO_TIMESTAMP, NO_TTL);
+
+        // Both fields are set, all fields are set
+        execute("INSERT INTO %s (k, t) VALUES (12, {f1:{f1:1,f2:2}}) USING TIMESTAMP ? AND TTL ?", TIMESTAMP_1, TTL_1);
+        execute("UPDATE %s USING TIMESTAMP ? AND TTL ? SET t.f2={f1:1,f2:2} WHERE k=12", TIMESTAMP_2, TTL_2);
+        assertWritetimeAndTTL("t", "k=12", timestamps(TIMESTAMP_1, TIMESTAMP_2), ttls(TTL_1, TTL_2));
+        assertWritetimeAndTTL("t.f1", "k=12", TIMESTAMP_1, TTL_1);
+        assertWritetimeAndTTL("t.f1.f1", "k=12", TIMESTAMP_1, TTL_1);
+        assertWritetimeAndTTL("t.f1.f2", "k=12", TIMESTAMP_1, TTL_1);
+        assertWritetimeAndTTL("t.f2", "k=12", TIMESTAMP_2, TTL_2);
+        assertWritetimeAndTTL("t.f2.f1", "k=12", TIMESTAMP_2, TTL_2);
+        assertWritetimeAndTTL("t.f2.f2", "k=12", TIMESTAMP_2, TTL_2);
+    }
+
+    @Test
+    public void testFrozenNestedUDTs() throws Throwable
+    {
+        String nestedType = createType("CREATE TYPE %s (f1 int, f2 int)");
+        String type = createType(format("CREATE TYPE %%s (f1 frozen<%s>, f2 frozen<%s>)", nestedType, nestedType));
+        createTable("CREATE TABLE %s (k int PRIMARY KEY, t frozen<" + type + ">)");
+
+        // Both fields are empty
+        execute("INSERT INTO %s (k, t) VALUES (1, {f1:null, f2:null}) USING TIMESTAMP ? AND TTL ?", TIMESTAMP_1, TTL_1);
+        assertWritetimeAndTTL("t", "k=1", TIMESTAMP_1, TTL_1);
+        assertWritetimeAndTTL("t.f1", "k=1", NO_TIMESTAMP, NO_TTL);
+        assertWritetimeAndTTL("t.f1.f1", "k=1", NO_TIMESTAMP, NO_TTL);
+        assertWritetimeAndTTL("t.f1.f2", "k=1", NO_TIMESTAMP, NO_TTL);
+        assertWritetimeAndTTL("t.f2", "k=1", NO_TIMESTAMP, NO_TTL);
+        assertWritetimeAndTTL("t.f2.f1", "k=1", NO_TIMESTAMP, NO_TTL);
+        assertWritetimeAndTTL("t.f2.f2", "k=1", NO_TIMESTAMP, NO_TTL);
+
+        // Only the first field is set, no nested field is set
+        execute("INSERT INTO %s (k, t) VALUES (2, {f1:{f1:null,f2:null}, f2:null}) USING TIMESTAMP ? AND TTL ?", TIMESTAMP_1, TTL_1);
+        assertWritetimeAndTTL("t", "k=2", TIMESTAMP_1, TTL_1);
+        assertWritetimeAndTTL("t.f1", "k=2", TIMESTAMP_1, TTL_1);
+        assertWritetimeAndTTL("t.f1.f1", "k=2", NO_TIMESTAMP, NO_TTL);
+        assertWritetimeAndTTL("t.f1.f2", "k=2", NO_TIMESTAMP, NO_TTL);
+        assertWritetimeAndTTL("t.f2", "k=2", NO_TIMESTAMP, NO_TTL);
+        assertWritetimeAndTTL("t.f2.f1", "k=2", NO_TIMESTAMP, NO_TTL);
+        assertWritetimeAndTTL("t.f2.f2", "k=2", NO_TIMESTAMP, NO_TTL);
+
+        // Only the first field is set, only the first nested field is set
+        execute("INSERT INTO %s (k, t) VALUES (3, {f1:{f1:1,f2:null}, f2:null}) USING TIMESTAMP ?
AND TTL ?", TIMESTAMP_1, TTL_1); + assertWritetimeAndTTL("t", "k=3", TIMESTAMP_1, TTL_1); + assertWritetimeAndTTL("t.f1", "k=3", TIMESTAMP_1, TTL_1); + assertWritetimeAndTTL("t.f1.f1", "k=3", TIMESTAMP_1, TTL_1); + assertWritetimeAndTTL("t.f1.f2", "k=3", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("t.f2", "k=3", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("t.f2.f1", "k=3", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("t.f2.f2", "k=3", NO_TIMESTAMP, NO_TTL); + + // Only the first field is set, only the second nested field is set + execute("INSERT INTO %s (k, t) VALUES (4, {f1:{f1:null,f2:2}, f2:null}) USING TIMESTAMP ? AND TTL ?", TIMESTAMP_1, TTL_1); + assertWritetimeAndTTL("t", "k=4", TIMESTAMP_1, TTL_1); + assertWritetimeAndTTL("t.f1", "k=4", TIMESTAMP_1, TTL_1); + assertWritetimeAndTTL("t.f1.f1", "k=4", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("t.f1.f2", "k=4", TIMESTAMP_1, TTL_1); + assertWritetimeAndTTL("t.f2", "k=4", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("t.f2.f1", "k=4", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("t.f2.f2", "k=4", NO_TIMESTAMP, NO_TTL); + + // Only the first field is set, both nested field are set + execute("INSERT INTO %s (k, t) VALUES (5, {f1:{f1:1,f2:2}, f2:null}) USING TIMESTAMP ? AND TTL ?", TIMESTAMP_1, TTL_1); + assertWritetimeAndTTL("t", "k=5", TIMESTAMP_1, TTL_1); + assertWritetimeAndTTL("t.f1", "k=5", TIMESTAMP_1, TTL_1); + assertWritetimeAndTTL("t.f1.f1", "k=5", TIMESTAMP_1, TTL_1); + assertWritetimeAndTTL("t.f1.f2", "k=5", TIMESTAMP_1, TTL_1); + assertWritetimeAndTTL("t.f2", "k=5", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("t.f2.f1", "k=5", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("t.f2.f2", "k=5", NO_TIMESTAMP, NO_TTL); + + // Only the second field is set, no nested field is set + execute("INSERT INTO %s (k, t) VALUES (6, {f1:null, f2:{f1:null,f2:null}}) USING TIMESTAMP ? AND TTL ?", TIMESTAMP_1, TTL_1); + assertWritetimeAndTTL("t", "k=6", TIMESTAMP_1, TTL_1); + assertWritetimeAndTTL("t.f1", "k=6", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("t.f1.f1", "k=6", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("t.f1.f2", "k=6", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("t.f2", "k=6", TIMESTAMP_1, TTL_1); + assertWritetimeAndTTL("t.f2.f1", "k=6", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("t.f2.f2", "k=6", NO_TIMESTAMP, NO_TTL); + + // Only the second field is set, only the first nested field is set + execute("INSERT INTO %s (k, t) VALUES (7, {f1:null, f2:{f1:1,f2:null}}) USING TIMESTAMP ? AND TTL ?", TIMESTAMP_1, TTL_1); + assertWritetimeAndTTL("t", "k=7", TIMESTAMP_1, TTL_1); + assertWritetimeAndTTL("t.f1", "k=7", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("t.f1.f1", "k=7", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("t.f1.f2", "k=7", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("t.f2", "k=7", TIMESTAMP_1, TTL_1); + assertWritetimeAndTTL("t.f2.f1", "k=7", TIMESTAMP_1, TTL_1); + assertWritetimeAndTTL("t.f2.f2", "k=7", NO_TIMESTAMP, NO_TTL); + + // Only the second field is set, only the second nested field is set + execute("INSERT INTO %s (k, t) VALUES (8, {f1:null, f2:{f1:null,f2:2}}) USING TIMESTAMP ? 
AND TTL ?", TIMESTAMP_1, TTL_1); + assertWritetimeAndTTL("t", "k=8", TIMESTAMP_1, TTL_1); + assertWritetimeAndTTL("t.f1", "k=8", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("t.f1.f1", "k=8", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("t.f1.f2", "k=8", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("t.f2", "k=8", TIMESTAMP_1, TTL_1); + assertWritetimeAndTTL("t.f2.f1", "k=8", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("t.f2.f2", "k=8", TIMESTAMP_1, TTL_1); + + // Only the second field is set, both nested field are set + execute("INSERT INTO %s (k, t) VALUES (9, {f1:null, f2:{f1:1,f2:2}}) USING TIMESTAMP ? AND TTL ?", TIMESTAMP_1, TTL_1); + assertWritetimeAndTTL("t", "k=9", TIMESTAMP_1, TTL_1); + assertWritetimeAndTTL("t.f1", "k=9", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("t.f1.f1", "k=9", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("t.f1.f2", "k=9", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("t.f2", "k=9", TIMESTAMP_1, TTL_1); + assertWritetimeAndTTL("t.f2.f1", "k=9", TIMESTAMP_1, TTL_1); + assertWritetimeAndTTL("t.f2.f2", "k=9", TIMESTAMP_1, TTL_1); + + // Both fields are set, alternate fields are set + execute("INSERT INTO %s (k, t) VALUES (10, {f1:{f1:1}, f2:{f2:2}}) USING TIMESTAMP ? AND TTL ?", TIMESTAMP_1, TTL_1); + assertWritetimeAndTTL("t", "k=10", TIMESTAMP_1, TTL_1); + assertWritetimeAndTTL("t.f1", "k=10", TIMESTAMP_1, TTL_1); + assertWritetimeAndTTL("t.f1.f1", "k=10", TIMESTAMP_1, TTL_1); + assertWritetimeAndTTL("t.f1.f2", "k=10", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("t.f2", "k=10", TIMESTAMP_1, TTL_1); + assertWritetimeAndTTL("t.f2.f1", "k=10", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("t.f2.f2", "k=10", TIMESTAMP_1, TTL_1); + // Both fields are set, alternate fields are set + execute("INSERT INTO %s (k, t) VALUES (11, {f1:{f2:2}, f2:{f1:1}}) USING TIMESTAMP ? AND TTL ?", TIMESTAMP_1, TTL_1); + assertWritetimeAndTTL("t", "k=11", TIMESTAMP_1, TTL_1); + assertWritetimeAndTTL("t.f1", "k=11", TIMESTAMP_1, TTL_1); + assertWritetimeAndTTL("t.f1.f1", "k=11", NO_TIMESTAMP, NO_TTL); + assertWritetimeAndTTL("t.f1.f2", "k=11", TIMESTAMP_1, TTL_1); + assertWritetimeAndTTL("t.f2", "k=11", TIMESTAMP_1, TTL_1); + assertWritetimeAndTTL("t.f2.f1", "k=11", TIMESTAMP_1, TTL_1); + assertWritetimeAndTTL("t.f2.f2", "k=11", NO_TIMESTAMP, NO_TTL); + + // Both fields are set, all fields are set + execute("INSERT INTO %s (k, t) VALUES (12, {f1:{f1:1,f2:2},f2:{f1:1,f2:2}}) USING TIMESTAMP ? 
AND TTL ?", TIMESTAMP_1, TTL_1); + assertWritetimeAndTTL("t", "k=12", TIMESTAMP_1, TTL_1); + assertWritetimeAndTTL("t.f1", "k=12", TIMESTAMP_1, TTL_1); + assertWritetimeAndTTL("t.f1.f1", "k=12", TIMESTAMP_1, TTL_1); + assertWritetimeAndTTL("t.f1.f2", "k=12", TIMESTAMP_1, TTL_1); + assertWritetimeAndTTL("t.f2", "k=12", TIMESTAMP_1, TTL_1); + assertWritetimeAndTTL("t.f2.f1", "k=12", TIMESTAMP_1, TTL_1); + assertWritetimeAndTTL("t.f2.f2", "k=12", TIMESTAMP_1, TTL_1); + } + + @Test + public void testFunctions() throws Throwable + { + createTable("CREATE TABLE %s (k int PRIMARY KEY, v int, s set, fs frozen>)"); + execute("INSERT INTO %s (k, v, s, fs) VALUES (0, 0, {1, 2, 3}, {1, 2, 3}) USING TIMESTAMP 1 AND TTL 1000"); + execute("INSERT INTO %s (k, v, s, fs) VALUES (1, 1, {10, 20, 30}, {10, 20, 30}) USING TIMESTAMP 10 AND TTL 1000"); + execute("UPDATE %s USING TIMESTAMP 2 AND TTL 2000 SET s = s + {2, 3} WHERE k = 0"); + execute("UPDATE %s USING TIMESTAMP 20 AND TTL 2000 SET s = s + {20, 30} WHERE k = 1"); + + // Regular column + assertRows("SELECT min(v) FROM %s", row(0)); + assertRows("SELECT max(v) FROM %s", row(1)); + assertRows("SELECT writetime(v) FROM %s", row(10L), row(1L)); + assertRows("SELECT min(writetime(v)) FROM %s", row(1L)); + assertRows("SELECT max(writetime(v)) FROM %s", row(10L)); + assertRows("SELECT min(maxwritetime(v)) FROM %s", row(1L)); + assertRows("SELECT max(maxwritetime(v)) FROM %s", row(10L)); + + // Frozen collection + // Note that currently the tested system functions (min and max) return collections in their serialized format, + // this is something that we might want to improve in the future (CASSANDRA-17811). + assertRows("SELECT min(fs) FROM %s", row(ListType.getInstance(Int32Type.instance, false).decompose(Arrays.asList(1, 2, 3)))); + assertRows("SELECT max(fs) FROM %s", row(ListType.getInstance(Int32Type.instance, false).decompose(Arrays.asList(10, 20, 30)))); + assertRows("SELECT writetime(fs) FROM %s", row(10L), row(1L)); + assertRows("SELECT min(writetime(fs)) FROM %s", row(1L)); + assertRows("SELECT max(writetime(fs)) FROM %s", row(10L)); + assertRows("SELECT min(maxwritetime(fs)) FROM %s", row(1L)); + assertRows("SELECT max(maxwritetime(fs)) FROM %s", row(10L)); + + // Multi-cell collection + // Note that currently the tested system functions (min and max) return collections in their serialized format, + // this is something that we might want to improve in the future (CASSANDRA-17811). + assertRows("SELECT min(s) FROM %s", row(ListType.getInstance(Int32Type.instance, false).decompose(Arrays.asList(1, 2, 3)))); + assertRows("SELECT max(s) FROM %s", row(ListType.getInstance(Int32Type.instance, false).decompose(Arrays.asList(10, 20, 30)))); + assertRows("SELECT writetime(s) FROM %s", row(Arrays.asList(10L, 20L, 20L)), row(Arrays.asList(1L, 2L, 2L))); + assertRows("SELECT min(writetime(s)) FROM %s", row(ListType.getInstance(LongType.instance, false).decompose(Arrays.asList(1L, 2L, 2L)))); + assertRows("SELECT max(writetime(s)) FROM %s", row(ListType.getInstance(LongType.instance, false).decompose(Arrays.asList(10L, 20L, 20L)))); + assertRows("SELECT min(maxwritetime(s)) FROM %s", row(2L)); + assertRows("SELECT max(maxwritetime(s)) FROM %s", row(20L)); + } + + private static List ttls(Integer... a) + { + return Arrays.asList(a); + } + + private static List timestamps(Long... a) + { + return Arrays.asList(a); } private void assertRows(String query, Object[]... 
rows) throws Throwable @@ -198,27 +1113,34 @@ private void assertWritetimeAndTTL(String column, Long timestamp, Integer ttl) throws Throwable assertWritetimeAndTTL(column, null, timestamp, ttl); } + private void assertWritetimeAndTTL(String column, List<Long> timestamps, List<Integer> ttls) throws Throwable + { + assertWritetimeAndTTL(column, null, timestamps, ttls); + } + private void assertWritetimeAndTTL(String column, String where, Long timestamp, Integer ttl) throws Throwable { where = where == null ? "" : " WHERE " + where; // Verify write time - String writetimeQuery = String.format("SELECT WRITETIME(%s) FROM %%s %s", column, where); - assertRows(writetimeQuery, row(timestamp)); + assertRows(format("SELECT WRITETIME(%s) FROM %%s %s", column, where), row(timestamp)); // Verify max write time - String maxwritetimeQuery = String.format("SELECT MAXWRITETIME(%s) FROM %%s %s", column, where); - assertRows(maxwritetimeQuery, row(timestamp)); + assertRows(format("SELECT MAXWRITETIME(%s) FROM %%s %s", column, where), row(timestamp)); + + // Verify write time and max write time together + assertRows(format("SELECT WRITETIME(%s), MAXWRITETIME(%s) FROM %%s %s", column, column, where), + row(timestamp, timestamp)); // Verify ttl - UntypedResultSet rs = execute(String.format("SELECT TTL(%s) FROM %%s %s", column, where)); + UntypedResultSet rs = execute(format("SELECT TTL(%s) FROM %%s %s", column, where)); assertRowCount(rs, 1); UntypedResultSet.Row row = rs.one(); - String ttlColumn = String.format("ttl(%s)", column); + String ttlColumn = format("ttl(%s)", column); if (ttl == null) { - assertTTL(ttl, null); + assertFalse(row.has(ttlColumn)); } else { @@ -226,6 +1148,45 @@ private void assertWritetimeAndTTL(String column, String where, Long timestamp, } } + private void assertWritetimeAndTTL(String column, String where, List<Long> timestamps, List<Integer> ttls) + throws Throwable + { + where = where == null ?
"" : " WHERE " + where; + + // Verify write time + assertRows(format("SELECT WRITETIME(%s) FROM %%s %s", column, where), row(timestamps)); + + // Verify max write time + Long maxTimestamp = timestamps.stream().filter(Objects::nonNull).max(Long::compare).orElse(null); + assertRows(format("SELECT MAXWRITETIME(%s) FROM %%s %s", column, where), row(maxTimestamp)); + + // Verify write time and max write time together + assertRows(format("SELECT WRITETIME(%s), MAXWRITETIME(%s) FROM %%s %s", column, column, where), + row(timestamps, maxTimestamp)); + + // Verify ttl + UntypedResultSet rs = execute(format("SELECT TTL(%s) FROM %%s %s", column, where)); + assertRowCount(rs, 1); + UntypedResultSet.Row row = rs.one(); + String ttlColumn = format("ttl(%s)", column); + if (ttls == null) + { + assertFalse(row.has(ttlColumn)); + } + else + { + List<Integer> actualTTLs = row.getList(ttlColumn, Int32Type.instance); + assertEquals(ttls.size(), actualTTLs.size()); + + for (int i = 0; i < actualTTLs.size(); i++) + { + Integer expectedTTL = ttls.get(i); + Integer actualTTL = actualTTLs.get(i); + assertTTL(expectedTTL, actualTTL); + } + } + } + /** * Since the returned TTL is the remaining seconds since last update, it could be lower than the * specified TTL depending on the test execution time, so we allow up to one-minute difference @@ -248,25 +1209,40 @@ private void assertInvalidPrimaryKeySelection(String column) throws Throwable { assertInvalidThrowMessage("Cannot use selection function writetime on PRIMARY KEY part " + column, InvalidRequestException.class, - String.format("SELECT WRITETIME(%s) FROM %%s", column)); + format("SELECT WRITETIME(%s) FROM %%s", column)); assertInvalidThrowMessage("Cannot use selection function maxwritetime on PRIMARY KEY part " + column, InvalidRequestException.class, - String.format("SELECT MAXWRITETIME(%s) FROM %%s", column)); + format("SELECT MAXWRITETIME(%s) FROM %%s", column)); assertInvalidThrowMessage("Cannot use selection function ttl on PRIMARY KEY part " + column, InvalidRequestException.class, - String.format("SELECT TTL(%s) FROM %%s", column)); + format("SELECT TTL(%s) FROM %%s", column)); } - private void assertInvalidMultiCellSelection(String column, boolean isCollection) throws Throwable + private void assertInvalidListElementSelection(String column, String list) throws Throwable { - String message = format("Cannot use selection function %%s on non-frozen %s %s", - isCollection ?
"collection" : "UDT", column); - assertInvalidThrowMessage(format(message, "writetime"), + String message = format("Element selection is only allowed on sets and maps, but %s is a list", list); + assertInvalidThrowMessage(message, + InvalidRequestException.class, + format("SELECT WRITETIME(%s) FROM %%s", column)); + assertInvalidThrowMessage(message, + InvalidRequestException.class, + format("SELECT MAXWRITETIME(%s) FROM %%s", column)); + assertInvalidThrowMessage(message, + InvalidRequestException.class, + format("SELECT TTL(%s) FROM %%s", column)); + } + + private void assertInvalidListSliceSelection(String column, String list) throws Throwable + { + String message = format("Slice selection is only allowed on sets and maps, but %s is a list", list); + assertInvalidThrowMessage(message, + InvalidRequestException.class, + format("SELECT WRITETIME(%s) FROM %%s", column)); + assertInvalidThrowMessage(message, InvalidRequestException.class, - String.format("SELECT WRITETIME(%s) FROM %%s", column)); - execute(format("SELECT MAXWRITETIME(%s) FROM %%s", column)); - assertInvalidThrowMessage(format(message, "ttl"), + format("SELECT MAXWRITETIME(%s) FROM %%s", column)); + assertInvalidThrowMessage(message, InvalidRequestException.class, - String.format("SELECT TTL(%s) FROM %%s", column)); + format("SELECT TTL(%s) FROM %%s", column)); } } diff --git a/test/unit/org/apache/cassandra/db/aggregation/GroupMakerTest.java b/test/unit/org/apache/cassandra/db/aggregation/GroupMakerTest.java index 13fb0df23092..4363d81723cd 100644 --- a/test/unit/org/apache/cassandra/db/aggregation/GroupMakerTest.java +++ b/test/unit/org/apache/cassandra/db/aggregation/GroupMakerTest.java @@ -20,6 +20,7 @@ import java.nio.ByteBuffer; import java.util.ArrayList; import java.util.Arrays; +import java.util.Collections; import org.junit.BeforeClass; import org.junit.Test; @@ -327,6 +328,6 @@ private GroupMaker newSelectorGroupMaker(boolean... reversed) Selector.Factory factory = selectable.newSelectorFactory(table, null, new ArrayList<>(), VariableSpecifications.empty()); Selector selector = factory.newInstance(QueryOptions.DEFAULT); - return GroupMaker.newSelectorGroupMaker(table.comparator, reversed.length, selector); + return GroupMaker.newSelectorGroupMaker(table.comparator, reversed.length, selector, Collections.singletonList(column)); } } diff --git a/test/unit/org/apache/cassandra/serializers/MapSerializerTest.java b/test/unit/org/apache/cassandra/serializers/MapSerializerTest.java new file mode 100644 index 000000000000..b012123615ea --- /dev/null +++ b/test/unit/org/apache/cassandra/serializers/MapSerializerTest.java @@ -0,0 +1,125 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.cassandra.serializers; + +import java.nio.ByteBuffer; +import java.util.HashMap; +import java.util.Map; + +import com.google.common.collect.Range; +import org.junit.Test; + +import org.apache.cassandra.db.marshal.AbstractType; +import org.apache.cassandra.db.marshal.Int32Type; +import org.apache.cassandra.db.marshal.MapType; + +import static org.apache.cassandra.utils.ByteBufferUtil.UNSET_BYTE_BUFFER; +import static org.junit.Assert.assertEquals; + +public class MapSerializerTest +{ + @Test + public void testGetIndexFromSerialized() + { + testGetIndexFromSerialized(true); + testGetIndexFromSerialized(false); + } + + private static void testGetIndexFromSerialized(boolean isMultiCell) + { + MapType type = MapType.getInstance(Int32Type.instance, Int32Type.instance, isMultiCell); + AbstractType nameType = type.nameComparator(); + MapSerializer serializer = type.getSerializer(); + + Map map = new HashMap<>(4); + map.put(1, 10); + map.put(3, 30); + map.put(4, 40); + map.put(6, 60); + ByteBuffer bb = type.decompose(map); + + assertEquals(-1, serializer.getIndexFromSerialized(bb, nameType.decompose(0), nameType)); + assertEquals(0, serializer.getIndexFromSerialized(bb, nameType.decompose(1), nameType)); + assertEquals(-1, serializer.getIndexFromSerialized(bb, nameType.decompose(2), nameType)); + assertEquals(1, serializer.getIndexFromSerialized(bb, nameType.decompose(3), nameType)); + assertEquals(2, serializer.getIndexFromSerialized(bb, nameType.decompose(4), nameType)); + assertEquals(-1, serializer.getIndexFromSerialized(bb, nameType.decompose(5), nameType)); + assertEquals(3, serializer.getIndexFromSerialized(bb, nameType.decompose(6), nameType)); + assertEquals(-1, serializer.getIndexFromSerialized(bb, nameType.decompose(7), nameType)); + + assertEquals(Range.closed(0, Integer.MAX_VALUE), serializer.getIndexesRangeFromSerialized(bb, UNSET_BYTE_BUFFER, UNSET_BYTE_BUFFER, nameType)); + + assertEquals(Range.closedOpen(0, 4), serializer.getIndexesRangeFromSerialized(bb, nameType.decompose(0), UNSET_BYTE_BUFFER, nameType)); + assertEquals(Range.closedOpen(0, 4), serializer.getIndexesRangeFromSerialized(bb, nameType.decompose(1), UNSET_BYTE_BUFFER, nameType)); + assertEquals(Range.closedOpen(1, 4), serializer.getIndexesRangeFromSerialized(bb, nameType.decompose(2), UNSET_BYTE_BUFFER, nameType)); + assertEquals(Range.closedOpen(1, 4), serializer.getIndexesRangeFromSerialized(bb, nameType.decompose(3), UNSET_BYTE_BUFFER, nameType)); + assertEquals(Range.closedOpen(2, 4), serializer.getIndexesRangeFromSerialized(bb, nameType.decompose(4), UNSET_BYTE_BUFFER, nameType)); + assertEquals(Range.closedOpen(3, 4), serializer.getIndexesRangeFromSerialized(bb, nameType.decompose(5), UNSET_BYTE_BUFFER, nameType)); + assertEquals(Range.closedOpen(3, 4), serializer.getIndexesRangeFromSerialized(bb, nameType.decompose(6), UNSET_BYTE_BUFFER, nameType)); + assertEquals(Range.closedOpen(0, 0), serializer.getIndexesRangeFromSerialized(bb, nameType.decompose(7), UNSET_BYTE_BUFFER, nameType)); + + assertEquals(Range.closedOpen(0, 0), serializer.getIndexesRangeFromSerialized(bb, UNSET_BYTE_BUFFER, nameType.decompose(0), nameType)); + assertEquals(Range.closedOpen(0, 1), serializer.getIndexesRangeFromSerialized(bb, UNSET_BYTE_BUFFER, nameType.decompose(1), nameType)); + assertEquals(Range.closedOpen(0, 1), serializer.getIndexesRangeFromSerialized(bb, UNSET_BYTE_BUFFER, nameType.decompose(2), nameType)); + assertEquals(Range.closedOpen(0, 2), serializer.getIndexesRangeFromSerialized(bb, 
UNSET_BYTE_BUFFER, nameType.decompose(3), nameType)); + assertEquals(Range.closedOpen(0, 3), serializer.getIndexesRangeFromSerialized(bb, UNSET_BYTE_BUFFER, nameType.decompose(4), nameType)); + assertEquals(Range.closedOpen(0, 3), serializer.getIndexesRangeFromSerialized(bb, UNSET_BYTE_BUFFER, nameType.decompose(5), nameType)); + assertEquals(Range.closedOpen(0, 4), serializer.getIndexesRangeFromSerialized(bb, UNSET_BYTE_BUFFER, nameType.decompose(6), nameType)); + assertEquals(Range.closedOpen(0, 4), serializer.getIndexesRangeFromSerialized(bb, UNSET_BYTE_BUFFER, nameType.decompose(7), nameType)); + + assertEquals(Range.closedOpen(0, 0), serializer.getIndexesRangeFromSerialized(bb, nameType.decompose(0), nameType.decompose(0), nameType)); + assertEquals(Range.closedOpen(0, 1), serializer.getIndexesRangeFromSerialized(bb, nameType.decompose(0), nameType.decompose(1), nameType)); + assertEquals(Range.closedOpen(0, 1), serializer.getIndexesRangeFromSerialized(bb, nameType.decompose(0), nameType.decompose(2), nameType)); + assertEquals(Range.closedOpen(0, 2), serializer.getIndexesRangeFromSerialized(bb, nameType.decompose(0), nameType.decompose(3), nameType)); + assertEquals(Range.closedOpen(0, 3), serializer.getIndexesRangeFromSerialized(bb, nameType.decompose(0), nameType.decompose(4), nameType)); + assertEquals(Range.closedOpen(0, 3), serializer.getIndexesRangeFromSerialized(bb, nameType.decompose(0), nameType.decompose(5), nameType)); + assertEquals(Range.closedOpen(0, 4), serializer.getIndexesRangeFromSerialized(bb, nameType.decompose(0), nameType.decompose(6), nameType)); + assertEquals(Range.closedOpen(0, 4), serializer.getIndexesRangeFromSerialized(bb, nameType.decompose(0), nameType.decompose(7), nameType)); + + assertEquals(Range.closedOpen(0, 0), serializer.getIndexesRangeFromSerialized(bb, nameType.decompose(1), nameType.decompose(0), nameType)); + assertEquals(Range.closedOpen(0, 1), serializer.getIndexesRangeFromSerialized(bb, nameType.decompose(1), nameType.decompose(1), nameType)); + assertEquals(Range.closedOpen(0, 1), serializer.getIndexesRangeFromSerialized(bb, nameType.decompose(1), nameType.decompose(2), nameType)); + assertEquals(Range.closedOpen(0, 2), serializer.getIndexesRangeFromSerialized(bb, nameType.decompose(1), nameType.decompose(3), nameType)); + assertEquals(Range.closedOpen(0, 3), serializer.getIndexesRangeFromSerialized(bb, nameType.decompose(1), nameType.decompose(4), nameType)); + assertEquals(Range.closedOpen(0, 3), serializer.getIndexesRangeFromSerialized(bb, nameType.decompose(1), nameType.decompose(5), nameType)); + assertEquals(Range.closedOpen(0, 4), serializer.getIndexesRangeFromSerialized(bb, nameType.decompose(1), nameType.decompose(6), nameType)); + assertEquals(Range.closedOpen(0, 4), serializer.getIndexesRangeFromSerialized(bb, nameType.decompose(1), nameType.decompose(7), nameType)); + + assertEquals(Range.closedOpen(1, 1), serializer.getIndexesRangeFromSerialized(bb, nameType.decompose(2), nameType.decompose(0), nameType)); + assertEquals(Range.closedOpen(1, 1), serializer.getIndexesRangeFromSerialized(bb, nameType.decompose(2), nameType.decompose(1), nameType)); + assertEquals(Range.closedOpen(1, 1), serializer.getIndexesRangeFromSerialized(bb, nameType.decompose(2), nameType.decompose(2), nameType)); + assertEquals(Range.closedOpen(1, 2), serializer.getIndexesRangeFromSerialized(bb, nameType.decompose(2), nameType.decompose(3), nameType)); + assertEquals(Range.closedOpen(1, 3), serializer.getIndexesRangeFromSerialized(bb, 
nameType.decompose(2), nameType.decompose(4), nameType)); + assertEquals(Range.closedOpen(1, 3), serializer.getIndexesRangeFromSerialized(bb, nameType.decompose(2), nameType.decompose(5), nameType)); + assertEquals(Range.closedOpen(1, 4), serializer.getIndexesRangeFromSerialized(bb, nameType.decompose(2), nameType.decompose(6), nameType)); + assertEquals(Range.closedOpen(1, 4), serializer.getIndexesRangeFromSerialized(bb, nameType.decompose(2), nameType.decompose(7), nameType)); + + assertEquals(Range.closedOpen(0, 0), serializer.getIndexesRangeFromSerialized(bb, nameType.decompose(0), nameType.decompose(0), nameType)); + assertEquals(Range.closedOpen(0, 1), serializer.getIndexesRangeFromSerialized(bb, nameType.decompose(1), nameType.decompose(1), nameType)); + assertEquals(Range.closedOpen(1, 1), serializer.getIndexesRangeFromSerialized(bb, nameType.decompose(2), nameType.decompose(2), nameType)); + assertEquals(Range.closedOpen(1, 2), serializer.getIndexesRangeFromSerialized(bb, nameType.decompose(3), nameType.decompose(3), nameType)); + assertEquals(Range.closedOpen(2, 3), serializer.getIndexesRangeFromSerialized(bb, nameType.decompose(4), nameType.decompose(4), nameType)); + assertEquals(Range.closedOpen(3, 3), serializer.getIndexesRangeFromSerialized(bb, nameType.decompose(5), nameType.decompose(5), nameType)); + assertEquals(Range.closedOpen(3, 4), serializer.getIndexesRangeFromSerialized(bb, nameType.decompose(6), nameType.decompose(6), nameType)); + assertEquals(Range.closedOpen(0, 0), serializer.getIndexesRangeFromSerialized(bb, nameType.decompose(7), nameType.decompose(7), nameType)); + + // interval with lower bound greater than upper bound + assertEquals(Range.closedOpen(0, 0), serializer.getIndexesRangeFromSerialized(bb, nameType.decompose(7), nameType.decompose(0), nameType)); + } +} diff --git a/test/unit/org/apache/cassandra/serializers/SetSerializerTest.java b/test/unit/org/apache/cassandra/serializers/SetSerializerTest.java new file mode 100644 index 000000000000..07522e286d80 --- /dev/null +++ b/test/unit/org/apache/cassandra/serializers/SetSerializerTest.java @@ -0,0 +1,121 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.cassandra.serializers; + +import java.nio.ByteBuffer; +import java.util.Arrays; +import java.util.HashSet; +import java.util.Set; + +import com.google.common.collect.Range; +import org.junit.Test; + +import org.apache.cassandra.db.marshal.AbstractType; +import org.apache.cassandra.db.marshal.Int32Type; +import org.apache.cassandra.db.marshal.SetType; +import static org.junit.Assert.assertEquals; +import static org.apache.cassandra.utils.ByteBufferUtil.UNSET_BYTE_BUFFER; + +public class SetSerializerTest +{ + @Test + public void testGetIndexFromSerialized() + { + testGetIndexFromSerialized(true); + testGetIndexFromSerialized(false); + } + + private static void testGetIndexFromSerialized(boolean isMultiCell) + { + SetType type = SetType.getInstance(Int32Type.instance, isMultiCell); + AbstractType nameType = type.nameComparator(); + SetSerializer serializer = type.getSerializer(); + + Set set = new HashSet<>(Arrays.asList(1, 3, 4, 6)); + ByteBuffer bb = type.decompose(set); + + assertEquals(-1, serializer.getIndexFromSerialized(bb, nameType.decompose(0), nameType)); + assertEquals(0, serializer.getIndexFromSerialized(bb, nameType.decompose(1), nameType)); + assertEquals(-1, serializer.getIndexFromSerialized(bb, nameType.decompose(2), nameType)); + assertEquals(1, serializer.getIndexFromSerialized(bb, nameType.decompose(3), nameType)); + assertEquals(2, serializer.getIndexFromSerialized(bb, nameType.decompose(4), nameType)); + assertEquals(-1, serializer.getIndexFromSerialized(bb, nameType.decompose(5), nameType)); + assertEquals(3, serializer.getIndexFromSerialized(bb, nameType.decompose(6), nameType)); + assertEquals(-1, serializer.getIndexFromSerialized(bb, nameType.decompose(7), nameType)); + + assertEquals(Range.closed(0, Integer.MAX_VALUE), serializer.getIndexesRangeFromSerialized(bb, UNSET_BYTE_BUFFER, UNSET_BYTE_BUFFER, nameType)); + + assertEquals(Range.closedOpen(0, 4), serializer.getIndexesRangeFromSerialized(bb, nameType.decompose(0), UNSET_BYTE_BUFFER, nameType)); + assertEquals(Range.closedOpen(0, 4), serializer.getIndexesRangeFromSerialized(bb, nameType.decompose(1), UNSET_BYTE_BUFFER, nameType)); + assertEquals(Range.closedOpen(1, 4), serializer.getIndexesRangeFromSerialized(bb, nameType.decompose(2), UNSET_BYTE_BUFFER, nameType)); + assertEquals(Range.closedOpen(1, 4), serializer.getIndexesRangeFromSerialized(bb, nameType.decompose(3), UNSET_BYTE_BUFFER, nameType)); + assertEquals(Range.closedOpen(2, 4), serializer.getIndexesRangeFromSerialized(bb, nameType.decompose(4), UNSET_BYTE_BUFFER, nameType)); + assertEquals(Range.closedOpen(3, 4), serializer.getIndexesRangeFromSerialized(bb, nameType.decompose(5), UNSET_BYTE_BUFFER, nameType)); + assertEquals(Range.closedOpen(3, 4), serializer.getIndexesRangeFromSerialized(bb, nameType.decompose(6), UNSET_BYTE_BUFFER, nameType)); + assertEquals(Range.closedOpen(0, 0), serializer.getIndexesRangeFromSerialized(bb, nameType.decompose(7), UNSET_BYTE_BUFFER, nameType)); + + assertEquals(Range.closedOpen(0, 0), serializer.getIndexesRangeFromSerialized(bb, UNSET_BYTE_BUFFER, nameType.decompose(0), nameType)); + assertEquals(Range.closedOpen(0, 1), serializer.getIndexesRangeFromSerialized(bb, UNSET_BYTE_BUFFER, nameType.decompose(1), nameType)); + assertEquals(Range.closedOpen(0, 1), serializer.getIndexesRangeFromSerialized(bb, UNSET_BYTE_BUFFER, nameType.decompose(2), nameType)); + assertEquals(Range.closedOpen(0, 2), serializer.getIndexesRangeFromSerialized(bb, UNSET_BYTE_BUFFER, nameType.decompose(3), nameType)); 
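[Editor's aside on the range assertions above and below: the closed-open index ranges follow from two binary searches over the sorted serialized elements. The lower endpoint is the first index whose element is >= the lower bound, the upper endpoint is the first index whose element is > the upper bound, an unset bound maps to 0 or the element count, and slices that are inverted or fall entirely past the end are normalized to the empty [0, 0) range. A hedged, standalone sketch of that index arithmetic over a plain sorted array, not Cassandra's serializer API:

    static int lowerBound(int[] sorted, int key)
    {
        int lo = 0, hi = sorted.length;
        while (lo < hi)
        {
            int mid = (lo + hi) >>> 1;
            if (sorted[mid] < key) lo = mid + 1; // seeking first index with sorted[mid] >= key
            else hi = mid;
        }
        return lo;
    }

    static int upperBound(int[] sorted, int key)
    {
        int lo = 0, hi = sorted.length;
        while (lo < hi)
        {
            int mid = (lo + hi) >>> 1;
            if (sorted[mid] <= key) lo = mid + 1; // seeking first index with sorted[mid] > key
            else hi = mid;
        }
        return lo;
    }

For the elements {1, 3, 4, 6} used in this test, bounds (2, 4) give [lowerBound = 1, upperBound = 3), matching the asserted Range.closedOpen(1, 3).]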
+ assertEquals(Range.closedOpen(0, 3), serializer.getIndexesRangeFromSerialized(bb, UNSET_BYTE_BUFFER, nameType.decompose(4), nameType)); + assertEquals(Range.closedOpen(0, 3), serializer.getIndexesRangeFromSerialized(bb, UNSET_BYTE_BUFFER, nameType.decompose(5), nameType)); + assertEquals(Range.closedOpen(0, 4), serializer.getIndexesRangeFromSerialized(bb, UNSET_BYTE_BUFFER, nameType.decompose(6), nameType)); + assertEquals(Range.closedOpen(0, 4), serializer.getIndexesRangeFromSerialized(bb, UNSET_BYTE_BUFFER, nameType.decompose(7), nameType)); + + assertEquals(Range.closedOpen(0, 0), serializer.getIndexesRangeFromSerialized(bb, nameType.decompose(0), nameType.decompose(0), nameType)); + assertEquals(Range.closedOpen(0, 1), serializer.getIndexesRangeFromSerialized(bb, nameType.decompose(0), nameType.decompose(1), nameType)); + assertEquals(Range.closedOpen(0, 1), serializer.getIndexesRangeFromSerialized(bb, nameType.decompose(0), nameType.decompose(2), nameType)); + assertEquals(Range.closedOpen(0, 2), serializer.getIndexesRangeFromSerialized(bb, nameType.decompose(0), nameType.decompose(3), nameType)); + assertEquals(Range.closedOpen(0, 3), serializer.getIndexesRangeFromSerialized(bb, nameType.decompose(0), nameType.decompose(4), nameType)); + assertEquals(Range.closedOpen(0, 3), serializer.getIndexesRangeFromSerialized(bb, nameType.decompose(0), nameType.decompose(5), nameType)); + assertEquals(Range.closedOpen(0, 4), serializer.getIndexesRangeFromSerialized(bb, nameType.decompose(0), nameType.decompose(6), nameType)); + assertEquals(Range.closedOpen(0, 4), serializer.getIndexesRangeFromSerialized(bb, nameType.decompose(0), nameType.decompose(7), nameType)); + + assertEquals(Range.closedOpen(0, 0), serializer.getIndexesRangeFromSerialized(bb, nameType.decompose(1), nameType.decompose(0), nameType)); + assertEquals(Range.closedOpen(0, 1), serializer.getIndexesRangeFromSerialized(bb, nameType.decompose(1), nameType.decompose(1), nameType)); + assertEquals(Range.closedOpen(0, 1), serializer.getIndexesRangeFromSerialized(bb, nameType.decompose(1), nameType.decompose(2), nameType)); + assertEquals(Range.closedOpen(0, 2), serializer.getIndexesRangeFromSerialized(bb, nameType.decompose(1), nameType.decompose(3), nameType)); + assertEquals(Range.closedOpen(0, 3), serializer.getIndexesRangeFromSerialized(bb, nameType.decompose(1), nameType.decompose(4), nameType)); + assertEquals(Range.closedOpen(0, 3), serializer.getIndexesRangeFromSerialized(bb, nameType.decompose(1), nameType.decompose(5), nameType)); + assertEquals(Range.closedOpen(0, 4), serializer.getIndexesRangeFromSerialized(bb, nameType.decompose(1), nameType.decompose(6), nameType)); + assertEquals(Range.closedOpen(0, 4), serializer.getIndexesRangeFromSerialized(bb, nameType.decompose(1), nameType.decompose(7), nameType)); + + assertEquals(Range.closedOpen(1, 1), serializer.getIndexesRangeFromSerialized(bb, nameType.decompose(2), nameType.decompose(0), nameType)); + assertEquals(Range.closedOpen(1, 1), serializer.getIndexesRangeFromSerialized(bb, nameType.decompose(2), nameType.decompose(1), nameType)); + assertEquals(Range.closedOpen(1, 1), serializer.getIndexesRangeFromSerialized(bb, nameType.decompose(2), nameType.decompose(2), nameType)); + assertEquals(Range.closedOpen(1, 2), serializer.getIndexesRangeFromSerialized(bb, nameType.decompose(2), nameType.decompose(3), nameType)); + assertEquals(Range.closedOpen(1, 3), serializer.getIndexesRangeFromSerialized(bb, nameType.decompose(2), nameType.decompose(4), nameType)); + 
assertEquals(Range.closedOpen(1, 3), serializer.getIndexesRangeFromSerialized(bb, nameType.decompose(2), nameType.decompose(5), nameType)); + assertEquals(Range.closedOpen(1, 4), serializer.getIndexesRangeFromSerialized(bb, nameType.decompose(2), nameType.decompose(6), nameType)); + assertEquals(Range.closedOpen(1, 4), serializer.getIndexesRangeFromSerialized(bb, nameType.decompose(2), nameType.decompose(7), nameType)); + + assertEquals(Range.closedOpen(0, 0), serializer.getIndexesRangeFromSerialized(bb, nameType.decompose(0), nameType.decompose(0), nameType)); + assertEquals(Range.closedOpen(0, 1), serializer.getIndexesRangeFromSerialized(bb, nameType.decompose(1), nameType.decompose(1), nameType)); + assertEquals(Range.closedOpen(1, 1), serializer.getIndexesRangeFromSerialized(bb, nameType.decompose(2), nameType.decompose(2), nameType)); + assertEquals(Range.closedOpen(1, 2), serializer.getIndexesRangeFromSerialized(bb, nameType.decompose(3), nameType.decompose(3), nameType)); + assertEquals(Range.closedOpen(2, 3), serializer.getIndexesRangeFromSerialized(bb, nameType.decompose(4), nameType.decompose(4), nameType)); + assertEquals(Range.closedOpen(3, 3), serializer.getIndexesRangeFromSerialized(bb, nameType.decompose(5), nameType.decompose(5), nameType)); + assertEquals(Range.closedOpen(3, 4), serializer.getIndexesRangeFromSerialized(bb, nameType.decompose(6), nameType.decompose(6), nameType)); + assertEquals(Range.closedOpen(0, 0), serializer.getIndexesRangeFromSerialized(bb, nameType.decompose(7), nameType.decompose(7), nameType)); + + // interval with lower bound greater than upper bound + assertEquals(Range.closedOpen(0, 0), serializer.getIndexesRangeFromSerialized(bb, nameType.decompose(7), nameType.decompose(0), nameType)); + } +} From 60db95cba10d3ba0f8b1dcf377b42571cd2101ea Mon Sep 17 00:00:00 2001 From: David Capwell Date: Fri, 12 Aug 2022 15:20:37 -0700 Subject: [PATCH 041/159] DataOutputBuffer#scratchBuffer can use off-heap or on-heap memory as a means to control memory allocations patch by David Capwell; reviewed by Caleb Rackliffe for CASSANDRA-16471 --- CHANGES.txt | 1 + .../config/CassandraRelevantProperties.java | 38 +++++++++++++++- .../cassandra/db/commitlog/CommitLog.java | 2 +- .../db/rows/UnfilteredSerializer.java | 2 +- .../apache/cassandra/hints/HintsWriter.java | 2 +- .../io/util/BufferedDataOutputStreamPlus.java | 10 +++++ .../cassandra/io/util/DataOutputBuffer.java | 43 ++++++++++++++++--- 7 files changed, 88 insertions(+), 10 deletions(-) diff --git a/CHANGES.txt b/CHANGES.txt index b7cadc35f6da..6097abe8977f 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -1,4 +1,5 @@ 4.2 + * DataOutputBuffer#scratchBuffer can use off-heap or on-heap memory as a means to control memory allocations (CASSANDRA-16471) * Add ability to read the TTLs and write times of the elements of a collection and/or UDT (CASSANDRA-8877) * Removed Python < 2.7 support from formatting.py (CASSANDRA-17694) * Cleanup pylint issues with pylexotron.py (CASSANDRA-17779) diff --git a/src/java/org/apache/cassandra/config/CassandraRelevantProperties.java b/src/java/org/apache/cassandra/config/CassandraRelevantProperties.java index 6eea3239765e..00c2f4cd28be 100644 --- a/src/java/org/apache/cassandra/config/CassandraRelevantProperties.java +++ b/src/java/org/apache/cassandra/config/CassandraRelevantProperties.java @@ -289,10 +289,12 @@ public enum CassandraRelevantProperties /** property for the interval on which the repeated client warnings and diagnostic events about disk usage are ignored */ 
DISK_USAGE_NOTIFY_INTERVAL_MS("cassandra.disk_usage.notify_interval_ms", Long.toString(TimeUnit.MINUTES.toMillis(30))), + /** Controls the type of buffer (heap/direct) used for shared scratch buffers */ + DATA_OUTPUT_BUFFER_ALLOCATE_TYPE("cassandra.dob.allocate_type"), + // for specific tests ORG_APACHE_CASSANDRA_CONF_CASSANDRA_RELEVANT_PROPERTIES_TEST("org.apache.cassandra.conf.CassandraRelevantPropertiesTest"), ORG_APACHE_CASSANDRA_DB_VIRTUAL_SYSTEM_PROPERTIES_TABLE_TEST("org.apache.cassandra.db.virtual.SystemPropertiesTableTest"), - ; @@ -454,6 +456,40 @@ public void setLong(long value) System.setProperty(key, Long.toString(value)); } + /** + * Gets the value of a system property as an enum, calling {@link String#toUpperCase()} first. + * + * @param defaultValue to return when not defined + * @param <T> the enum type + * @return enum value + */ + public <T extends Enum<T>> T getEnum(T defaultValue) { + return getEnum(true, defaultValue); + } + + /** + * Gets the value of a system property as an enum, optionally calling {@link String#toUpperCase()} first. + * + * @param toUppercase whether to uppercase the value before converting it to the enum + * @param defaultValue to return when not defined + * @param <T> the enum type + * @return enum value + */ + public <T extends Enum<T>> T getEnum(boolean toUppercase, T defaultValue) { + String value = System.getProperty(key); + if (value == null) + return defaultValue; + return Enum.valueOf(defaultValue.getDeclaringClass(), toUppercase ? value.toUpperCase() : value); + } + + /** + * Sets the value into system properties. + * @param value to set + */ + public void setEnum(Enum<?> value) { + System.setProperty(key, value.name()); + } + public interface PropertyConverter<T> { T convert(String value); diff --git a/src/java/org/apache/cassandra/db/commitlog/CommitLog.java b/src/java/org/apache/cassandra/db/commitlog/CommitLog.java index eb94519127f8..aff6a3615c44 100644 --- a/src/java/org/apache/cassandra/db/commitlog/CommitLog.java +++ b/src/java/org/apache/cassandra/db/commitlog/CommitLog.java @@ -291,7 +291,7 @@ public CommitLogPosition add(Mutation mutation) throws CDCWriteException buffer.putInt((int) checksum.getValue()); // checksummed mutation - dos.write(dob.getData(), 0, size); + dos.write(dob.unsafeGetBufferAndFlip()); updateChecksum(checksum, buffer, buffer.position() - size, size); buffer.putInt((int) checksum.getValue()); } diff --git a/src/java/org/apache/cassandra/db/rows/UnfilteredSerializer.java b/src/java/org/apache/cassandra/db/rows/UnfilteredSerializer.java index 1c0dcd498b30..5bdfb0dfb88b 100644 --- a/src/java/org/apache/cassandra/db/rows/UnfilteredSerializer.java +++ b/src/java/org/apache/cassandra/db/rows/UnfilteredSerializer.java @@ -196,7 +196,7 @@ private void serialize(Row row, SerializationHelper helper, DataOutputPlus out, // We write the size of the previous unfiltered to make reverse queries more efficient (and simpler). // This is currently not used however and using it is tbd.
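[Editor's aside on the getEnum helpers added to CassandraRelevantProperties above: they follow the standard enum-from-system-property pattern. A hedged, self-contained sketch of the same idea; the helper class and enum here are illustrative stand-ins, only the property key is taken from the patch:

    import java.util.Locale;

    enum AllocationType { DIRECT, ONHEAP }

    final class EnumProperty
    {
        // mirrors the helper above: a missing property yields the default; otherwise the
        // raw value is uppercased and resolved against the default value's enum type
        static <T extends Enum<T>> T getEnum(String key, T defaultValue)
        {
            String value = System.getProperty(key);
            if (value == null)
                return defaultValue;
            return Enum.valueOf(defaultValue.getDeclaringClass(), value.toUpperCase(Locale.ROOT));
        }
    }

For example, running with -Dcassandra.dob.allocate_type=onheap would resolve to AllocationType.ONHEAP; with the property unset, the default applies.]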
out.writeUnsignedVInt(previousUnfilteredSize); - out.write(dob.getData(), 0, dob.getLength()); + out.write(dob.unsafeGetBufferAndFlip()); } } else diff --git a/src/java/org/apache/cassandra/hints/HintsWriter.java b/src/java/org/apache/cassandra/hints/HintsWriter.java index 663427a51b51..5ff2dfd7ea38 100644 --- a/src/java/org/apache/cassandra/hints/HintsWriter.java +++ b/src/java/org/apache/cassandra/hints/HintsWriter.java @@ -78,7 +78,7 @@ static HintsWriter create(File directory, HintsDescriptor descriptor) throws IOE { // write the descriptor descriptor.serialize(dob); - ByteBuffer descriptorBytes = dob.buffer(); + ByteBuffer descriptorBytes = dob.unsafeGetBufferAndFlip(); updateChecksum(crc, descriptorBytes); channel.write(descriptorBytes); diff --git a/src/java/org/apache/cassandra/io/util/BufferedDataOutputStreamPlus.java b/src/java/org/apache/cassandra/io/util/BufferedDataOutputStreamPlus.java index 4e9bbb580f7c..dba89b07f2e3 100644 --- a/src/java/org/apache/cassandra/io/util/BufferedDataOutputStreamPlus.java +++ b/src/java/org/apache/cassandra/io/util/BufferedDataOutputStreamPlus.java @@ -66,6 +66,16 @@ protected BufferedDataOutputStreamPlus(ByteBuffer buffer) this.buffer = buffer; } + protected BufferedDataOutputStreamPlus(int size) + { + this.buffer = allocate(size); + } + + protected ByteBuffer allocate(int size) + { + return ByteBuffer.allocate(size); + } + @Override public void write(byte[] b) throws IOException { diff --git a/src/java/org/apache/cassandra/io/util/DataOutputBuffer.java b/src/java/org/apache/cassandra/io/util/DataOutputBuffer.java index d6f3a4a0e8d7..ee669fa5a636 100644 --- a/src/java/org/apache/cassandra/io/util/DataOutputBuffer.java +++ b/src/java/org/apache/cassandra/io/util/DataOutputBuffer.java @@ -28,6 +28,8 @@ import io.netty.util.concurrent.FastThreadLocal; import org.apache.cassandra.config.Config; +import static org.apache.cassandra.config.CassandraRelevantProperties.DATA_OUTPUT_BUFFER_ALLOCATE_TYPE; + /** * An implementation of the DataOutputStream interface using a FastByteArrayOutputStream and exposing * its buffer so copies can be avoided. @@ -45,11 +47,13 @@ public class DataOutputBuffer extends BufferedDataOutputStreamPlus * Only recycle OutputBuffers up to 1Mb. Larger buffers will be trimmed back to this size. */ private static final int MAX_RECYCLE_BUFFER_SIZE = Integer.getInteger(Config.PROPERTY_PREFIX + "dob_max_recycle_bytes", 1024 * 1024); + private enum AllocationType { DIRECT, ONHEAP } + private static final AllocationType ALLOCATION_TYPE = DATA_OUTPUT_BUFFER_ALLOCATE_TYPE.getEnum(AllocationType.DIRECT); private static final int DEFAULT_INITIAL_BUFFER_SIZE = 128; /** - * Scratch buffers used mostly for serializing in memory. It's important to call #recycle() when finished + * Scratch buffers used mostly for serializing in memory. It's important to call #close() when finished * to keep the memory overhead from being too large in the system. */ public static final FastThreadLocal<DataOutputBuffer> scratchBuffer = new FastThreadLocal<DataOutputBuffer>() @@ -59,29 +63,38 @@ protected DataOutputBuffer initialValue() { return new DataOutputBuffer() { + @Override public void close() { - if (buffer.capacity() <= MAX_RECYCLE_BUFFER_SIZE) + if (buffer != null && buffer.capacity() <= MAX_RECYCLE_BUFFER_SIZE) { buffer.clear(); } else { - buffer = ByteBuffer.allocate(DEFAULT_INITIAL_BUFFER_SIZE); + setBuffer(allocate(DEFAULT_INITIAL_BUFFER_SIZE)); } } + + @Override + protected ByteBuffer allocate(int size) + { + return ALLOCATION_TYPE == AllocationType.DIRECT ?
+ ByteBuffer.allocateDirect(size) : + ByteBuffer.allocate(size); + } }; } }; public DataOutputBuffer() { - this(DEFAULT_INITIAL_BUFFER_SIZE); + super(DEFAULT_INITIAL_BUFFER_SIZE); } public DataOutputBuffer(int size) { - super(ByteBuffer.allocate(size)); + super(size); } public DataOutputBuffer(ByteBuffer buffer) @@ -158,9 +171,15 @@ protected void expandToFit(long count) { if (count <= 0) return; - ByteBuffer newBuffer = ByteBuffer.allocate(checkedArraySizeCast(calculateNewSize(count))); + ByteBuffer newBuffer = allocate(checkedArraySizeCast(calculateNewSize(count))); buffer.flip(); newBuffer.put(buffer); + setBuffer(newBuffer); + } + + protected void setBuffer(ByteBuffer newBuffer) + { + FileUtils.clean(buffer); // free if direct buffer = newBuffer; } @@ -221,6 +240,18 @@ public ByteBuffer buffer(boolean duplicate) return result; } + /** + * Gets the underlying ByteBuffer and calls {@link ByteBuffer#flip()}. This method is "unsafe" in the sense that + * it returns the underlying buffer, which may be modified by other methods after calling this method (or cleared on + * {@link #close()}). If the calling logic knows that no new calls to this object will happen after calling this + * method, then this method can avoid the copying done in {@link #asNewBuffer()}, and {@link #buffer()}. + */ + public ByteBuffer unsafeGetBufferAndFlip() + { + buffer.flip(); + return buffer; + } + public byte[] getData() { assert buffer.arrayOffset() == 0; From 09b282d1fdd7d6d62542137003011d144c0227be Mon Sep 17 00:00:00 2001 From: Josh McKenzie Date: Thu, 11 Aug 2022 14:02:27 -0400 Subject: [PATCH 042/159] Rate-limit new client connection auth setup to avoid overwhelming bcrypt Patch by Chris Lohfink; reviewed by Caleb Rackliffe, Yifan Cai, and Josh McKenzie for CASSANDRA-17812 Co-authored-by: Chris Lohfink Co-authored-by: Josh McKenzie --- CHANGES.txt | 1 + .../org/apache/cassandra/config/Config.java | 2 + .../cassandra/config/DatabaseDescriptor.java | 16 ++ .../cassandra/transport/Dispatcher.java | 47 +++-- .../org/apache/cassandra/utils/Shared.java | 2 +- .../transport/MessageDispatcherTest.java | 172 ++++++++++++++++++ 6 files changed, 228 insertions(+), 12 deletions(-) create mode 100644 test/unit/org/apache/cassandra/transport/MessageDispatcherTest.java diff --git a/CHANGES.txt b/CHANGES.txt index 489d2d88457e..3aaaf8b38e6a 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -1,4 +1,5 @@ 4.2 + * Rate-limit new client connection auth setup to avoid overwhelming bcrypt (CASSANDRA-17812) * DataOutputBuffer#scratchBuffer can use off-heap or on-heap memory as a means to control memory allocations (CASSANDRA-16471) * Add ability to read the TTLs and write times of the elements of a collection and/or UDT (CASSANDRA-8877) * Removed Python < 2.7 support from formatting.py (CASSANDRA-17694) diff --git a/src/java/org/apache/cassandra/config/Config.java b/src/java/org/apache/cassandra/config/Config.java index 68091ac90f27..bdca0a7df1cb 100644 --- a/src/java/org/apache/cassandra/config/Config.java +++ b/src/java/org/apache/cassandra/config/Config.java @@ -266,6 +266,8 @@ public MemtableOptions() public int native_transport_max_threads = 128; @Replaces(oldName = "native_transport_max_frame_size_in_mb", converter = Converters.MEBIBYTES_DATA_STORAGE_INT, deprecated = true) public DataStorageSpec.IntMebibytesBound native_transport_max_frame_size = new DataStorageSpec.IntMebibytesBound("16MiB"); + /** do bcrypt hashing in a limited pool to prevent cpu load spikes; note: any value < 1 will be set to 1 on init **/ + public int 
native_transport_max_auth_threads = 4; public volatile long native_transport_max_concurrent_connections = -1L; public volatile long native_transport_max_concurrent_connections_per_ip = -1L; public boolean native_transport_flush_in_batches_legacy = false; diff --git a/src/java/org/apache/cassandra/config/DatabaseDescriptor.java b/src/java/org/apache/cassandra/config/DatabaseDescriptor.java index 1ce16052feae..0a9036f6329a 100644 --- a/src/java/org/apache/cassandra/config/DatabaseDescriptor.java +++ b/src/java/org/apache/cassandra/config/DatabaseDescriptor.java @@ -2573,6 +2573,22 @@ public static void setNativeTransportMaxThreads(int max_threads) conf.native_transport_max_threads = max_threads; } + public static Integer getNativeTransportMaxAuthThreads() + { + return conf.native_transport_max_auth_threads; + } + + /** + * If this value is set to <= 0, auth requests will be handled by the standard request pool regardless of the + * current active size of the {@link org.apache.cassandra.transport.Dispatcher#authExecutor} pool. + * + * See {@link org.apache.cassandra.transport.Dispatcher#dispatch} for executor selection. + */ + public static void setNativeTransportMaxAuthThreads(int threads) + { + conf.native_transport_max_auth_threads = threads; + } + public static int getNativeTransportMaxFrameSize() { return conf.native_transport_max_frame_size.toBytes(); diff --git a/src/java/org/apache/cassandra/transport/Dispatcher.java b/src/java/org/apache/cassandra/transport/Dispatcher.java index 8f8a607c777b..f21acc2c6d17 100644 --- a/src/java/org/apache/cassandra/transport/Dispatcher.java +++ b/src/java/org/apache/cassandra/transport/Dispatcher.java @@ -23,6 +23,7 @@ import java.util.concurrent.TimeUnit; import java.util.function.Consumer; +import com.google.common.annotations.VisibleForTesting; import com.google.common.base.Predicate; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -51,11 +52,31 @@ public class Dispatcher { private static final Logger logger = LoggerFactory.getLogger(Dispatcher.class); - - private static final LocalAwareExecutorPlus requestExecutor = SHARED.newExecutor(DatabaseDescriptor.getNativeTransportMaxThreads(), - DatabaseDescriptor::setNativeTransportMaxThreads, - "transport", - "Native-Transport-Requests"); + + @VisibleForTesting + static final LocalAwareExecutorPlus requestExecutor = SHARED.newExecutor(DatabaseDescriptor.getNativeTransportMaxThreads(), + DatabaseDescriptor::setNativeTransportMaxThreads, + "transport", + "Native-Transport-Requests"); + + /** CASSANDRA-17812: Rate-limit new client connection auth setup to avoid overwhelming bcrypt + * + * authExecutor is a separate thread pool for handling requests on connections that need to be authenticated. + * Calls to AUTHENTICATE can be expensive if the number of rounds for bcrypt is configured to a high value, + * so during a connection storm checking the password hash would starve existing connected clients for CPU and + * trigger timeouts if on the same thread pool as standard requests. + * + * Moving authentication requests to a small, separate pool prevents them from starving the handling of all other + * requests. If the authExecutor pool backs up, it may cause authentication timeouts but the clients should + * back off and retry while the rest of the system continues to make progress.
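[Editor's aside: a hedged, standalone analogue of the executor selection described in this comment, in plain java.util.concurrent; the pool sizes and the isAuthMessage flag are illustrative placeholders rather than the Cassandra implementation:

    import java.util.concurrent.ExecutorService;
    import java.util.concurrent.Executors;

    final class AuthAwareDispatch
    {
        static final int MAX_AUTH_THREADS = 4; // assumed configuration value
        static final ExecutorService REQUEST_POOL = Executors.newFixedThreadPool(128);
        static final ExecutorService AUTH_POOL = Executors.newFixedThreadPool(Math.max(1, MAX_AUTH_THREADS));

        static void dispatch(Runnable task, boolean isAuthMessage)
        {
            // route potentially expensive auth work to the small pool so a connection
            // storm cannot starve the main request pool; <= 0 disables the split
            ExecutorService executor = (MAX_AUTH_THREADS > 0 && isAuthMessage) ? AUTH_POOL : REQUEST_POOL;
            executor.execute(task);
        }
    }
]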
+ * + * Setting less than 1 will service auth requests on the standard {@link Dispatcher#requestExecutor} + */ + @VisibleForTesting + static final LocalAwareExecutorPlus authExecutor = SHARED.newExecutor(Math.max(1, DatabaseDescriptor.getNativeTransportMaxAuthThreads()), + DatabaseDescriptor::setNativeTransportMaxAuthThreads, + "transport", + "Native-Transport-Auth-Requests"); private static final ConcurrentMap flusherLookup = new ConcurrentHashMap<>(); private final boolean useLegacyFlusher; @@ -80,7 +101,14 @@ public Dispatcher(boolean useLegacyFlusher) public void dispatch(Channel channel, Message.Request request, FlushItemConverter forFlusher, Overload backpressure) { - requestExecutor.submit(new RequestProcessor(channel, request, forFlusher, backpressure)); + // if native_transport_max_auth_threads is < 1, don't delegate to new pool on auth messages + boolean isAuthQuery = DatabaseDescriptor.getNativeTransportMaxAuthThreads() > 0 && + (request.type == Message.Type.AUTH_RESPONSE || request.type == Message.Type.CREDENTIALS); + + // Importantly, the authExecutor will handle the AUTHENTICATE message which may be CPU intensive. + LocalAwareExecutorPlus executor = isAuthQuery ? authExecutor : requestExecutor; + + executor.submit(new RequestProcessor(channel, request, forFlusher, backpressure)); ClientMetrics.instance.markRequestDispatched(); } @@ -233,13 +261,10 @@ private void flush(FlushItem item) public static void shutdown() { - if (requestExecutor != null) - { - requestExecutor.shutdown(); - } + requestExecutor.shutdown(); + authExecutor.shutdown(); } - /** * Dispatcher for EventMessages. In {@link Server.ConnectionTracker#send(Event)}, the strategy * for delivering events to registered clients is dependent on protocol version and the configuration diff --git a/src/java/org/apache/cassandra/utils/Shared.java b/src/java/org/apache/cassandra/utils/Shared.java index e576c8676c76..64336249113e 100644 --- a/src/java/org/apache/cassandra/utils/Shared.java +++ b/src/java/org/apache/cassandra/utils/Shared.java @@ -24,7 +24,7 @@ import java.lang.annotation.Target; /** - * Tells jvm-dtest that a class should be shared accross all {@link ClassLoader}s. + * Tells jvm-dtest that a class should be shared across all {@link ClassLoader}s. * * Jvm-dtest relies on classloader isolation to run multiple cassandra instances in the same JVM, this makes it * so some classes do not get shared (outside a blesssed set of classes/packages). When the default behavior diff --git a/test/unit/org/apache/cassandra/transport/MessageDispatcherTest.java b/test/unit/org/apache/cassandra/transport/MessageDispatcherTest.java new file mode 100644 index 000000000000..0c70315e257b --- /dev/null +++ b/test/unit/org/apache/cassandra/transport/MessageDispatcherTest.java @@ -0,0 +1,172 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.transport; + +import java.util.Collections; +import java.util.concurrent.Callable; +import java.util.concurrent.TimeUnit; + +import com.google.common.util.concurrent.Uninterruptibles; +import org.junit.AfterClass; +import org.junit.Assert; +import org.junit.BeforeClass; +import org.junit.Test; + +import io.netty.channel.Channel; +import org.apache.cassandra.config.DatabaseDescriptor; +import org.apache.cassandra.metrics.ClientMetrics; +import org.apache.cassandra.service.QueryState; +import org.apache.cassandra.transport.messages.AuthResponse; + +public class MessageDispatcherTest +{ + static final Message.Request AUTH_RESPONSE_REQUEST = new AuthResponse(new byte[0]) + { + public Response execute(QueryState queryState, long queryStartNanoTime, boolean traceRequest) + { + return null; + } + }; + + private static AuthTestDispatcher dispatch; + private static int maxAuthThreadsBeforeTests; + + @BeforeClass + public static void init() throws Exception + { + DatabaseDescriptor.daemonInitialization(); + ClientMetrics.instance.init(Collections.emptyList()); + maxAuthThreadsBeforeTests = DatabaseDescriptor.getNativeTransportMaxAuthThreads(); + dispatch = new AuthTestDispatcher(); + } + + @AfterClass + public static void restoreAuthSize() + { + DatabaseDescriptor.setNativeTransportMaxAuthThreads(maxAuthThreadsBeforeTests); + } + + @Test + public void testAuthRateLimiter() throws Exception + { + long startRequests = completedRequests(); + + DatabaseDescriptor.setNativeTransportMaxAuthThreads(1); + long auths = tryAuth(this::completedAuth); + Assert.assertEquals(auths, 1); + + DatabaseDescriptor.setNativeTransportMaxAuthThreads(100); + auths = tryAuth(this::completedAuth); + Assert.assertEquals(auths, 1); + + // Make sure no tasks executed on the regular pool + Assert.assertEquals(startRequests, completedRequests()); + } + + @Test + public void testAuthRateLimiterNotUsed() throws Exception + { + DatabaseDescriptor.setNativeTransportMaxAuthThreads(1); + for (Message.Type type : Message.Type.values()) + { + if (type == Message.Type.AUTH_RESPONSE || type == Message.Type.CREDENTIALS || type.direction != Message.Direction.REQUEST) + continue; + + long auths = completedAuth(); + long requests = tryAuth(this::completedRequests, new Message.Request(type) + { + public Response execute(QueryState queryState, long queryStartNanoTime, boolean traceRequest) + { + return null; + } + }); + Assert.assertEquals(requests, 1); + Assert.assertEquals(completedAuth() - auths, 0); + } + } + + @Test + public void testAuthRateLimiterDisabled() throws Exception + { + long startAuthRequests = completedAuth(); + + DatabaseDescriptor.setNativeTransportMaxAuthThreads(0); + long requests = tryAuth(this::completedRequests); + Assert.assertEquals(requests, 1); + + DatabaseDescriptor.setNativeTransportMaxAuthThreads(-1); + requests = tryAuth(this::completedRequests); + Assert.assertEquals(requests, 1); + + DatabaseDescriptor.setNativeTransportMaxAuthThreads(-1000); + requests = tryAuth(this::completedRequests); + Assert.assertEquals(requests, 1); + + // Make sure no tasks executed on the auth pool + Assert.assertEquals(startAuthRequests, completedAuth()); + } + + private long completedRequests() + { + return Dispatcher.requestExecutor.getCompletedTaskCount(); + } + + private long completedAuth() + { + return Dispatcher.authExecutor.getCompletedTaskCount(); + } + + public long tryAuth(Callable 
check) throws Exception + { + return tryAuth(check, AUTH_RESPONSE_REQUEST); + } + + @SuppressWarnings("UnstableApiUsage") + public long tryAuth(Callable check, Message.Request request) throws Exception + { + long start = check.call(); + dispatch.dispatch(null, request, (channel,req,response) -> null, ClientResourceLimits.Overload.NONE); + + // While this is timeout based, we should be *well below* a full second on any of this processing in any sane environment. + long timeout = System.currentTimeMillis(); + while(start == check.call() && System.currentTimeMillis() - timeout < 1000) + { + Uninterruptibles.sleepUninterruptibly(10, TimeUnit.MILLISECONDS); + } + return check.call() - start; + } + + public static class AuthTestDispatcher extends Dispatcher + { + public AuthTestDispatcher() + { + super(false); + } + + @Override + void processRequest(Channel channel, + Message.Request request, + FlushItemConverter forFlusher, + ClientResourceLimits.Overload backpressure, + long approxStartTimeNanos) + { + // noop + } + } +} From 3e0b94565acc64e903d73af3a14b23c875abc5b3 Mon Sep 17 00:00:00 2001 From: Josh McKenzie Date: Tue, 2 Aug 2022 16:02:03 -0400 Subject: [PATCH 043/159] Add support to generate a One-Shot heap dump on unhandled exceptions Patch by Caleb Rackliffe; reviewed by Josh McKenzie, David Capwell, and Jon Meredith for CASSANDRA-17795 Co-authored-by: Caleb Rackliffe Co-authored-by: Josh McKenzie --- CHANGES.txt | 1 + build.xml | 1 + conf/cassandra.yaml | 10 +++ ide/idea/workspace.xml | 6 +- .../org/apache/cassandra/config/Config.java | 3 + .../cassandra/config/DatabaseDescriptor.java | 79 ++++++++++++++++++ .../cassandra/service/StorageProxy.java | 12 +++ .../cassandra/service/StorageProxyMBean.java | 3 + .../org/apache/cassandra/utils/HeapUtils.java | 82 ++++++++++++++++++- .../utils/JVMStabilityInspector.java | 3 + test/conf/cassandra.yaml | 3 + .../apache/cassandra/utils/HeapUtilsTest.java | 63 ++++++++++++++ .../cassandra/tools/BulkLoaderTest.java | 26 +++--- 13 files changed, 277 insertions(+), 15 deletions(-) create mode 100644 test/long/org/apache/cassandra/utils/HeapUtilsTest.java diff --git a/CHANGES.txt b/CHANGES.txt index 3aaaf8b38e6a..dd09b25c564f 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -1,4 +1,5 @@ 4.2 + * Add support to generate a One-Shot heap dump on unhandled exceptions (CASSANDRA-17795) * Rate-limit new client connection auth setup to avoid overwhelming bcrypt (CASSANDRA-17812) * DataOutputBuffer#scratchBuffer can use off-heap or on-heap memory as a means to control memory allocations (CASSANDRA-16471) * Add ability to read the TTLs and write times of the elements of a collection and/or UDT (CASSANDRA-8877) diff --git a/build.xml b/build.xml index ca346c9f28c9..82a805d5d87c 100644 --- a/build.xml +++ b/build.xml @@ -1500,6 +1500,7 @@ more aggressively rather than waiting. See CASSANDRA-14922 for more details. --> + diff --git a/conf/cassandra.yaml b/conf/cassandra.yaml index 98d70a035fe2..21e3f78c10b9 100644 --- a/conf/cassandra.yaml +++ b/conf/cassandra.yaml @@ -107,6 +107,16 @@ auto_hints_cleanup_enabled: false # parameters: # - +# Directory where Cassandra should store results of a One-Shot troubleshooting heapdump for uncaught exceptions. +# Note: this value can be overridden by the -XX:HeapDumpPath JVM env param with a relative local path for testing if +# so desired. 
+# If not set, the default directory is $CASSANDRA_HOME/heapdump +# heap_dump_path: /var/lib/cassandra/heapdump + +# Enable / disable automatic dump of heap on first uncaught exception +# If not set, the default value is false +# dump_heap_on_uncaught_exception: true + # Enable / disable persistent hint windows. # # If set to false, a hint will be stored only in case a respective node diff --git a/ide/idea/workspace.xml b/ide/idea/workspace.xml index e35ba90ac7b4..321edd8024dd 100644 --- a/ide/idea/workspace.xml +++ b/ide/idea/workspace.xml @@ -143,7 +143,7 @@
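[Editor's aside: the one-shot heap dump configured above is ultimately produced through the JVM's HotSpot diagnostic MBean. A hedged, standalone illustration of that underlying JVM mechanism, not the patch's HeapUtils code; the output path is an assumed example:

    import java.lang.management.ManagementFactory;
    import javax.management.MBeanServer;
    import com.sun.management.HotSpotDiagnosticMXBean;

    public final class HeapDumpExample
    {
        public static void main(String[] args) throws Exception
        {
            MBeanServer server = ManagementFactory.getPlatformMBeanServer();
            HotSpotDiagnosticMXBean diagnostic =
                ManagementFactory.newPlatformMXBeanProxy(server,
                                                         "com.sun.management:type=HotSpotDiagnostic",
                                                         HotSpotDiagnosticMXBean.class);
            // the second argument restricts the dump to live (reachable) objects
            diagnostic.dumpHeap("/var/lib/cassandra/heapdump/example.hprof", true);
        }
    }
]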

    add("x", 10); and add("x", 20); will result in "x" = 30

    This uses StreamSummary to only store the * approximate cardinality (capacity) of keys. If the number of distinct keys exceed the capacity, the error of the * sample may increase depending on distribution of keys among the total set. + * + * Note: {@link Sampler#samplerExecutor} is single threaded but we still need to synchronize as we have access + * from both internal and the external JMX context that can cause races. * * @param */ public abstract class FrequencySampler extends Sampler { private static final Logger logger = LoggerFactory.getLogger(FrequencySampler.class); - private long endTimeNanos = -1; private StreamSummary summary; /** * Start to record samples * - * @param capacity - * Number of sample items to keep in memory, the lower this is - * the less accurate results are. For best results use value - * close to cardinality, but understand the memory trade offs. + * @param capacity Number of sample items to keep in memory, the lower this is + * the less accurate results are. For best results use value + * close to cardinality, but understand the memory trade offs. */ - public synchronized void beginSampling(int capacity, int durationMillis) + public synchronized void beginSampling(int capacity, long durationMillis) { - if (endTimeNanos == -1 || clock.now() > endTimeNanos) - { - summary = new StreamSummary<>(capacity); - endTimeNanos = clock.now() + MILLISECONDS.toNanos(durationMillis); - } - else + if (isActive()) throw new RuntimeException("Sampling already in progress"); + updateEndTime(clock.now() + MILLISECONDS.toNanos(durationMillis)); + summary = new StreamSummary<>(capacity); } /** @@ -69,12 +67,12 @@ public synchronized void beginSampling(int capacity, int durationMillis) public synchronized List> finishSampling(int count) { List> results = Collections.emptyList(); - if (endTimeNanos != -1) + if (isEnabled()) { - endTimeNanos = -1; + disable(); results = summary.topK(count) .stream() - .map(c -> new Sample(c.getItem(), c.getCount(), c.getError())) + .map(c -> new Sample<>(c.getItem(), c.getCount(), c.getError())) .collect(Collectors.toList()); } return results; @@ -82,24 +80,16 @@ public synchronized List> finishSampling(int count) protected synchronized void insert(final T item, final long value) { - // samplerExecutor is single threaded but still need - // synchronization against jmx calls to finishSampling - if (value > 0 && clock.now() <= endTimeNanos) + if (value > 0 && isActive()) { try { summary.offer(item, (int) Math.min(value, Integer.MAX_VALUE)); - } catch (Exception e) + } + catch (Exception e) { logger.trace("Failure to offer sample", e); } } } - - public boolean isEnabled() - { - return endTimeNanos != -1 && clock.now() <= endTimeNanos; - } - } - diff --git a/src/java/org/apache/cassandra/metrics/MaxSampler.java b/src/java/org/apache/cassandra/metrics/MaxSampler.java index df24bb96298b..0593e341d6b0 100644 --- a/src/java/org/apache/cassandra/metrics/MaxSampler.java +++ b/src/java/org/apache/cassandra/metrics/MaxSampler.java @@ -26,39 +26,35 @@ import static java.util.concurrent.TimeUnit.MILLISECONDS; +/** + * Note: {@link Sampler#samplerExecutor} is single threaded but we still need to synchronize as we have access + * from both internal and the external JMX context that can cause races. 
+ */ public abstract class MaxSampler<T> extends Sampler<T> { private int capacity; private MinMaxPriorityQueue<Sample<T>> queue; - private long endTimeNanos = -1; private final Comparator<Sample<T>> comp = Collections.reverseOrder(Comparator.comparing(p -> p.count)); - public boolean isEnabled() - { - return endTimeNanos != -1 && clock.now() <= endTimeNanos; - } - - public synchronized void beginSampling(int capacity, int durationMillis) + @Override + public synchronized void beginSampling(int capacity, long durationMillis) { - if (endTimeNanos == -1 || clock.now() > endTimeNanos) - { - endTimeNanos = clock.now() + MILLISECONDS.toNanos(durationMillis); - queue = MinMaxPriorityQueue - .orderedBy(comp) - .maximumSize(Math.max(1, capacity)) - .create(); - this.capacity = capacity; - } - else + if (isActive()) throw new RuntimeException("Sampling already in progress"); + updateEndTime(clock.now() + MILLISECONDS.toNanos(durationMillis)); + queue = MinMaxPriorityQueue.orderedBy(comp) + .maximumSize(Math.max(1, capacity)) + .create(); + this.capacity = capacity; } + @Override public synchronized List<Sample<T>> finishSampling(int count) { List<Sample<T>> result = new ArrayList<>(count); - if (endTimeNanos != -1) + if (isEnabled()) { - endTimeNanos = -1; + disable(); Sample<T> next; while ((next = queue.poll()) != null && result.size() <= count) result.add(next); @@ -69,9 +65,12 @@ public synchronized List<Sample<T>> finishSampling(int count) @Override protected synchronized void insert(T item, long value) { - if (value > 0 && clock.now() <= endTimeNanos - && (queue.isEmpty() || queue.size() < capacity || queue.peekLast().count < value)) + if (isActive() && permitsValue(value)) queue.add(new Sample<T>(item, value, 0)); } + private boolean permitsValue(long value) + { + return value > 0 && (queue.isEmpty() || queue.size() < capacity || queue.peekLast().count < value); + } } diff --git a/src/java/org/apache/cassandra/metrics/Sampler.java b/src/java/org/apache/cassandra/metrics/Sampler.java index b3d0f21aa5e5..de7f0b2e1a70 100644 --- a/src/java/org/apache/cassandra/metrics/Sampler.java +++ b/src/java/org/apache/cassandra/metrics/Sampler.java @@ -17,10 +17,12 @@ */ package org.apache.cassandra.metrics; +import java.io.PrintStream; import java.io.Serializable; import java.util.List; import java.util.concurrent.TimeUnit; import java.util.concurrent.TimeoutException; +import java.util.function.BiFunction; import org.apache.cassandra.concurrent.ExecutorPlus; import org.apache.cassandra.net.MessagingService; @@ -34,9 +36,44 @@ public abstract class Sampler<T> { + private static final long DISABLED = -1L; + + private static final BiFunction<SamplerType, SamplingManager.ResultBuilder, SamplingManager.ResultBuilder> + FrequencySamplerFormatter = (type, resultBuilder) -> + resultBuilder.forType(type, type.description) + .addColumn("Table", "table") + .addColumn("Partition", "value") + .addColumn("Count", "count") + .addColumn("+/-", "error"); + public enum SamplerType { - READS, WRITES, LOCAL_READ_TIME, WRITE_SIZE, CAS_CONTENTIONS + READS("Frequency of reads by partition", FrequencySamplerFormatter), + WRITES("Frequency of writes by partition", FrequencySamplerFormatter), + LOCAL_READ_TIME("Longest read query times", ((samplerType, resultBuilder) -> + resultBuilder.forType(samplerType, samplerType.description) + .addColumn("Query", "value") + .addColumn("Microseconds", "count"))), + WRITE_SIZE("Max mutation size by partition", ((samplerType, resultBuilder) -> + resultBuilder.forType(samplerType, samplerType.description) + .addColumn("Table", "table") + .addColumn("Partition", "value") + .addColumn("Bytes", "count"))), + CAS_CONTENTIONS("Frequency of CAS
contention by partition", FrequencySamplerFomatter); + + private final String description; + private final BiFunction formatter; + + SamplerType(String description, BiFunction formatter) + { + this.description = description; + this.formatter = formatter; + } + + void format(SamplingManager.ResultBuilder resultBuilder, PrintStream ps) + { + formatter.apply(this, resultBuilder).print(ps); + } } @VisibleForTesting @@ -50,6 +87,8 @@ public enum SamplerType .withRejectedExecutionHandler((runnable, executor) -> MessagingService.instance().metrics.recordSelfDroppedMessage(Verb._SAMPLE)) .build(); + private long endTimeNanos = -1; + public void addSample(final T item, final int value) { if (isEnabled()) @@ -58,10 +97,53 @@ public void addSample(final T item, final int value) protected abstract void insert(T item, long value); - public abstract boolean isEnabled(); + /** + * A sampler is enabled between {@link this#beginSampling} and {@link this#finishSampling} + * @return true if the sampler is enabled. + */ + public boolean isEnabled() + { + return endTimeNanos != DISABLED; + } - public abstract void beginSampling(int capacity, int durationMillis); + public void disable() + { + endTimeNanos = DISABLED; + } + /** + * @return true if the sampler is active. + * A sampler is active only if it is enabled and the current time is within the `durationMillis` when beginning sampling. + */ + public boolean isActive() + { + return isEnabled() && clock.now() <= endTimeNanos; + } + + /** + * Update the end time for the sampler. Implicitly, calling this method enables the sampler. + */ + public void updateEndTime(long endTimeMillis) + { + this.endTimeNanos = endTimeMillis; + } + + /** + * Begin sampling with the configured capacity and duration + * @param capacity Number of sample items to keep in memory, the lower this is + * the less accurate results are. For best results use value + * close to cardinality, but understand the memory trade offs. + * @param durationMillis Upperbound duration in milliseconds for sampling. The sampler + * stops accepting new samples after exceeding the duration + * even if {@link #finishSampling(int)}} is not called. + */ + public abstract void beginSampling(int capacity, long durationMillis); + + /** + * Stop sampling and return the results + * @param count The number of the samples requested to retrieve from the sampler + * @return a list of samples, the size is the minimum of the total samples and {@param count}. + */ public abstract List> finishSampling(int count); public abstract String toString(T value); diff --git a/src/java/org/apache/cassandra/metrics/SamplingManager.java b/src/java/org/apache/cassandra/metrics/SamplingManager.java new file mode 100644 index 000000000000..37d8d355c2b8 --- /dev/null +++ b/src/java/org/apache/cassandra/metrics/SamplingManager.java @@ -0,0 +1,391 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.metrics; + +import java.io.ByteArrayOutputStream; +import java.io.PrintStream; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.HashSet; +import java.util.List; +import java.util.Map; +import java.util.Objects; +import java.util.Set; +import java.util.concurrent.ConcurrentHashMap; +import java.util.concurrent.Future; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.atomic.AtomicBoolean; +import java.util.stream.Collectors; +import java.util.stream.StreamSupport; +import javax.management.openmbean.CompositeData; +import javax.management.openmbean.OpenDataException; + +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; + +import org.apache.cassandra.concurrent.ScheduledExecutors; +import org.apache.cassandra.db.ColumnFamilyStore; +import org.apache.cassandra.db.Keyspace; +import org.apache.cassandra.tools.nodetool.ProfileLoad; +import org.apache.cassandra.tools.nodetool.formatter.TableBuilder; +import org.apache.cassandra.utils.Pair; + +public class SamplingManager +{ + private static final Logger logger = LoggerFactory.getLogger(SamplingManager.class); + + /** + * Tracks the active scheduled sampling tasks. + * The key of the map is a {@link JobId}, which is effectively a keyspace + table abstracted behind some syntactic + * sugar so we can use them without peppering Pairs throughout this class. Both keyspace and table are nullable, + * a paradigm we inherit from {@link ProfileLoad} so need to accommodate here. + * + * The value of the map is the current scheduled task. + */ + private final ConcurrentHashMap> activeSamplingTasks = new ConcurrentHashMap<>(); + + /** Tasks that are actively being cancelled */ + private final Set cancelingTasks = ConcurrentHashMap.newKeySet(); + + public static String formatResult(ResultBuilder resultBuilder) + { + ByteArrayOutputStream baos = new ByteArrayOutputStream(); + try (PrintStream ps = new PrintStream(baos)) + { + for (Sampler.SamplerType samplerType : Sampler.SamplerType.values()) + { + samplerType.format(resultBuilder, ps); + } + return baos.toString(); + } + } + + public static Iterable getTables(String ks, String table) + { + // null KEYSPACE == all the tables + if (ks == null) + return ColumnFamilyStore.all(); + + Keyspace keyspace = Keyspace.open(ks); + + // KEYSPACE defined w/null table == all the tables on that KEYSPACE + if (table == null) + return keyspace.getColumnFamilyStores(); + // Or we just have a specific ks+table combo we're looking to profile + else + return Collections.singletonList(keyspace.getColumnFamilyStore(table)); + } + + /** + * Register the samplers for the keyspace and table. + * @param ks Keyspace. Nullable. If null, the scheduled sampling is on all keyspaces and tables + * @param table Nullable. If null, the scheduled sampling is on all tables of the specified keyspace + * @param duration Duration of each scheduled sampling job in milliseconds + * @param interval Interval of each scheduled sampling job in milliseconds + * @param capacity Capacity of the sampler, higher for more accuracy + * @param count Number of the top samples to list + * @param samplers a list of samplers to enable + * @return true if the scheduled sampling is started successfully. 
Otherwise return false
+     */
+    public boolean register(String ks, String table, int duration, int interval, int capacity, int count, List<String> samplers)
+    {
+        JobId jobId = new JobId(ks, table);
+        logger.info("Registering samplers {} for {}", samplers, jobId);
+
+        if (!canSchedule(jobId))
+        {
+            logger.info("Unable to register {} due to existing ongoing sampling.", jobId);
+            return false;
+        }
+
+        // 'begin' tasks are chained to finish before their paired 'finish'
+        activeSamplingTasks.put(jobId, ScheduledExecutors.optionalTasks.submit(
+            createSamplingBeginRunnable(jobId, getTables(ks, table), duration, interval, capacity, count, samplers)
+        ));
+        return true;
+    }
+
+    public boolean unregister(String ks, String table)
+    {
+        // unregister all
+        // return true when all tasks are cancelled successfully
+        if (ks == null && table == null)
+        {
+            boolean res = true;
+            for (JobId id : activeSamplingTasks.keySet())
+            {
+                res = cancelTask(id) & res;
+            }
+            return res;
+        }
+        else
+        {
+            return cancelTask(new JobId(ks, table));
+        }
+    }
+
+    public List<String> allJobs()
+    {
+        return jobIds().stream()
+                       .map(JobId::toString)
+                       .collect(Collectors.toList());
+    }
+
+    private Set<JobId> jobIds()
+    {
+        Set<JobId> all = new HashSet<>();
+        all.addAll(activeSamplingTasks.keySet());
+        all.addAll(cancelingTasks);
+        return all;
+    }
+
+    /**
+     * Validate if a schedule on the keyspace and table is permitted
+     * @param jobId the candidate keyspace/table pair
+     * @return true if possible, false if there are overlapping tables already being sampled
+     */
+    private boolean canSchedule(JobId jobId)
+    {
+        Set<JobId> allJobIds = jobIds();
+        // There is a schedule that works on all tables. Overlapping guaranteed.
+        if (allJobIds.contains(JobId.ALL_KS_AND_TABLES) || (!allJobIds.isEmpty() && jobId.equals(JobId.ALL_KS_AND_TABLES)))
+            return false;
+        // there is an exactly duplicated schedule
+        else if (allJobIds.contains(jobId))
+            return false;
+        else
+            // make sure the new job has no overlapping tables under the keyspace
+            return !allJobIds.contains(JobId.createForAllTables(jobId.keyspace));
+    }
+
+    /**
+     * Cancel a task by its id. The corresponding task will be stopped once its final sampling completes.
+     * @param jobId the job to cancel
+     * @return true if the task exists, false if not found
+     */
+    private boolean cancelTask(JobId jobId)
+    {
+        Future<?> task = activeSamplingTasks.remove(jobId);
+        if (task != null)
+            cancelingTasks.add(jobId);
+        return task != null;
+    }
+
+    /**
+     * Begin sampling and schedule a future task to end the sampling task
+     */
+    private Runnable createSamplingBeginRunnable(JobId jobId, Iterable<ColumnFamilyStore> tables, int duration, int interval, int capacity, int count, List<String> samplers)
+    {
+        return () ->
+        {
+            if (cancelingTasks.contains(jobId))
+            {
+                logger.debug("The sampling job of {} is currently canceling. Not issuing a new run.", jobId);
+                activeSamplingTasks.remove(jobId);
+                return;
+            }
+            List<String> tableNames = StreamSupport.stream(tables.spliterator(), false)
+                                                   .map(cfs -> String.format("%s.%s", cfs.keyspace, cfs.name))
+                                                   .collect(Collectors.toList());
+            logger.info("Starting to sample tables {} with the samplers {} for {} ms", tableNames, samplers, duration);
+            for (String sampler : samplers)
+            {
+                for (ColumnFamilyStore cfs : tables)
+                {
+                    cfs.beginLocalSampling(sampler, capacity, duration);
+                }
+            }
+            Future<?> fut = ScheduledExecutors.optionalTasks.schedule(
+                createSamplingEndRunnable(jobId, tables, duration, interval, capacity, count, samplers),
+                interval,
+                TimeUnit.MILLISECONDS);
+            // reached the end of the current runnable;
+            // update the referenced future to SamplingFinish
+            activeSamplingTasks.put(jobId, fut);
+        };
+    }
+
+    /**
+     * Finish the sampling and begin a new one immediately after.
+     *
+     * NOTE: Do not call this outside the context of {@link #createSamplingBeginRunnable}, as we need to preserve
+     * ordering between a "start" and "end" runnable
+     */
+    private Runnable createSamplingEndRunnable(JobId jobId, Iterable<ColumnFamilyStore> tables, int duration, int interval, int capacity, int count, List<String> samplers)
+    {
+        return () ->
+        {
+            Map<String, List<CompositeData>> results = new HashMap<>();
+            for (String sampler : samplers)
+            {
+                List<CompositeData> topk = new ArrayList<>();
+                for (ColumnFamilyStore cfs : tables)
+                {
+                    try
+                    {
+                        topk.addAll(cfs.finishLocalSampling(sampler, count));
+                    }
+                    catch (OpenDataException e)
+                    {
+                        logger.warn("Failed to retrieve the sampled data. Abort the background sampling job: {}.", jobId, e);
+                        activeSamplingTasks.remove(jobId);
+                        cancelingTasks.remove(jobId);
+                        return;
+                    }
+                }
+
+                topk.sort((left, right) -> Long.compare((long) right.get("count"), (long) left.get("count")));
+                // sublist is not serializable for jmx
+                topk = new ArrayList<>(topk.subList(0, Math.min(topk.size(), count)));
+                results.put(sampler, topk);
+            }
+            AtomicBoolean first = new AtomicBoolean(false);
+            ResultBuilder rb = new ResultBuilder(first, results, samplers);
+            logger.info(formatResult(rb));
+
+            // If nobody has canceled us, we ping-pong back to a "begin" runnable to run another profile load
+            if (!cancelingTasks.contains(jobId))
+            {
+                Future<?> fut = ScheduledExecutors.optionalTasks.submit(
+                    createSamplingBeginRunnable(jobId, tables, duration, interval, capacity, count, samplers));
+                activeSamplingTasks.put(jobId, fut);
+            }
+            // If someone *has* canceled us, we need to remove the runnable from activeSampling and also remove the
+            // cancellation sentinel so subsequent re-submits of profiling don't get blocked immediately
+            else
+            {
+                logger.info("The sampling job {} has been cancelled.", jobId);
+                activeSamplingTasks.remove(jobId);
+                cancelingTasks.remove(jobId);
+            }
+        };
+    }
+
+    private static class JobId
+    {
+        public static final JobId ALL_KS_AND_TABLES = new JobId(null, null);
+
+        public final String keyspace;
+        public final String table;
+
+        public JobId(String ks, String tb)
+        {
+            keyspace = ks;
+            table = tb;
+        }
+
+        public static JobId createForAllTables(String keyspace)
+        {
+            return new JobId(keyspace, null);
+        }
+
+        @Override
+        public String toString()
+        {
+            return maybeWildCard(keyspace) + '.' + maybeWildCard(table);
+        }
+
+        private String maybeWildCard(String input)
+        {
+            return input == null ?
"*" : input; + } + + @Override + public boolean equals(Object o) + { + if (this == o) return true; + if (o == null || getClass() != o.getClass()) return false; + JobId jobId = (JobId) o; + return Objects.equals(keyspace, jobId.keyspace) && Objects.equals(table, jobId.table); + } + + @Override + public int hashCode() + { + return Objects.hash(keyspace, table); + } + } + + public static class ResultBuilder + { + protected Sampler.SamplerType type; + protected String description; + protected AtomicBoolean first; + protected Map> results; + protected List targets; + protected List> dataKeys; + + public ResultBuilder(AtomicBoolean first, Map> results, List targets) + { + this.first = first; + this.results = results; + this.targets = targets; + this.dataKeys = new ArrayList<>(); + this.dataKeys.add(Pair.create(" ", " ")); + } + + public SamplingManager.ResultBuilder forType(Sampler.SamplerType type, String description) + { + SamplingManager.ResultBuilder rb = new SamplingManager.ResultBuilder(first, results, targets); + rb.type = type; + rb.description = description; + return rb; + } + + public SamplingManager.ResultBuilder addColumn(String title, String key) + { + this.dataKeys.add(Pair.create(title, key)); + return this; + } + + protected String get(CompositeData cd, String key) + { + if (cd.containsKey(key)) + return cd.get(key).toString(); + return key; + } + + public void print(PrintStream ps) + { + if (targets.contains(type.toString())) + { + if (!first.get()) + ps.println(); + first.set(false); + ps.println(description + ':'); + TableBuilder out = new TableBuilder(); + out.add(dataKeys.stream().map(p -> p.left).collect(Collectors.toList()).toArray(new String[] {})); + List topk = results.get(type.toString()); + for (CompositeData cd : topk) + { + out.add(dataKeys.stream().map(p -> get(cd, p.right)).collect(Collectors.toList()).toArray(new String[] {})); + } + if (topk.size() == 0) + { + ps.println(" Nothing recorded during sampling period..."); + } + else + { + out.printTo(ps); + } + } + } + } +} diff --git a/src/java/org/apache/cassandra/service/StorageService.java b/src/java/org/apache/cassandra/service/StorageService.java index b7756c0c2127..80ffa80a8c57 100644 --- a/src/java/org/apache/cassandra/service/StorageService.java +++ b/src/java/org/apache/cassandra/service/StorageService.java @@ -69,6 +69,8 @@ import org.apache.cassandra.fql.FullQueryLoggerOptionsCompositeData; import org.apache.cassandra.io.util.File; import org.apache.cassandra.locator.ReplicaCollection.Builder.Conflict; +import org.apache.cassandra.metrics.Sampler; +import org.apache.cassandra.metrics.SamplingManager; import org.apache.cassandra.schema.Keyspaces; import org.apache.cassandra.service.disk.usage.DiskUsageBroadcaster; import org.apache.cassandra.service.snapshot.SnapshotLoader; @@ -142,6 +144,7 @@ import org.apache.cassandra.utils.progress.jmx.JMXBroadcastExecutor; import org.apache.cassandra.utils.progress.jmx.JMXProgressSupport; +import static com.google.common.base.Preconditions.checkArgument; import static com.google.common.collect.Iterables.transform; import static com.google.common.collect.Iterables.tryFind; import static java.util.Arrays.asList; @@ -225,6 +228,8 @@ private static int getSchemaDelay() public static final StorageService instance = new StorageService(); + private final SamplingManager samplingManager = new SamplingManager(); + @Deprecated public boolean isInShutdownHook() { @@ -5978,17 +5983,23 @@ public List sampleKeyRange() // do not rename to getter - see CASSANDRA- return sampledKeys; 
 }
 
+    @Override
+    public Map<String, List<CompositeData>> samplePartitions(int duration, int capacity, int count, List<String> samplers) throws OpenDataException
+    {
+        return samplePartitions(null, duration, capacity, count, samplers);
+    }
+
     /*
      * { "sampler_name": [ {table: "", count: i, error: i, value: ""}, ... ] }
      */
     @Override
-    public Map<String, List<CompositeData>> samplePartitions(int durationMillis, int capacity, int count,
-                                                             List<String> samplers) throws OpenDataException
+    public Map<String, List<CompositeData>> samplePartitions(String keyspace, int durationMillis, int capacity, int count,
+                                                             List<String> samplers) throws OpenDataException
     {
         ConcurrentHashMap<String, List<CompositeData>> result = new ConcurrentHashMap<>();
+        Iterable<ColumnFamilyStore> tables = SamplingManager.getTables(keyspace, null);
         for (String sampler : samplers)
         {
-            for (ColumnFamilyStore table : ColumnFamilyStore.all())
+            for (ColumnFamilyStore table : tables)
             {
                 table.beginLocalSampling(sampler, capacity, durationMillis);
             }
@@ -5998,7 +6009,7 @@ public Map<String, List<CompositeData>> samplePartitions(int durationMillis, int
         for (String sampler : samplers)
         {
             List<CompositeData> topk = new ArrayList<>();
-            for (ColumnFamilyStore table : ColumnFamilyStore.all())
+            for (ColumnFamilyStore table : tables)
             {
                 topk.addAll(table.finishLocalSampling(sampler, count));
             }
@@ -6016,6 +6027,44 @@ public int compare(CompositeData left, CompositeData right)
         return result;
     }
 
+    @Override // Note from parent javadoc: ks and table are nullable
+    public boolean startSamplingPartitions(String ks, String table, int duration, int interval, int capacity, int count, List<String> samplers)
+    {
+        Preconditions.checkArgument(duration > 0, "Sampling duration %s must be positive.", duration);
+
+        Preconditions.checkArgument(interval <= 0 || interval >= duration,
+                                    "Sampling interval %s should be greater than or equal to duration %s if defined.",
+                                    interval, duration);
+
+        Preconditions.checkArgument(capacity > 0 && capacity <= 1024,
+                                    "Sampling capacity %s must be positive and the max value is 1024 (inclusive).",
+                                    capacity);
+
+        Preconditions.checkArgument(count > 0 && count < capacity,
+                                    "Sampling count %s must be positive and smaller than capacity %s.",
+                                    count, capacity);
+
+        Preconditions.checkArgument(!samplers.isEmpty(), "Samplers cannot be empty.");
+
+        Set<Sampler.SamplerType> available = EnumSet.allOf(Sampler.SamplerType.class);
+        samplers.forEach((x) -> checkArgument(available.contains(Sampler.SamplerType.valueOf(x)),
+                                              "'%s' sampler is not available from: %s",
+                                              x, Arrays.toString(Sampler.SamplerType.values())));
+        return samplingManager.register(ks, table, duration, interval, capacity, count, samplers);
+    }
+
+    @Override
+    public boolean stopSamplingPartitions(String ks, String table)
+    {
+        return samplingManager.unregister(ks, table);
+    }
+
+    @Override
+    public List<String> getSampleTasks()
+    {
+        return samplingManager.allJobs();
+    }
+
     public void rebuildSecondaryIndex(String ksName, String cfName, String... idxNames)
     {
         String[] indices = asList(idxNames).stream()
diff --git a/src/java/org/apache/cassandra/service/StorageServiceMBean.java b/src/java/org/apache/cassandra/service/StorageServiceMBean.java
index ac2ff68411ae..02485274a385 100644
--- a/src/java/org/apache/cassandra/service/StorageServiceMBean.java
+++ b/src/java/org/apache/cassandra/service/StorageServiceMBean.java
@@ -795,6 +795,36 @@ default int upgradeSSTables(String keyspaceName, boolean excludeCurrentVersion,
     public Map<String, List<CompositeData>> samplePartitions(int duration, int capacity, int count, List<String> samplers) throws OpenDataException;
 
+    public Map<String, List<CompositeData>> samplePartitions(String keyspace, int duration, int capacity, int count, List<String> samplers) throws OpenDataException;
+
+    /**
+     * Start a scheduled sampling
+     * @param ks Keyspace. Nullable. If null, the scheduled sampling is on all keyspaces and tables
+     * @param table Nullable. If null, the scheduled sampling is on all tables of the specified keyspace
+     * @param duration Duration of each scheduled sampling job in milliseconds
+     * @param interval Interval of each scheduled sampling job in milliseconds
+     * @param capacity Capacity of the sampler, higher for more accuracy
+     * @param count Number of the top samples to list
+     * @param samplers a list of samplers to enable
+     * @return true if the scheduled sampling is started successfully. Otherwise return false
+     */
+    public boolean startSamplingPartitions(String ks, String table, int duration, int interval, int capacity, int count, List<String> samplers) throws OpenDataException;
+
+    /**
+     * Stop a scheduled sampling
+     * @param ks Keyspace. Nullable. If null, the scheduled sampling is on all keyspaces and tables
+     * @param table Nullable. If null, the scheduled sampling is on all tables of the specified keyspace
+     * @return true if the scheduled sampling is stopped. False is returned if the sampling task is not found
+     */
+    public boolean stopSamplingPartitions(String ks, String table) throws OpenDataException;
+
+    /**
+     * @return a list of qualified table names that have active scheduled sampling tasks. The format of the name is `KEYSPACE.TABLE`.
+     * The wild card symbol (*) indicates all keyspaces/tables. For example, "*.*" indicates all tables in all keyspaces. "foo.*" indicates
+     * all tables under keyspace 'foo'. "foo.bar" indicates the scheduled sampling is enabled for the table 'bar'
+     */
+    public List<String> getSampleTasks();
+
     /**
      * Returns the configured tracing probability.
      */
diff --git a/src/java/org/apache/cassandra/tools/NodeProbe.java b/src/java/org/apache/cassandra/tools/NodeProbe.java
index d8dc2ab8e203..daf4eb251834 100644
--- a/src/java/org/apache/cassandra/tools/NodeProbe.java
+++ b/src/java/org/apache/cassandra/tools/NodeProbe.java
@@ -498,9 +498,29 @@ public void repairAsync(final PrintStream out, final String keyspace, Map<String
 
-    public Map<String, List<CompositeData>> getPartitionSample(int capacity, int durationMillis, int count, List<String> samplers) throws OpenDataException
+
+    public boolean handleScheduledSampling(String ks,
+                                           String table,
+                                           int capacity,
+                                           int count,
+                                           int durationMillis,
+                                           int intervalMillis,
+                                           List<String> samplers,
+                                           boolean shouldStop) throws OpenDataException
+    {
+        return shouldStop ?
+               ssProxy.stopSamplingPartitions(ks, table) :
+               ssProxy.startSamplingPartitions(ks, table, durationMillis, intervalMillis, capacity, count, samplers);
+    }
+
+    public List<String> getSampleTasks()
+    {
+        return ssProxy.getSampleTasks();
+    }
+
+    public Map<String, List<CompositeData>> getPartitionSample(String ks, int capacity, int durationMillis, int count, List<String> samplers) throws OpenDataException
     {
-        return ssProxy.samplePartitions(durationMillis, capacity, count, samplers);
+        return ssProxy.samplePartitions(ks, durationMillis, capacity, count, samplers);
     }
 
     public Map<String, List<CompositeData>> getPartitionSample(String ks, String cf, int capacity, int durationMillis, int count, List<String> samplers) throws OpenDataException
diff --git a/src/java/org/apache/cassandra/tools/nodetool/ProfileLoad.java b/src/java/org/apache/cassandra/tools/nodetool/ProfileLoad.java
index 487f14a0a0f4..45cade7560f3 100644
--- a/src/java/org/apache/cassandra/tools/nodetool/ProfileLoad.java
+++ b/src/java/org/apache/cassandra/tools/nodetool/ProfileLoad.java
@@ -17,36 +17,36 @@
  */
 package org.apache.cassandra.tools.nodetool;
 
-import static com.google.common.base.Preconditions.checkArgument;
-import static org.apache.commons.lang3.StringUtils.join;
-
 import java.io.PrintStream;
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.List;
 import java.util.Map;
+import java.util.Set;
 import java.util.concurrent.atomic.AtomicBoolean;
 import java.util.stream.Collectors;
-
 import javax.management.openmbean.CompositeData;
 import javax.management.openmbean.OpenDataException;
 
-import org.apache.cassandra.metrics.Sampler.SamplerType;
-import org.apache.cassandra.tools.NodeProbe;
-import org.apache.cassandra.tools.NodeTool.NodeToolCmd;
-import org.apache.cassandra.tools.nodetool.formatter.TableBuilder;
-import org.apache.cassandra.utils.Pair;
-
 import com.google.common.collect.Lists;
 import io.airlift.airline.Arguments;
 import io.airlift.airline.Command;
 import io.airlift.airline.Option;
+import org.apache.cassandra.metrics.Sampler.SamplerType;
+import org.apache.cassandra.metrics.SamplingManager;
+import org.apache.cassandra.tools.NodeProbe;
+import org.apache.cassandra.tools.NodeTool.NodeToolCmd;
+import org.apache.cassandra.utils.Pair;
+
+import static com.google.common.base.Preconditions.checkArgument;
+import static com.google.common.base.Preconditions.checkState;
+import static org.apache.commons.lang3.StringUtils.join;
 
 @Command(name = "profileload", description = "Low footprint profiling of activity for a period of time")
 public class ProfileLoad extends NodeToolCmd
 {
-    @Arguments(usage = "<keyspace> <cfname> <duration>", description = "The keyspace, column family name, and duration in milliseconds")
+    @Arguments(usage = "<keyspace> <cfname> <duration>", description = "The keyspace, column family name, and duration in milliseconds (Default: 10000)")
     private List<String> args = new ArrayList<>();
 
     @Option(name = "-s", description = "Capacity of the sampler, higher for more accuracy (Default: 256)")
@@ -58,27 +58,59 @@ public class ProfileLoad extends NodeToolCmd
     @Option(name = "-a", description = "Comma separated list of samplers to use (Default: all)")
     private String samplers = join(SamplerType.values(), ',');
 
+    @Option(name = {"-i", "--interval"}, description = "Schedule a new job that samples every interval milliseconds (Default: disabled) in the background")
+    private int intervalMillis = -1; // -1 for disabled.
+
+    @Option(name = {"-t", "--stop"}, description = "Stop the scheduled sampling job identified by <keyspace> and <table>. The job is fully stopped once its in-flight sampling completes.")
+    private boolean shouldStop = false;
+
+    @Option(name = {"-l", "--list"}, description = "List the scheduled sampling jobs")
+    private boolean shouldList = false;
+
     @Override
     public void execute(NodeProbe probe)
     {
-        checkArgument(args.size() == 3 || args.size() == 1 || args.size() == 0, "Invalid arguments, either [keyspace table duration] or [duration] or no args");
-        checkArgument(topCount < capacity, "TopK count (-k) option must be smaller then the summary capacity (-s)");
+        checkArgument(args.size() == 3 || args.size() == 2 || args.size() == 1 || args.size() == 0,
+                      "Invalid arguments, either [keyspace table/* duration] or [keyspace table/*] or [duration] or no args.\n" +
+                      "Optionally, use * to represent all tables under the keyspace.");
+        checkArgument(topCount > 0, "TopK count (-k) option must have positive value");
+        checkArgument(topCount < capacity,
+                      "TopK count (-k) option must be smaller than the summary capacity (-s)");
+        checkArgument(capacity <= 1024, "Capacity (-s) cannot exceed 1024.");
         String keyspace = null;
         String table = null;
-        Integer durationMillis = 10000;
-        if(args.size() == 3)
+        int durationMillis = 10000;
+        /* There are 3 possible outcomes after processing the args.
+         * - keyspace == null && table == null. We need to sample all tables
+         * - keyspace == KEYSPACE && table == *. We need to sample all tables under the specified KEYSPACE
+         * - keyspace == KEYSPACE && table == TABLE. Sample the specific KEYSPACE.table combination
+         */
+        if (args.size() == 3)
+        {
+            keyspace = args.get(0);
+            table = args.get(1);
+            durationMillis = Integer.parseInt(args.get(2));
+        }
+        else if (args.size() == 2)
         {
             keyspace = args.get(0);
             table = args.get(1);
-            durationMillis = Integer.valueOf(args.get(2));
         }
         else if (args.size() == 1)
        {
-            durationMillis = Integer.valueOf(args.get(0));
+            durationMillis = Integer.parseInt(args.get(0));
        }
+        keyspace = nullifyWildcard(keyspace);
+        table = nullifyWildcard(table);
+
+        checkArgument(durationMillis > 0, "Duration: %s must be positive", durationMillis);
+
+        checkArgument(!hasInterval() || intervalMillis >= durationMillis,
+                      "Invalid scheduled sampling interval. Expecting interval >= duration, but interval: %s ms; duration: %s ms",
+                      intervalMillis, durationMillis);
         // generate the list of samplers
         List<String> targets = Lists.newArrayList();
-        List<String> available = Arrays.stream(SamplerType.values()).map(Enum::toString).collect(Collectors.toList());
+        Set<String> available = Arrays.stream(SamplerType.values()).map(Enum::toString).collect(Collectors.toSet());
         for (String s : samplers.split(","))
         {
             String sampler = s.trim().toUpperCase();
@@ -86,108 +118,70 @@ else if (args.size() == 1)
             targets.add(sampler);
         }
 
+        PrintStream out = probe.output().out;
+
         Map<String, List<CompositeData>> results;
         try
         {
-            if (keyspace == null)
-                results = probe.getPartitionSample(capacity, durationMillis, topCount, targets);
+            // handle scheduled samplings, i.e. start or stop
+            if (hasInterval() || shouldStop)
+            {
+                // keyspace and table are nullable
+                boolean opSuccess = probe.handleScheduledSampling(keyspace, table, capacity, topCount, durationMillis, intervalMillis, targets, shouldStop);
+                if (!opSuccess)
+                {
+                    if (shouldStop)
+                        out.printf("Unable to stop the non-existent scheduled sampling for keyspace: %s, table: %s%n", keyspace, table);
+                    else
+                        out.printf("Unable to schedule sampling for keyspace: %s, table: %s due to existing samplings. 
" + + "Stop the existing sampling jobs first.%n", keyspace, table); + } + return; + } + else if (shouldList) + { + List> sampleTasks = new ArrayList<>(); + int maxKsLength = "KEYSPACE".length(); + int maxTblLength = "TABLE".length(); + for (String fullTableName : probe.getSampleTasks()) + { + String[] parts = fullTableName.split("\\."); + checkState(parts.length == 2, "Unable to parse the full table name: %s", fullTableName); + sampleTasks.add(Pair.create(parts[0], parts[1])); + maxKsLength = Math.max(maxKsLength, parts[0].length()); + } + // print the header line and put enough space between KEYSPACE AND TABLE. + String lineFormat = "%" + maxKsLength + "s %" + maxTblLength + "s%n"; + out.printf(lineFormat, "KEYSPACE", "TABLE"); + sampleTasks.forEach(pair -> out.printf(lineFormat, pair.left, pair.right)); + return; + } else - results = probe.getPartitionSample(keyspace, table, capacity, durationMillis, topCount, targets); - - } catch (OpenDataException e) + { + // blocking sample all the tables or all the tables under a keyspace + if (keyspace == null || table == null) + results = probe.getPartitionSample(keyspace, capacity, durationMillis, topCount, targets); + else // blocking sample the specific table + results = probe.getPartitionSample(keyspace, table, capacity, durationMillis, topCount, targets); + } + } + catch (OpenDataException e) { throw new RuntimeException(e); } AtomicBoolean first = new AtomicBoolean(true); - ResultBuilder rb = new ResultBuilder(first, results, targets); - - for(String sampler : Lists.newArrayList("READS", "WRITES", "CAS_CONTENTIONS")) - { - rb.forType(SamplerType.valueOf(sampler), "Frequency of " + sampler.toLowerCase().replaceAll("_", " ") + " by partition") - .addColumn("Table", "table") - .addColumn("Partition", "value") - .addColumn("Count", "count") - .addColumn("+/-", "error") - .print(probe.output().out); - } - - rb.forType(SamplerType.WRITE_SIZE, "Max mutation size by partition") - .addColumn("Table", "table") - .addColumn("Partition", "value") - .addColumn("Bytes", "count") - .print(probe.output().out); - - rb.forType(SamplerType.LOCAL_READ_TIME, "Longest read query times") - .addColumn("Query", "value") - .addColumn("Microseconds", "count") - .print(probe.output().out); + SamplingManager.ResultBuilder rb = new SamplingManager.ResultBuilder(first, results, targets); + out.println(SamplingManager.formatResult(rb)); } - private class ResultBuilder + private boolean hasInterval() { - private SamplerType type; - private String description; - private AtomicBoolean first; - private Map> results; - private List targets; - private List> dataKeys; - - public ResultBuilder(AtomicBoolean first, Map> results, List targets) - { - super(); - this.first = first; - this.results = results; - this.targets = targets; - this.dataKeys = new ArrayList<>(); - this.dataKeys.add(Pair.create(" ", " ")); - } - - public ResultBuilder forType(SamplerType type, String description) - { - ResultBuilder rb = new ResultBuilder(first, results, targets); - rb.type = type; - rb.description = description; - return rb; - } - - public ResultBuilder addColumn(String title, String key) - { - this.dataKeys.add(Pair.create(title, key)); - return this; - } - - private String get(CompositeData cd, String key) - { - if (cd.containsKey(key)) - return cd.get(key).toString(); - return key; - } + return intervalMillis != -1; + } - public void print(PrintStream outStream) - { - if (targets.contains(type.toString())) - { - if (!first.get()) - outStream.println(); - first.set(false); - 
outStream.println(description + ':'); - TableBuilder out = new TableBuilder(); - out.add(dataKeys.stream().map(p -> p.left).collect(Collectors.toList()).toArray(new String[] {})); - List topk = results.get(type.toString()); - for (CompositeData cd : topk) - { - out.add(dataKeys.stream().map(p -> get(cd, p.right)).collect(Collectors.toList()).toArray(new String[] {})); - } - if (topk.size() == 0) - { - outStream.println(" Nothing recorded during sampling period..."); - } - else - { - out.printTo(outStream); - } - } - } + private String nullifyWildcard(String input) + { + return input != null && input.equals("*") ? null : input; } } diff --git a/test/distributed/org/apache/cassandra/distributed/test/ProfileLoadTest.java b/test/distributed/org/apache/cassandra/distributed/test/ProfileLoadTest.java new file mode 100644 index 000000000000..a83ea7cedc74 --- /dev/null +++ b/test/distributed/org/apache/cassandra/distributed/test/ProfileLoadTest.java @@ -0,0 +1,151 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.distributed.test; + +import com.google.common.util.concurrent.Uninterruptibles; +import org.apache.cassandra.distributed.Cluster; +import org.apache.cassandra.distributed.api.ConsistencyLevel; +import org.apache.cassandra.distributed.api.IInvokableInstance; +import org.junit.Assert; +import org.junit.Test; + +import java.io.IOException; +import java.util.List; +import java.util.Random; +import java.util.concurrent.TimeUnit; + +public class ProfileLoadTest extends TestBaseImpl +{ + @Test + public void testScheduledSamplingTaskLogs() throws IOException + { + try (Cluster cluster = init(Cluster.build(1).start())) + { + cluster.schemaChange(withKeyspace("CREATE TABLE %s.tbl (pk int, ck int, v int, PRIMARY KEY (pk, ck));")); + + // start the scheduled profileload task that samples for 1 second and every second. + cluster.get(1).nodetoolResult("profileload", "1000", "-i", "1000").asserts().success(); + + Random rnd = new Random(); + // 800 * 2ms = 1.6 seconds. It logs every second. So it logs at least once. + for (int i = 0; i < 800; i++) + { + cluster.coordinator(1) + .execute(withKeyspace("INSERT INTO %s.tbl (pk, ck, v) VALUES (?,?,?)"), + ConsistencyLevel.QUORUM, rnd.nextInt(), rnd.nextInt(), i); + Uninterruptibles.sleepUninterruptibly(2, TimeUnit.MILLISECONDS); + } + // --list should display all active tasks. 
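+            // (For context, the full operator workflow this test exercises looks like the following
+            //  from a shell; flags are as defined in ProfileLoad above, keyspace/table names illustrative:
+            //    nodetool profileload                        -- one-off blocking sample, 10s default
+            //    nodetool profileload ks tbl 1000 -i 10000   -- schedule: sample 1s out of every 10s
+            //    nodetool profileload --list                 -- show active scheduled jobs
+            //    nodetool profileload ks tbl --stop          -- stop the ks.tbl job)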
+ String expectedOutput = String.format("KEYSPACE TABLE%n" + "%8s %5s", "*", "*"); + cluster.get(1).nodetoolResult("profileload", "--list") + .asserts() + .success() + .stdoutContains(expectedOutput); + + // loop assert the log contains the frequency readout; give this 15 seconds which should be plenty of time + // even on very badly underprovisioned environments + int timeout = 15; + boolean testPassed = false; + while (timeout-- > 0) + { + List freqHeadings = cluster.get(1) + .logs() + .grep("Frequency of (reads|writes|cas contentions) by partition") + .getResult(); + if (freqHeadings.size() > 3) + { + testPassed = true; + break; + } + Uninterruptibles.sleepUninterruptibly(1, TimeUnit.SECONDS); + } + Assert.assertTrue("The scheduled task should at least run and log once", testPassed); + + List startSamplingLogs = cluster.get(1) + .logs() + .grep("Starting to sample tables") + .getResult(); + Assert.assertTrue("It should start sampling at least once", startSamplingLogs.size() > 0); + + // stop the scheduled sampling + cluster.get(1).nodetoolResult("profileload", "--stop").asserts().success(); + + // wait for the last schedule to be stopped. --list should list nothing after stopping + assertListEmpty(cluster.get(1)); + + // schedule on the specific table + cluster.get(1).nodetoolResult("profileload", KEYSPACE, "tbl", "1000", "-i", "1000").asserts().success(); + expectedOutput = String.format("%" + KEYSPACE.length() + "s %5s%n" + + "%s %5s", + "KEYSPACE", "TABLE", + KEYSPACE, "tbl"); + cluster.get(1).nodetoolResult("profileload", "--list") + .asserts() + .success() + .stdoutContains(expectedOutput); + // stop all should stop the task scheduled with the specific table + cluster.get(1).nodetoolResult("profileload", "--stop") + .asserts().success(); + assertListEmpty(cluster.get(1)); + } + } + + @Test + public void testPreventDuplicatedSchedule() throws IOException + { + try (Cluster cluster = init(Cluster.build(1).start())) + { + cluster.schemaChange(withKeyspace("CREATE TABLE %s.tbl (pk int, ck int, v int, PRIMARY KEY (pk, ck));")); + + // New sampling; we are good + cluster.get(1).nodetoolResult("profileload", KEYSPACE, "tbl", "1000", "-i", "1000") + .asserts() + .success() + .stdoutNotContains("Unable to schedule sampling for keyspace"); + + // Duplicated sampling (against the same table) but different interval. 
Nodetool should reject + cluster.get(1).nodetoolResult("profileload", KEYSPACE, "tbl", "1000", "-i", "1000") + .asserts() + .success() + .stdoutContains("Unable to schedule sampling for keyspace"); + + // The "sampling all" request creates overlaps, so it should be rejected too + cluster.get(1).nodetoolResult("profileload", "1000", "-i", "1000") + .asserts() + .success() + .stdoutContains("Unable to schedule sampling for keyspace"); + + cluster.get(1).nodetoolResult("profileload", KEYSPACE, "tbl", "--stop").asserts().success(); + assertListEmpty(cluster.get(1)); + + cluster.get(1).nodetoolResult("profileload", "nonexistks", "nonexisttbl", "--stop") + .asserts() + .success() + .stdoutContains("Unable to stop the non-existent scheduled sampling"); + } + } + + private void assertListEmpty(IInvokableInstance instance) + { + Uninterruptibles.sleepUninterruptibly(1500, TimeUnit.MILLISECONDS); + Assert.assertEquals("--list should list nothing", + "KEYSPACE TABLE\n", + instance.nodetoolResult("profileload", "--list").getStdout()); + } +} diff --git a/test/unit/org/apache/cassandra/metrics/SamplerTest.java b/test/unit/org/apache/cassandra/metrics/SamplerTest.java index dba19a306432..862abbb05b6f 100644 --- a/test/unit/org/apache/cassandra/metrics/SamplerTest.java +++ b/test/unit/org/apache/cassandra/metrics/SamplerTest.java @@ -79,7 +79,12 @@ public boolean isEnabled() return true; } - public void beginSampling(int capacity, int durationMillis) + public boolean isActive() + { + return true; + } + + public void beginSampling(int capacity, long durationMillis) { } diff --git a/test/unit/org/apache/cassandra/tools/TopPartitionsTest.java b/test/unit/org/apache/cassandra/tools/TopPartitionsTest.java index 975ad15776cf..934d4d814c7b 100644 --- a/test/unit/org/apache/cassandra/tools/TopPartitionsTest.java +++ b/test/unit/org/apache/cassandra/tools/TopPartitionsTest.java @@ -18,16 +18,20 @@ package org.apache.cassandra.tools; +import java.util.Arrays; +import java.util.Collections; import java.util.List; import java.util.Map; import java.util.concurrent.ArrayBlockingQueue; import java.util.concurrent.BlockingQueue; import java.util.concurrent.Executors; import java.util.concurrent.TimeUnit; +import java.util.stream.Collectors; import javax.management.openmbean.CompositeData; import com.google.common.collect.Lists; +import com.google.common.util.concurrent.Uninterruptibles; import org.junit.BeforeClass; import org.junit.Test; @@ -36,12 +40,18 @@ import org.apache.cassandra.db.ColumnFamilyStore; import org.apache.cassandra.db.SystemKeyspace; import org.apache.cassandra.exceptions.ConfigurationException; +import org.apache.cassandra.metrics.Sampler; import org.apache.cassandra.service.StorageService; import static java.lang.String.format; import static org.apache.cassandra.cql3.QueryProcessor.executeInternal; import static org.junit.Assert.assertEquals; +import static org.junit.Assert.assertFalse; +import static org.junit.Assert.assertTrue; +/** + * Includes test cases for both the 'toppartitions' command and its successor 'profileload' + */ public class TopPartitionsTest { @BeforeClass @@ -59,7 +69,7 @@ public void testServiceTopPartitionsNoArg() throws Exception { try { - q.put(StorageService.instance.samplePartitions(1000, 100, 10, Lists.newArrayList("READS", "WRITES"))); + q.put(StorageService.instance.samplePartitions(null, 1000, 100, 10, Lists.newArrayList("READS", "WRITES"))); } catch (Exception e) { @@ -82,4 +92,30 @@ public void testServiceTopPartitionsSingleTable() throws Exception List result = 
ColumnFamilyStore.getIfExists("system", "local").finishLocalSampling("READS", 5); assertEquals("If this failed you probably have to raise the beginLocalSampling duration", 1, result.size()); } + + @Test + public void testStartAndStopScheduledSampling() + { + List allSamplers = Arrays.stream(Sampler.SamplerType.values()).map(Enum::toString).collect(Collectors.toList()); + StorageService ss = StorageService.instance; + + assertTrue("Scheduling new sampled tasks should be allowed", + ss.startSamplingPartitions(null, null, 10, 10, 100, 10, allSamplers)); + + assertEquals(Collections.singletonList("*.*"), ss.getSampleTasks()); + + assertFalse("Sampling with duplicate keys should be disallowed", + ss.startSamplingPartitions(null, null, 20, 20, 100, 10, allSamplers)); + + assertTrue("Existing scheduled sampling tasks should be cancellable", ss.stopSamplingPartitions(null, null)); + + int timeout = 10; + while (timeout-- > 0 && ss.getSampleTasks().size() > 0) + Uninterruptibles.sleepUninterruptibly(10, TimeUnit.MILLISECONDS); + + assertEquals("Scheduled sampled tasks should be removed", Collections.emptyList(), ss.getSampleTasks()); + + assertTrue("When nothing is scheduled, you should be able to stop all scheduled sampling tasks", + ss.stopSamplingPartitions(null, null)); + } } From c4b1c0614e42b4ea2064822d31c28aa5d4f1450a Mon Sep 17 00:00:00 2001 From: David Capwell Date: Fri, 19 Aug 2022 16:42:56 -0700 Subject: [PATCH 050/159] Read/Write/Truncate throw RequestFailure in a race condition with callback timeouts, should return Timeout instead patch by David Capwell; reviewed by Caleb Rackliffe for CASSANDRA-17828 --- CHANGES.txt | 1 + .../apache/cassandra/net/RequestCallback.java | 17 ++ .../service/AbstractWriteResponseHandler.java | 40 ++- .../service/TruncateResponseHandler.java | 29 ++- .../cassandra/service/reads/ReadCallback.java | 13 +- .../test/metrics/RequestTimeoutTest.java | 241 ++++++++++++++++++ .../cassandra/utils/AssertionUtils.java | 124 +++++++++ .../cassandra/utils/AssertionUtilsTest.java | 45 ++++ 8 files changed, 483 insertions(+), 27 deletions(-) create mode 100644 test/distributed/org/apache/cassandra/distributed/test/metrics/RequestTimeoutTest.java create mode 100644 test/unit/org/apache/cassandra/utils/AssertionUtils.java create mode 100644 test/unit/org/apache/cassandra/utils/AssertionUtilsTest.java diff --git a/CHANGES.txt b/CHANGES.txt index 36beb3c27f7b..3fd1a8c747aa 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -1,4 +1,5 @@ 4.2 + * Read/Write/Truncate throw RequestFailure in a race condition with callback timeouts, should return Timeout instead (CASSANDRA-17828) * Add ability to log load profiles at fixed intervals (CASSANDRA-17821) * Protect against Gossip backing up due to a quarantined endpoint without version information (CASSANDRA-17830) * NPE in org.apache.cassandra.cql3.Attributes.getTimeToLive (CASSANDRA-17822) diff --git a/src/java/org/apache/cassandra/net/RequestCallback.java b/src/java/org/apache/cassandra/net/RequestCallback.java index bd14cae1d04d..14e0169b858a 100644 --- a/src/java/org/apache/cassandra/net/RequestCallback.java +++ b/src/java/org/apache/cassandra/net/RequestCallback.java @@ -17,6 +17,8 @@ */ package org.apache.cassandra.net; +import java.util.Map; + import org.apache.cassandra.exceptions.RequestFailureReason; import org.apache.cassandra.locator.InetAddressAndPort; @@ -63,4 +65,19 @@ default boolean trackLatencyForSnitch() return false; } + static boolean isTimeout(Map failureReasonByEndpoint) + { + // The reason that all must be timeout to 
be called a timeout is as follows:
+        // Assume RF=6, QUORUM, and failureReasonByEndpoint.size() == 3
+        // R1 -> TIMEOUT
+        // R2 -> TIMEOUT
+        // R3 -> READ_TOO_MANY_TOMBSTONES
+        // Since we got a reply back, and that was a failure, we should return a failure letting the user know.
+        // When all failures are a timeout, then this is a race condition with
+        // org.apache.cassandra.utils.concurrent.Awaitable.await(long, java.util.concurrent.TimeUnit)
+        // The race is that the message expire path runs and expires all messages, which then causes the condition
+        // to signal, telling the caller "got all replies!".
+        return failureReasonByEndpoint.values().stream().allMatch(RequestFailureReason.TIMEOUT::equals);
+    }
+
 }
diff --git a/src/java/org/apache/cassandra/service/AbstractWriteResponseHandler.java b/src/java/org/apache/cassandra/service/AbstractWriteResponseHandler.java
index 4d75f19bca9d..76ad4c2ff8e0 100644
--- a/src/java/org/apache/cassandra/service/AbstractWriteResponseHandler.java
+++ b/src/java/org/apache/cassandra/service/AbstractWriteResponseHandler.java
@@ -17,12 +17,16 @@
  */
 package org.apache.cassandra.service;
 
+import java.util.HashMap;
+import java.util.HashSet;
 import java.util.List;
 import java.util.Map;
 import java.util.concurrent.ConcurrentHashMap;
 import java.util.concurrent.atomic.AtomicInteger;
 import java.util.concurrent.atomic.AtomicIntegerFieldUpdater;
+import java.util.function.Function;
 import java.util.function.Supplier;
+import java.util.stream.Collectors;
 
 import javax.annotation.Nullable;
 
@@ -113,34 +117,42 @@ public void get() throws WriteTimeoutException, WriteFailureException
     {
         long timeoutNanos = currentTimeoutNanos();
 
-        boolean success;
+        boolean signaled;
         try
         {
-            success = condition.await(timeoutNanos, NANOSECONDS);
+            signaled = condition.await(timeoutNanos, NANOSECONDS);
         }
         catch (InterruptedException e)
         {
             throw new UncheckedInterruptedException(e);
         }
 
-        if (!success)
-        {
-            int blockedFor = blockFor();
-            int acks = ackCount();
-            // It's pretty unlikely, but we can race between exiting await above and here, so
-            // that we could now have enough acks. In that case, we "lie" on the acks count to
-            // avoid sending confusing info to the user (see CASSANDRA-6491).
-            if (acks >= blockedFor)
-                acks = blockedFor - 1;
-            throw new WriteTimeoutException(writeType, replicaPlan.consistencyLevel(), acks, blockedFor);
-        }
+        if (!signaled)
+            throwTimeout();
 
         if (blockFor() + failures > candidateReplicaCount())
         {
-            throw new WriteFailureException(replicaPlan.consistencyLevel(), ackCount(), blockFor(), writeType, failureReasonByEndpoint);
+            if (RequestCallback.isTimeout(this.failureReasonByEndpoint.keySet().stream()
+                                              .filter(this::waitingFor) // DatacenterWriteResponseHandler filters errors from remote DCs
+                                              .collect(Collectors.toMap(Function.identity(), this.failureReasonByEndpoint::get))))
+                throwTimeout();
+
+            throw new WriteFailureException(replicaPlan.consistencyLevel(), ackCount(), blockFor(), writeType, this.failureReasonByEndpoint);
         }
     }
 
+    private void throwTimeout()
+    {
+        int blockedFor = blockFor();
+        int acks = ackCount();
+        // It's pretty unlikely, but we can race between exiting await above and here, so
+        // that we could now have enough acks. In that case, we "lie" on the acks count to
+        // avoid sending confusing info to the user (see CASSANDRA-6491).
+ if (acks >= blockedFor) + acks = blockedFor - 1; + throw new WriteTimeoutException(writeType, replicaPlan.consistencyLevel(), acks, blockedFor); + } + public final long currentTimeoutNanos() { long requestTimeout = writeType == COUNTER diff --git a/src/java/org/apache/cassandra/service/TruncateResponseHandler.java b/src/java/org/apache/cassandra/service/TruncateResponseHandler.java index 984ba5a10a9b..54b1241006d7 100644 --- a/src/java/org/apache/cassandra/service/TruncateResponseHandler.java +++ b/src/java/org/apache/cassandra/service/TruncateResponseHandler.java @@ -17,7 +17,9 @@ */ package org.apache.cassandra.service; -import java.net.InetAddress; +import java.util.HashMap; +import java.util.Map; +import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.TimeoutException; import java.util.concurrent.atomic.AtomicInteger; @@ -46,7 +48,7 @@ public class TruncateResponseHandler implements RequestCallback failureReasonByEndpoint = new ConcurrentHashMap<>(); public TruncateResponseHandler(int responseCount) { @@ -61,24 +63,31 @@ public TruncateResponseHandler(int responseCount) public void get() throws TimeoutException { long timeoutNanos = getTruncateRpcTimeout(NANOSECONDS) - (nanoTime() - start); - boolean completedInTime; + boolean signaled; try { - completedInTime = condition.await(timeoutNanos, NANOSECONDS); // TODO truncate needs a much longer timeout + signaled = condition.await(timeoutNanos, NANOSECONDS); // TODO truncate needs a much longer timeout } catch (InterruptedException e) { throw new UncheckedInterruptedException(e); } - if (!completedInTime) - { + if (!signaled) throw new TimeoutException("Truncate timed out - received only " + responses.get() + " responses"); - } - if (truncateFailingReplica != null) + if (!failureReasonByEndpoint.isEmpty()) { - throw new TruncateException("Truncate failed on replica " + truncateFailingReplica); + // clone to make sure no race condition happens + Map failureReasonByEndpoint = new HashMap<>(this.failureReasonByEndpoint); + if (RequestCallback.isTimeout(failureReasonByEndpoint)) + throw new TimeoutException("Truncate timed out - received only " + responses.get() + " responses"); + + StringBuilder sb = new StringBuilder("Truncate failed on "); + for (Map.Entry e : failureReasonByEndpoint.entrySet()) + sb.append("replica ").append(e.getKey()).append(" -> ").append(e.getValue()).append(", "); + sb.setLength(sb.length() - 2); + throw new TruncateException(sb.toString()); } } @@ -94,7 +103,7 @@ public void onResponse(Message message) public void onFailure(InetAddressAndPort from, RequestFailureReason failureReason) { // If the truncation hasn't succeeded on some replica, abort and indicate this back to the client. 
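+        // Each failing endpoint is now recorded together with its reason; get() later either reports them
+        // all, e.g. "Truncate failed on replica /127.0.0.2:7012 -> UNKNOWN, replica /127.0.0.3:7012 -> TIMEOUT",
+        // or converts an all-TIMEOUT map back into a TimeoutException (the addresses here are illustrative,
+        // not taken from a real run).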
- truncateFailingReplica = from.getAddress(); + failureReasonByEndpoint.put(from, failureReason); condition.signalAll(); } diff --git a/src/java/org/apache/cassandra/service/reads/ReadCallback.java b/src/java/org/apache/cassandra/service/reads/ReadCallback.java index e69e6bd2b96c..c25b1f0f0204 100644 --- a/src/java/org/apache/cassandra/service/reads/ReadCallback.java +++ b/src/java/org/apache/cassandra/service/reads/ReadCallback.java @@ -17,6 +17,7 @@ */ package org.apache.cassandra.service.reads; +import java.util.HashMap; import java.util.Map; import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.TimeUnit; @@ -120,6 +121,12 @@ public void awaitResults() throws ReadFailureException, ReadTimeoutException */ int received = resolver.responses.size(); boolean failed = failures > 0 && (blockFor > received || !resolver.isDataPresent()); + // If all messages came back as a TIMEOUT then signaled=true and failed=true. + // Need to distinguish between a timeout and a failure (network, bad data, etc.), so store an extra field. + // see CASSANDRA-17828 + boolean timedout = !signaled; + if (failed) + timedout = RequestCallback.isTimeout(new HashMap<>(failureReasonByEndpoint)); WarningContext warnings = warningContext; // save the snapshot so abort state is not changed between now and when mayAbort gets called WarningsSnapshot snapshot = null; @@ -138,19 +145,19 @@ public void awaitResults() throws ReadFailureException, ReadTimeoutException if (isTracing()) { String gotData = received > 0 ? (resolver.isDataPresent() ? " (including data)" : " (only digests)") : ""; - Tracing.trace("{}; received {} of {} responses{}", failed ? "Failed" : "Timed out", received, blockFor, gotData); + Tracing.trace("{}; received {} of {} responses{}", !timedout ? "Failed" : "Timed out", received, blockFor, gotData); } else if (logger.isDebugEnabled()) { String gotData = received > 0 ? (resolver.isDataPresent() ? " (including data)" : " (only digests)") : ""; - logger.debug("{}; received {} of {} responses{}", failed ? "Failed" : "Timed out", received, blockFor, gotData); + logger.debug("{}; received {} of {} responses{}", !timedout ? "Failed" : "Timed out", received, blockFor, gotData); } if (snapshot != null) snapshot.maybeAbort(command, replicaPlan().consistencyLevel(), received, blockFor, resolver.isDataPresent(), failureReasonByEndpoint); // Same as for writes, see AbstractWriteResponseHandler - throw failed + throw !timedout ? new ReadFailureException(replicaPlan().consistencyLevel(), received, blockFor, resolver.isDataPresent(), failureReasonByEndpoint) : new ReadTimeoutException(replicaPlan().consistencyLevel(), received, blockFor, resolver.isDataPresent()); } diff --git a/test/distributed/org/apache/cassandra/distributed/test/metrics/RequestTimeoutTest.java b/test/distributed/org/apache/cassandra/distributed/test/metrics/RequestTimeoutTest.java new file mode 100644 index 000000000000..2799fca66a9c --- /dev/null +++ b/test/distributed/org/apache/cassandra/distributed/test/metrics/RequestTimeoutTest.java @@ -0,0 +1,241 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.distributed.test.metrics; + +import java.io.IOException; +import java.lang.reflect.InvocationTargetException; +import java.lang.reflect.Method; +import java.util.Map; +import java.util.concurrent.Callable; +import java.util.concurrent.TimeUnit; +import java.util.concurrent.TimeoutException; +import java.util.concurrent.atomic.AtomicInteger; + +import org.junit.AfterClass; +import org.junit.Before; +import org.junit.BeforeClass; +import org.junit.Test; + +import net.bytebuddy.ByteBuddy; +import net.bytebuddy.dynamic.loading.ClassLoadingStrategy; +import net.bytebuddy.implementation.MethodDelegation; +import net.bytebuddy.implementation.bind.annotation.SuperCall; +import net.bytebuddy.implementation.bind.annotation.SuperMethod; +import net.bytebuddy.implementation.bind.annotation.This; +import org.apache.cassandra.config.Config; +import org.apache.cassandra.cql3.statements.BatchStatement; +import org.apache.cassandra.distributed.Cluster; +import org.apache.cassandra.distributed.api.ConsistencyLevel; +import org.apache.cassandra.distributed.test.TestBaseImpl; +import org.apache.cassandra.exceptions.RequestFailureReason; +import org.apache.cassandra.locator.InetAddressAndPort; +import org.apache.cassandra.net.MessagingService; +import org.apache.cassandra.net.RequestCallback; +import org.apache.cassandra.utils.AssertionUtils; +import org.apache.cassandra.exceptions.CasWriteTimeoutException; +import org.apache.cassandra.exceptions.ReadTimeoutException; +import org.apache.cassandra.exceptions.WriteTimeoutException; +import org.apache.cassandra.net.Verb; +import org.apache.cassandra.service.paxos.Paxos; +import org.apache.cassandra.utils.concurrent.Awaitable; +import org.apache.cassandra.utils.concurrent.Condition; +import org.assertj.core.api.Assertions; + +import static net.bytebuddy.matcher.ElementMatchers.named; +import static net.bytebuddy.matcher.ElementMatchers.takesArguments; +import static org.apache.cassandra.utils.AssertionUtils.isThrowable; + +public class RequestTimeoutTest extends TestBaseImpl +{ + private static final AtomicInteger NEXT = new AtomicInteger(0); + public static final int COORDINATOR = 1; + private static Cluster CLUSTER; + + @BeforeClass + public static void init() throws IOException + { + CLUSTER = Cluster.build(3) + .withConfig(c -> c.set("truncate_request_timeout", "10s")) + .withInstanceInitializer(BB::install) + .start(); + init(CLUSTER); + CLUSTER.schemaChange(withKeyspace("CREATE TABLE %s.tbl (pk int PRIMARY KEY, v int)")); + } + + @AfterClass + public static void cleanup() + { + if (CLUSTER != null) + CLUSTER.close(); + } + + @Before + public void before() + { + CLUSTER.get(COORDINATOR).runOnInstance(() -> MessagingService.instance().callbacks.unsafeClear()); + CLUSTER.filters().reset(); + BB.reset(); + } + + @Test + public void insert() + { + CLUSTER.filters().verbs(Verb.MUTATION_REQ.id).to(2).drop(); + Assertions.assertThatThrownBy(() -> CLUSTER.coordinator(COORDINATOR).execute(withKeyspace("INSERT INTO %s.tbl (pk, v) VALUES (?, ?)"), ConsistencyLevel.ALL, NEXT.getAndIncrement(), NEXT.getAndIncrement())) + 
.is(isThrowable(WriteTimeoutException.class)); + BB.assertIsTimeoutTrue(); + } + + @Test + public void update() + { + CLUSTER.filters().verbs(Verb.MUTATION_REQ.id).to(2).drop(); + Assertions.assertThatThrownBy(() -> CLUSTER.coordinator(COORDINATOR).execute(withKeyspace("UPDATE %s.tbl SET v=? WHERE pk=?"), ConsistencyLevel.ALL, NEXT.getAndIncrement(), NEXT.getAndIncrement())) + .is(isThrowable(WriteTimeoutException.class)); + BB.assertIsTimeoutTrue(); + } + + @Test + public void batchInsert() + { + CLUSTER.filters().verbs(Verb.MUTATION_REQ.id).to(2).drop(); + Assertions.assertThatThrownBy(() -> CLUSTER.coordinator(COORDINATOR).execute(batch(withKeyspace("INSERT INTO %s.tbl (pk, v) VALUES (?, ?)")), ConsistencyLevel.ALL, NEXT.getAndIncrement(), NEXT.getAndIncrement())) + .is(isThrowable(WriteTimeoutException.class)); + BB.assertIsTimeoutTrue(); + } + + @Test + public void rangeSelect() + { + CLUSTER.filters().verbs(Verb.RANGE_REQ.id).to(2).drop(); + Assertions.assertThatThrownBy(() -> CLUSTER.coordinator(COORDINATOR).execute(withKeyspace("SELECT * FROM %s.tbl"), ConsistencyLevel.ALL)) + .is(isThrowable(ReadTimeoutException.class)); + BB.assertIsTimeoutTrue(); + } + + @Test + public void select() + { + CLUSTER.filters().verbs(Verb.READ_REQ.id).to(2).drop(); + Assertions.assertThatThrownBy(() -> CLUSTER.coordinator(COORDINATOR).execute(withKeyspace("SELECT * FROM %s.tbl WHERE pk=?"), ConsistencyLevel.ALL, NEXT.getAndIncrement())) + .is(isThrowable(ReadTimeoutException.class)); + BB.assertIsTimeoutTrue(); + } + + @Test + public void truncate() + { + CLUSTER.filters().verbs(Verb.TRUNCATE_REQ.id).to(2).drop(); + Assertions.assertThatThrownBy(() -> CLUSTER.coordinator(COORDINATOR).execute(withKeyspace("TRUNCATE %s.tbl"), ConsistencyLevel.ALL)) + .is(AssertionUtils.rootCauseIs(TimeoutException.class)); + BB.assertIsTimeoutTrue(); + } + + // don't call BB.assertIsTimeoutTrue(); for CAS, as it has its own logic + + @Test + public void casV2PrepareInsert() + { + withPaxos(Config.PaxosVariant.v2); + + CLUSTER.filters().verbs(Verb.PAXOS2_PREPARE_REQ.id).to(2, 3).drop(); + Assertions.assertThatThrownBy(() -> CLUSTER.coordinator(COORDINATOR).execute(withKeyspace("INSERT INTO %s.tbl (pk, v) VALUES (?, ?) IF NOT EXISTS"), ConsistencyLevel.ALL, NEXT.getAndIncrement(), NEXT.getAndIncrement())) + .is(isThrowable(CasWriteTimeoutException.class)); + } + + @Test + public void casV2PrepareSelect() + { + withPaxos(Config.PaxosVariant.v2); + + CLUSTER.filters().verbs(Verb.PAXOS2_PREPARE_REQ.id).to(2, 3).drop(); + Assertions.assertThatThrownBy(() -> CLUSTER.coordinator(COORDINATOR).execute(withKeyspace("SELECT * FROM %s.tbl WHERE pk=?"), ConsistencyLevel.SERIAL, NEXT.getAndIncrement())) + .is(isThrowable(ReadTimeoutException.class)); // why does write have its own type but not read? + } + + @Test + public void casV2CommitInsert() + { + withPaxos(Config.PaxosVariant.v2); + + CLUSTER.filters().verbs(Verb.PAXOS_COMMIT_REQ.id).to(2, 3).drop(); + Assertions.assertThatThrownBy(() -> CLUSTER.coordinator(COORDINATOR).execute(withKeyspace("INSERT INTO %s.tbl (pk, v) VALUES (?, ?) 
IF NOT EXISTS"), ConsistencyLevel.ALL, NEXT.getAndIncrement(), NEXT.getAndIncrement()))
+ .is(isThrowable(CasWriteTimeoutException.class));
+ }
+
+ private static void withPaxos(Config.PaxosVariant variant)
+ {
+ CLUSTER.forEach(i -> i.runOnInstance(() -> Paxos.setPaxosVariant(variant)));
+ }
+
+ private static String batch(String cql)
+ {
+ return "BEGIN " + BatchStatement.Type.UNLOGGED.name() + " BATCH\n" + cql + "\nAPPLY BATCH";
+ }
+
+ public static class BB
+ {
+ public static void install(ClassLoader cl, int num)
+ {
+ if (num != COORDINATOR)
+ return;
+ new ByteBuddy().rebase(Condition.Async.class)
+ .method(named("await").and(takesArguments(2)))
+ .intercept(MethodDelegation.to(BB.class))
+ .make()
+ .load(cl, ClassLoadingStrategy.Default.INJECTION);
+
+ new ByteBuddy().rebase(RequestCallback.class)
+ .method(named("isTimeout"))
+ .intercept(MethodDelegation.to(BB.class))
+ .make()
+ .load(cl, ClassLoadingStrategy.Default.INJECTION);
+ }
+
+ public static boolean await(long time, TimeUnit units, @This Awaitable self, @SuperMethod Method method) throws InterruptedException, InvocationTargetException, IllegalAccessException
+ {
+ // make sure that the underlying condition is met before returning true;
+ // this way it is known that the timeouts actually triggered!
+ while (!((boolean) method.invoke(self, time, units)))
+ {
+ }
+ return true;
+ }
+
+ private static final AtomicInteger TIMEOUTS = new AtomicInteger(0);
+ public static boolean isTimeout(Map failureReasonByEndpoint, @SuperCall Callable fn) throws Exception
+ {
+ boolean timeout = fn.call();
+ if (timeout)
+ TIMEOUTS.incrementAndGet();
+ return timeout;
+ }
+
+ public static void assertIsTimeoutTrue()
+ {
+ int timeouts = CLUSTER.get(COORDINATOR).callOnInstance(() -> TIMEOUTS.getAndSet(0));
+ Assertions.assertThat(timeouts).isGreaterThan(0);
+ }
+
+ public static void reset()
+ {
+ CLUSTER.get(COORDINATOR).runOnInstance(() -> TIMEOUTS.set(0));
+ }
+ }
+}
diff --git a/test/unit/org/apache/cassandra/utils/AssertionUtils.java b/test/unit/org/apache/cassandra/utils/AssertionUtils.java
new file mode 100644
index 000000000000..d5b1981fc142
--- /dev/null
+++ b/test/unit/org/apache/cassandra/utils/AssertionUtils.java
@@ -0,0 +1,124 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.cassandra.utils;
+
+import com.google.common.base.Throwables;
+
+import org.assertj.core.api.Condition;
+
+public class AssertionUtils
+{
+ private AssertionUtils()
+ {
+ }
+
+ /**
+ * When working with jvm-dtest the thrown error is in a different {@link ClassLoader} causing type checks
+ * to fail; this method relies on naming instead.
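+ *
+ * For example, {@code is(ReadTimeoutException.class)} still matches a ReadTimeoutException thrown by an
+ * in-jvm dtest node in another classloader, because only the canonical class names are compared.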
+ */
+ public static Condition is(Class klass)
+ {
+ String name = klass.getCanonicalName();
+ return new Condition() {
+ @Override
+ public boolean matches(Object value)
+ {
+ return value.getClass().getCanonicalName().equals(name);
+ }
+
+ @Override
+ public String toString()
+ {
+ return name;
+ }
+ };
+ }
+
+ public static Condition isThrowable(Class klass)
+ {
+ // org.assertj.core.api.AbstractAssert.is has a Condition parameter bounded to the asserted type,
+ // which blocks more specific conditions, so need to always return Throwable
+ return (Condition) (Condition) is(klass);
+ }
+
+ /**
+ * When working with jvm-dtest the thrown error is in a different {@link ClassLoader} causing type checks
+ * to fail; this method relies on naming instead.
+ *
+ * This method is different than {@link #is(Class)} as it tries to mimic instanceOf rather than equality.
+ */
+ public static Condition isInstanceof(Class klass)
+ {
+ String name = klass.getCanonicalName();
+ return new Condition() {
+ @Override
+ public boolean matches(Object value)
+ {
+ if (value == null)
+ return false;
+ return matches(value.getClass());
+ }
+
+ private boolean matches(Class input)
+ {
+ for (Class klass = input; klass != null; klass = klass.getSuperclass())
+ {
+ // extends
+ if (klass.getCanonicalName().equals(name))
+ return true;
+ // implements
+ for (Class i : klass.getInterfaces())
+ {
+ if (matches(i))
+ return true;
+ }
+ }
+ return false;
+ }
+
+ @Override
+ public String toString()
+ {
+ return name;
+ }
+ };
+ }
+
+ public static Condition rootCause(Condition other)
+ {
+ return new Condition() {
+ @Override
+ public boolean matches(Throwable value)
+ {
+ return other.matches(Throwables.getRootCause(value));
+ }
+
+ @Override
+ public String toString()
+ {
+ return "Root cause " + other;
+ }
+ };
+ }
+
+ public static Condition rootCauseIs(Class klass)
+ {
+ return rootCause((Condition) (Condition) is(klass));
+ }
+}
diff --git a/test/unit/org/apache/cassandra/utils/AssertionUtilsTest.java b/test/unit/org/apache/cassandra/utils/AssertionUtilsTest.java
new file mode 100644
index 000000000000..e3ec93ab486b
--- /dev/null
+++ b/test/unit/org/apache/cassandra/utils/AssertionUtilsTest.java
@@ -0,0 +1,45 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ + +package org.apache.cassandra.utils; + +import org.junit.Test; + +import org.assertj.core.api.Assertions; + +public class AssertionUtilsTest +{ + @Test + public void isInstanceof() + { + Assertions.assertThat(new C()) + .is(AssertionUtils.isInstanceof(A.class)); + + Assertions.assertThat(new D()) + .is(AssertionUtils.isInstanceof(A.class)) + .is(AssertionUtils.isInstanceof(B.class)); + + Assertions.assertThat(null instanceof A) + .isEqualTo(AssertionUtils.isInstanceof(A.class).matches(null)); + } + + interface A {} + interface B extends A {} + static class C implements A {} + static class D implements B {} +} \ No newline at end of file From e7c9ac05f99cc8a5ee958169c49326e85ab4b25b Mon Sep 17 00:00:00 2001 From: Stefan Miklosovic Date: Fri, 19 Aug 2022 16:50:49 +0200 Subject: [PATCH 051/159] Introduce target directory to vtable output for sstable_tasks and for compactionstats patch by Stefan Miklosovic; reviewed by Brandon Williams for CASSANDRA-13010 Co-authored-by: Alex Lourie --- CHANGES.txt | 1 + .../cassandra/cache/AutoSavingCache.java | 3 +- .../db/compaction/CompactionInfo.java | 52 +++++-- .../db/compaction/CompactionIterator.java | 12 +- .../db/compaction/CompactionManager.java | 4 + .../db/compaction/CompactionTask.java | 1 + .../cassandra/db/compaction/Scrubber.java | 4 +- .../cassandra/db/compaction/Upgrader.java | 1 + .../writers/CompactionAwareWriter.java | 11 +- .../writers/DefaultCompactionWriter.java | 2 + .../writers/MajorLeveledCompactionWriter.java | 3 +- .../writers/MaxSSTableSizeWriter.java | 1 - .../SplittingSizeTieredCompactionWriter.java | 6 +- .../db/virtual/SSTableTasksTable.java | 5 +- .../index/sasi/SASIIndexBuilder.java | 7 +- .../tools/nodetool/CompactionStats.java | 9 +- .../db/lifecycle/RealTransactionsTest.java | 1 + .../db/virtual/SSTableTasksTableTest.java | 7 +- .../tools/nodetool/CompactionStatsTest.java | 146 +++++++++++------- 19 files changed, 193 insertions(+), 83 deletions(-) diff --git a/CHANGES.txt b/CHANGES.txt index 3fd1a8c747aa..43b68541db6a 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -1,4 +1,5 @@ 4.2 + * Introduce target directory to vtable output for sstable_tasks and for compactionstats (CASSANDRA-13010) * Read/Write/Truncate throw RequestFailure in a race condition with callback timeouts, should return Timeout instead (CASSANDRA-17828) * Add ability to log load profiles at fixed intervals (CASSANDRA-17821) * Protect against Gossip backing up due to a quarantined endpoint without version information (CASSANDRA-17830) diff --git a/src/java/org/apache/cassandra/cache/AutoSavingCache.java b/src/java/org/apache/cassandra/cache/AutoSavingCache.java index 1f383ec564b6..f2e59bafbec1 100644 --- a/src/java/org/apache/cassandra/cache/AutoSavingCache.java +++ b/src/java/org/apache/cassandra/cache/AutoSavingCache.java @@ -313,7 +313,8 @@ else if (cacheType == CacheService.CacheType.COUNTER_CACHE) 0, keysEstimate, Unit.KEYS, - nextTimeUUID()); + nextTimeUUID(), + getCacheDataPath(CURRENT_VERSION).toPath().toString()); } public CacheService.CacheType cacheType() diff --git a/src/java/org/apache/cassandra/db/compaction/CompactionInfo.java b/src/java/org/apache/cassandra/db/compaction/CompactionInfo.java index 513adfa489db..00f583dd24c9 100644 --- a/src/java/org/apache/cassandra/db/compaction/CompactionInfo.java +++ b/src/java/org/apache/cassandra/db/compaction/CompactionInfo.java @@ -28,6 +28,7 @@ import com.google.common.collect.ImmutableSet; import org.apache.cassandra.io.sstable.format.SSTableReader; +import org.apache.cassandra.io.util.File; 
import org.apache.cassandra.schema.TableMetadata; import org.apache.cassandra.utils.TimeUUID; @@ -42,6 +43,7 @@ public final class CompactionInfo public static final String UNIT = "unit"; public static final String COMPACTION_ID = "compactionId"; public static final String SSTABLES = "sstables"; + public static final String TARGET_DIRECTORY = "targetDirectory"; private final TableMetadata metadata; private final OperationType tasktype; @@ -50,13 +52,9 @@ public final class CompactionInfo private final Unit unit; private final TimeUUID compactionId; private final ImmutableSet sstables; + private final String targetDirectory; - public CompactionInfo(TableMetadata metadata, OperationType tasktype, long bytesComplete, long totalBytes, TimeUUID compactionId, Collection sstables) - { - this(metadata, tasktype, bytesComplete, totalBytes, Unit.BYTES, compactionId, sstables); - } - - private CompactionInfo(TableMetadata metadata, OperationType tasktype, long completed, long total, Unit unit, TimeUUID compactionId, Collection sstables) + public CompactionInfo(TableMetadata metadata, OperationType tasktype, long completed, long total, Unit unit, TimeUUID compactionId, Collection sstables, String targetDirectory) { this.tasktype = tasktype; this.completed = completed; @@ -65,21 +63,41 @@ private CompactionInfo(TableMetadata metadata, OperationType tasktype, long comp this.unit = unit; this.compactionId = compactionId; this.sstables = ImmutableSet.copyOf(sstables); + this.targetDirectory = targetDirectory; + } + + public CompactionInfo(TableMetadata metadata, OperationType tasktype, long completed, long total, TimeUUID compactionId, Collection sstables, String targetDirectory) + { + this(metadata, tasktype, completed, total, Unit.BYTES, compactionId, sstables, targetDirectory); + } + + public CompactionInfo(TableMetadata metadata, OperationType tasktype, long completed, long total, TimeUUID compactionId, Collection sstables) + { + this(metadata, tasktype, completed, total, Unit.BYTES, compactionId, sstables, null); } /** - * Special compaction info where we always need to cancel the compaction - for example ViewBuilderTask and AutoSavingCache where we don't know + * Special compaction info where we always need to cancel the compaction - for example ViewBuilderTask where we don't know * the sstables at construction */ public static CompactionInfo withoutSSTables(TableMetadata metadata, OperationType tasktype, long completed, long total, Unit unit, TimeUUID compactionId) { - return new CompactionInfo(metadata, tasktype, completed, total, unit, compactionId, ImmutableSet.of()); + return withoutSSTables(metadata, tasktype, completed, total, unit, compactionId, null); + } + + /** + * Special compaction info where we always need to cancel the compaction - for example AutoSavingCache where we don't know + * the sstables at construction + */ + public static CompactionInfo withoutSSTables(TableMetadata metadata, OperationType tasktype, long completed, long total, Unit unit, TimeUUID compactionId, String targetDirectory) + { + return new CompactionInfo(metadata, tasktype, completed, total, unit, compactionId, ImmutableSet.of(), targetDirectory); } /** @return A copy of this CompactionInfo with updated progress. 
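 * The returned copy keeps this task's metadata, operation type, unit, compaction id, sstables and
 * target directory; only the completed and total counters change.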
*/ public CompactionInfo forProgress(long complete, long total) { - return new CompactionInfo(metadata, tasktype, complete, total, unit, compactionId, sstables); + return new CompactionInfo(metadata, tasktype, complete, total, unit, compactionId, sstables, targetDirectory); } public Optional getKeyspace() @@ -127,6 +145,21 @@ public Set getSSTables() return sstables; } + public String targetDirectory() + { + if (targetDirectory == null) + return ""; + + try + { + return new File(targetDirectory).canonicalPath(); + } + catch (Throwable t) + { + throw new RuntimeException("Unable to resolve canonical path for " + targetDirectory); + } + } + @Override public String toString() { @@ -155,6 +188,7 @@ public Map asMap() ret.put(UNIT, unit.toString()); ret.put(COMPACTION_ID, compactionId == null ? "" : compactionId.toString()); ret.put(SSTABLES, Joiner.on(',').join(sstables)); + ret.put(TARGET_DIRECTORY, targetDirectory()); return ret; } diff --git a/src/java/org/apache/cassandra/db/compaction/CompactionIterator.java b/src/java/org/apache/cassandra/db/compaction/CompactionIterator.java index a0dc0875d775..2f79f92b84e8 100644 --- a/src/java/org/apache/cassandra/db/compaction/CompactionIterator.java +++ b/src/java/org/apache/cassandra/db/compaction/CompactionIterator.java @@ -78,11 +78,13 @@ public class CompactionIterator extends CompactionInfo.Holder implements Unfilte private final ImmutableSet sstables; private final int nowInSec; private final TimeUUID compactionId; - private final long totalBytes; private long bytesRead; private long totalSourceCQLRows; + // Keep targetDirectory for compactions, needed for `nodetool compactionstats` + private String targetDirectory; + /* * counters for merged rows. * array index represents (number of merged rows - 1), so index 0 is counter for no merge (1 row), @@ -151,7 +153,8 @@ public CompactionInfo getCompactionInfo() bytesRead, totalBytes, compactionId, - sstables); + sstables, + targetDirectory); } public boolean isGlobal() @@ -159,6 +162,11 @@ public boolean isGlobal() return false; } + public void setTargetDirectory(final String targetDirectory) + { + this.targetDirectory = targetDirectory; + } + private void updateCounterFor(int rows) { assert rows > 0 && rows - 1 < mergeCounters.length; diff --git a/src/java/org/apache/cassandra/db/compaction/CompactionManager.java b/src/java/org/apache/cassandra/db/compaction/CompactionManager.java index 5c06ee275c1b..c843adfda0fc 100644 --- a/src/java/org/apache/cassandra/db/compaction/CompactionManager.java +++ b/src/java/org/apache/cassandra/db/compaction/CompactionManager.java @@ -1355,6 +1355,7 @@ private void doCleanupOne(final ColumnFamilyStore cfs, while (ci.hasNext()) { + ci.setTargetDirectory(writer.currentWriter().getFilename()); try (UnfilteredRowIterator partition = ci.next(); UnfilteredRowIterator notCleaned = cleanupStrategy.cleanup(partition)) { @@ -1709,15 +1710,18 @@ public void close() {} if (fullChecker.test(token)) { fullWriter.append(partition); + ci.setTargetDirectory(fullWriter.currentWriter().getFilename()); } else if (transChecker.test(token)) { transWriter.append(partition); + ci.setTargetDirectory(transWriter.currentWriter().getFilename()); } else { // otherwise, append it to the unrepaired sstable unrepairedWriter.append(partition); + ci.setTargetDirectory(unrepairedWriter.currentWriter().getFilename()); } long bytesScanned = scanners.getTotalBytesScanned(); compactionRateLimiterAcquire(limiter, bytesScanned, lastBytesScanned, compressionRatio); diff --git 
a/src/java/org/apache/cassandra/db/compaction/CompactionTask.java b/src/java/org/apache/cassandra/db/compaction/CompactionTask.java index dc08f5ae01ed..a2c5a77a8550 100644 --- a/src/java/org/apache/cassandra/db/compaction/CompactionTask.java +++ b/src/java/org/apache/cassandra/db/compaction/CompactionTask.java @@ -199,6 +199,7 @@ public boolean apply(SSTableReader sstable) if (writer.append(ci.next())) totalKeysWritten++; + ci.setTargetDirectory(writer.getSStableDirectory().path()); long bytesScanned = scanners.getTotalBytesScanned(); // Rate limit the scanners, and account for compression diff --git a/src/java/org/apache/cassandra/db/compaction/Scrubber.java b/src/java/org/apache/cassandra/db/compaction/Scrubber.java index c8518ce5e875..ee877c6782b6 100644 --- a/src/java/org/apache/cassandra/db/compaction/Scrubber.java +++ b/src/java/org/apache/cassandra/db/compaction/Scrubber.java @@ -21,6 +21,7 @@ import java.io.IOException; import java.nio.ByteBuffer; +import java.nio.file.Paths; import java.util.*; import java.util.concurrent.locks.Lock; import java.util.concurrent.locks.ReadWriteLock; @@ -512,7 +513,8 @@ public CompactionInfo getCompactionInfo() dataFile.getFilePointer(), dataFile.length(), scrubCompactionId, - ImmutableSet.of(sstable)); + ImmutableSet.of(sstable), + Paths.get(sstable.getFilename()).getParent().toString()); } catch (Exception e) { diff --git a/src/java/org/apache/cassandra/db/compaction/Upgrader.java b/src/java/org/apache/cassandra/db/compaction/Upgrader.java index 87bf5b811c2d..d1a367711835 100644 --- a/src/java/org/apache/cassandra/db/compaction/Upgrader.java +++ b/src/java/org/apache/cassandra/db/compaction/Upgrader.java @@ -94,6 +94,7 @@ public void upgrade(boolean keepOriginals) { writer.switchWriter(createCompactionWriter(sstable.getSSTableMetadata())); while (iter.hasNext()) + iter.setTargetDirectory(writer.currentWriter().getFilename()); writer.append(iter.next()); writer.finish(); diff --git a/src/java/org/apache/cassandra/db/compaction/writers/CompactionAwareWriter.java b/src/java/org/apache/cassandra/db/compaction/writers/CompactionAwareWriter.java index 02c7b6ce448c..c375c85650c3 100644 --- a/src/java/org/apache/cassandra/db/compaction/writers/CompactionAwareWriter.java +++ b/src/java/org/apache/cassandra/db/compaction/writers/CompactionAwareWriter.java @@ -18,6 +18,7 @@ package org.apache.cassandra.db.compaction.writers; +import java.io.IOException; import java.util.Collection; import java.util.List; import java.util.Set; @@ -39,7 +40,7 @@ import org.apache.cassandra.utils.FBUtilities; import org.apache.cassandra.utils.TimeUUID; import org.apache.cassandra.utils.concurrent.Transactional; -import org.apache.cassandra.db.compaction.OperationType; +import org.apache.cassandra.io.util.File; /** @@ -65,6 +66,9 @@ public abstract class CompactionAwareWriter extends Transactional.AbstractTransa private final List diskBoundaries; private int locationIndex; + // Keep targetDirectory for compactions, needed for `nodetool compactionstats` + protected Directories.DataDirectory sstableDirectory; + public CompactionAwareWriter(ColumnFamilyStore cfs, Directories directories, LifecycleTransaction txn, @@ -136,6 +140,11 @@ public final boolean append(UnfilteredRowIterator partition) return realAppend(partition); } + public final File getSStableDirectory() throws IOException + { + return getDirectories().getLocationForDisk(sstableDirectory); + } + @Override protected Throwable doPostCleanup(Throwable accumulate) { diff --git 
a/src/java/org/apache/cassandra/db/compaction/writers/DefaultCompactionWriter.java b/src/java/org/apache/cassandra/db/compaction/writers/DefaultCompactionWriter.java index 6180f96100b5..dfd771afdd85 100644 --- a/src/java/org/apache/cassandra/db/compaction/writers/DefaultCompactionWriter.java +++ b/src/java/org/apache/cassandra/db/compaction/writers/DefaultCompactionWriter.java @@ -67,6 +67,8 @@ public boolean realAppend(UnfilteredRowIterator partition) @Override public void switchCompactionLocation(Directories.DataDirectory directory) { + sstableDirectory = directory; + @SuppressWarnings("resource") SSTableWriter writer = SSTableWriter.create(cfs.newSSTableDescriptor(getDirectories().getLocationForDisk(directory)), estimatedTotalKeys, diff --git a/src/java/org/apache/cassandra/db/compaction/writers/MajorLeveledCompactionWriter.java b/src/java/org/apache/cassandra/db/compaction/writers/MajorLeveledCompactionWriter.java index b7fb88131c23..21f698ad9c76 100644 --- a/src/java/org/apache/cassandra/db/compaction/writers/MajorLeveledCompactionWriter.java +++ b/src/java/org/apache/cassandra/db/compaction/writers/MajorLeveledCompactionWriter.java @@ -39,7 +39,6 @@ public class MajorLeveledCompactionWriter extends CompactionAwareWriter private long totalWrittenInLevel = 0; private int sstablesWritten = 0; private final long keysPerSSTable; - private Directories.DataDirectory sstableDirectory; private final int levelFanoutSize; public MajorLeveledCompactionWriter(ColumnFamilyStore cfs, @@ -90,7 +89,7 @@ public boolean realAppend(UnfilteredRowIterator partition) @Override public void switchCompactionLocation(Directories.DataDirectory location) { - this.sstableDirectory = location; + sstableDirectory = location; averageEstimatedKeysPerSSTable = Math.round(((double) averageEstimatedKeysPerSSTable * sstablesWritten + partitionsWritten) / (sstablesWritten + 1)); sstableWriter.switchWriter(SSTableWriter.create(cfs.newSSTableDescriptor(getDirectories().getLocationForDisk(sstableDirectory)), keysPerSSTable, diff --git a/src/java/org/apache/cassandra/db/compaction/writers/MaxSSTableSizeWriter.java b/src/java/org/apache/cassandra/db/compaction/writers/MaxSSTableSizeWriter.java index df918bc82d76..bd4aee0473ad 100644 --- a/src/java/org/apache/cassandra/db/compaction/writers/MaxSSTableSizeWriter.java +++ b/src/java/org/apache/cassandra/db/compaction/writers/MaxSSTableSizeWriter.java @@ -36,7 +36,6 @@ public class MaxSSTableSizeWriter extends CompactionAwareWriter private final int level; private final long estimatedSSTables; private final Set allSSTables; - private Directories.DataDirectory sstableDirectory; public MaxSSTableSizeWriter(ColumnFamilyStore cfs, Directories directories, diff --git a/src/java/org/apache/cassandra/db/compaction/writers/SplittingSizeTieredCompactionWriter.java b/src/java/org/apache/cassandra/db/compaction/writers/SplittingSizeTieredCompactionWriter.java index 264e19c90b9e..5d3f1a6e0b6f 100644 --- a/src/java/org/apache/cassandra/db/compaction/writers/SplittingSizeTieredCompactionWriter.java +++ b/src/java/org/apache/cassandra/db/compaction/writers/SplittingSizeTieredCompactionWriter.java @@ -49,7 +49,6 @@ public class SplittingSizeTieredCompactionWriter extends CompactionAwareWriter private final Set allSSTables; private long currentBytesToWrite; private int currentRatioIndex = 0; - private Directories.DataDirectory location; public SplittingSizeTieredCompactionWriter(ColumnFamilyStore cfs, Directories directories, LifecycleTransaction txn, Set nonExpiredSSTables) { @@ -90,8 +89,9 @@ 
public boolean realAppend(UnfilteredRowIterator partition) if (sstableWriter.currentWriter().getEstimatedOnDiskBytesWritten() > currentBytesToWrite && currentRatioIndex < ratios.length - 1) // if we underestimate how many keys we have, the last sstable might get more than we expect { currentRatioIndex++; + currentBytesToWrite = getExpectedWriteSize(); - switchCompactionLocation(location); + switchCompactionLocation(sstableDirectory); logger.debug("Switching writer, currentBytesToWrite = {}", currentBytesToWrite); } return rie != null; @@ -100,7 +100,7 @@ public boolean realAppend(UnfilteredRowIterator partition) @Override public void switchCompactionLocation(Directories.DataDirectory location) { - this.location = location; + sstableDirectory = location; long currentPartitionsToWrite = Math.round(ratios[currentRatioIndex] * estimatedTotalKeys); @SuppressWarnings("resource") SSTableWriter writer = SSTableWriter.create(cfs.newSSTableDescriptor(getDirectories().getLocationForDisk(location)), diff --git a/src/java/org/apache/cassandra/db/virtual/SSTableTasksTable.java b/src/java/org/apache/cassandra/db/virtual/SSTableTasksTable.java index 488b5804948e..e2f38f8e9201 100644 --- a/src/java/org/apache/cassandra/db/virtual/SSTableTasksTable.java +++ b/src/java/org/apache/cassandra/db/virtual/SSTableTasksTable.java @@ -38,6 +38,7 @@ final class SSTableTasksTable extends AbstractVirtualTable private final static String SSTABLES = "sstables"; private final static String TOTAL = "total"; private final static String UNIT = "unit"; + private final static String TARGET_DIRECTORY = "target_directory"; SSTableTasksTable(String keyspace) { @@ -54,6 +55,7 @@ final class SSTableTasksTable extends AbstractVirtualTable .addRegularColumn(SSTABLES, Int32Type.instance) .addRegularColumn(TOTAL, LongType.instance) .addRegularColumn(UNIT, UTF8Type.instance) + .addRegularColumn(TARGET_DIRECTORY, UTF8Type.instance) .build()); } @@ -76,7 +78,8 @@ public DataSet data() .column(PROGRESS, completed) .column(SSTABLES, task.getSSTables().size()) .column(TOTAL, total) - .column(UNIT, task.getUnit().toString().toLowerCase()); + .column(UNIT, task.getUnit().toString().toLowerCase()) + .column(TARGET_DIRECTORY, task.targetDirectory()); } return result; diff --git a/src/java/org/apache/cassandra/index/sasi/SASIIndexBuilder.java b/src/java/org/apache/cassandra/index/sasi/SASIIndexBuilder.java index 57a3f5193aca..a99b9f6e3a94 100644 --- a/src/java/org/apache/cassandra/index/sasi/SASIIndexBuilder.java +++ b/src/java/org/apache/cassandra/index/sasi/SASIIndexBuilder.java @@ -51,6 +51,9 @@ class SASIIndexBuilder extends SecondaryIndexBuilder private final ColumnFamilyStore cfs; private final TimeUUID compactionId = nextTimeUUID(); + // Keep targetDirectory for compactions, needed for `nodetool compactionstats` + private String targetDirectory; + private final SortedMap> sstables; private long bytesProcessed = 0; @@ -78,6 +81,7 @@ public void build() try (RandomAccessReader dataFile = sstable.openDataReader()) { PerSSTableIndexWriter indexWriter = SASIIndex.newWriter(keyValidator, sstable.descriptor, indexes, OperationType.COMPACTION); + targetDirectory = indexWriter.getDescriptor().directory.path(); long previousKeyPosition = 0; try (KeyIterator keys = new KeyIterator(sstable.descriptor, cfs.metadata())) @@ -130,7 +134,8 @@ public CompactionInfo getCompactionInfo() bytesProcessed, totalSizeInBytes, compactionId, - sstables.keySet()); + sstables.keySet(), + targetDirectory); } private long getPrimaryIndexLength(SSTable sstable) diff --git 
a/src/java/org/apache/cassandra/tools/nodetool/CompactionStats.java b/src/java/org/apache/cassandra/tools/nodetool/CompactionStats.java index 799ef5668b2f..aedc8f225f87 100644 --- a/src/java/org/apache/cassandra/tools/nodetool/CompactionStats.java +++ b/src/java/org/apache/cassandra/tools/nodetool/CompactionStats.java @@ -91,7 +91,7 @@ public static void reportCompactionTable(List> compactions, l TableBuilder table = new TableBuilder(); if (vtableOutput) - table.add("keyspace", "table", "task id", "completion ratio", "kind", "progress", "sstables", "total", "unit"); + table.add("keyspace", "table", "task id", "completion ratio", "kind", "progress", "sstables", "total", "unit", "target directory"); else table.add("id", "compaction type", "keyspace", "table", "completed", "total", "unit", "progress"); @@ -110,7 +110,10 @@ public static void reportCompactionTable(List> compactions, l String percentComplete = total == 0 ? "n/a" : new DecimalFormat("0.00").format((double) completed / total * 100) + "%"; String id = c.get(CompactionInfo.COMPACTION_ID); if (vtableOutput) - table.add(keyspace, columnFamily, id, percentComplete, taskType, progressStr, String.valueOf(tables.length), totalStr, unit); + { + String targetDirectory = c.get(CompactionInfo.TARGET_DIRECTORY); + table.add(keyspace, columnFamily, id, percentComplete, taskType, progressStr, String.valueOf(tables.length), totalStr, unit, targetDirectory); + } else table.add(id, taskType, keyspace, columnFamily, progressStr, totalStr, unit, percentComplete); @@ -128,4 +131,4 @@ public static void reportCompactionTable(List> compactions, l } } -} +} \ No newline at end of file diff --git a/test/unit/org/apache/cassandra/db/lifecycle/RealTransactionsTest.java b/test/unit/org/apache/cassandra/db/lifecycle/RealTransactionsTest.java index 0420957b9dbd..71803887a55f 100644 --- a/test/unit/org/apache/cassandra/db/lifecycle/RealTransactionsTest.java +++ b/test/unit/org/apache/cassandra/db/lifecycle/RealTransactionsTest.java @@ -172,6 +172,7 @@ private SSTableReader replaceSSTable(ColumnFamilyStore cfs, LifecycleTransaction txn)); while (ci.hasNext()) { + ci.setTargetDirectory(rewriter.currentWriter().getFilename()); rewriter.append(ci.next()); if (nanoTime() - lastCheckObsoletion > TimeUnit.MINUTES.toNanos(1L)) diff --git a/test/unit/org/apache/cassandra/db/virtual/SSTableTasksTableTest.java b/test/unit/org/apache/cassandra/db/virtual/SSTableTasksTableTest.java index 356ae416dc43..17636d85bb8c 100644 --- a/test/unit/org/apache/cassandra/db/virtual/SSTableTasksTableTest.java +++ b/test/unit/org/apache/cassandra/db/virtual/SSTableTasksTableTest.java @@ -74,11 +74,14 @@ public void testSelectAll() throws Throwable List sstables = IntStream.range(0, 10) .mapToObj(i -> MockSchema.sstable(i, i * 10L, i * 10L + 9, cfs)) .collect(Collectors.toList()); + + String directory = String.format("/some/datadir/%s/%s-%s", cfs.metadata.keyspace, cfs.metadata.name, cfs.metadata.id.asUUID()); + CompactionInfo.Holder compactionHolder = new CompactionInfo.Holder() { public CompactionInfo getCompactionInfo() { - return new CompactionInfo(cfs.metadata(), OperationType.COMPACTION, bytesCompacted, bytesTotal, compactionId, sstables); + return new CompactionInfo(cfs.metadata(), OperationType.COMPACTION, bytesCompacted, bytesTotal, compactionId, sstables, directory); } public boolean isGlobal() @@ -91,7 +94,7 @@ public boolean isGlobal() UntypedResultSet result = execute("SELECT * FROM vts.sstable_tasks"); assertRows(result, row(CQLTester.KEYSPACE, currentTable(), compactionId, 
1.0 * bytesCompacted / bytesTotal, OperationType.COMPACTION.toString().toLowerCase(), bytesCompacted, sstables.size(), - bytesTotal, CompactionInfo.Unit.BYTES.toString())); + directory, bytesTotal, CompactionInfo.Unit.BYTES.toString())); CompactionManager.instance.active.finishCompaction(compactionHolder); result = execute("SELECT * FROM vts.sstable_tasks"); diff --git a/test/unit/org/apache/cassandra/tools/nodetool/CompactionStatsTest.java b/test/unit/org/apache/cassandra/tools/nodetool/CompactionStatsTest.java index 4ed1e000c5a5..a626daeae888 100644 --- a/test/unit/org/apache/cassandra/tools/nodetool/CompactionStatsTest.java +++ b/test/unit/org/apache/cassandra/tools/nodetool/CompactionStatsTest.java @@ -19,7 +19,7 @@ package org.apache.cassandra.tools.nodetool; import java.util.List; -import java.util.UUID; +import java.util.concurrent.atomic.AtomicReference; import java.util.stream.Collectors; import java.util.stream.IntStream; @@ -36,6 +36,7 @@ import org.apache.cassandra.tools.ToolRunner; import org.apache.cassandra.utils.TimeUUID; import org.assertj.core.api.Assertions; +import org.awaitility.Awaitility; import static org.apache.cassandra.utils.TimeUUID.Generator.nextTimeUUID; import static org.assertj.core.api.Assertions.assertThat; @@ -106,8 +107,8 @@ public void testCompactionStats() long bytesTotal = 123456; TimeUUID compactionId = nextTimeUUID(); List sstables = IntStream.range(0, 10) - .mapToObj(i -> MockSchema.sstable(i, i * 10L, i * 10L + 9, cfs)) - .collect(Collectors.toList()); + .mapToObj(i -> MockSchema.sstable(i, i * 10L, i * 10L + 9, cfs)) + .collect(Collectors.toList()); CompactionInfo.Holder compactionHolder = new CompactionInfo.Holder() { public CompactionInfo getCompactionInfo() @@ -122,21 +123,15 @@ public boolean isGlobal() }; CompactionManager.instance.active.beginCompaction(compactionHolder); - ToolRunner.ToolResult tool = ToolRunner.invokeNodetool("compactionstats"); - tool.assertOnCleanExit(); - String stdout = tool.getStdout(); - assertThat(stdout).contains("pending tasks: 1"); + String stdout = waitForNumberOfPendingTasks(1, "compactionstats"); Assertions.assertThat(stdout).containsPattern("id\\s+compaction type\\s+keyspace\\s+table\\s+completed\\s+total\\s+unit\\s+progress"); String expectedStatsPattern = String.format("%s\\s+%s\\s+%s\\s+%s\\s+%s\\s+%s\\s+%s\\s+%.2f%%", - compactionId, OperationType.COMPACTION, CQLTester.KEYSPACE, currentTable(), bytesCompacted, bytesTotal, - CompactionInfo.Unit.BYTES, (double) bytesCompacted / bytesTotal * 100); + compactionId, OperationType.COMPACTION, CQLTester.KEYSPACE, currentTable(), bytesCompacted, bytesTotal, + CompactionInfo.Unit.BYTES, (double) bytesCompacted / bytesTotal * 100); Assertions.assertThat(stdout).containsPattern(expectedStatsPattern); CompactionManager.instance.active.finishCompaction(compactionHolder); - tool = ToolRunner.invokeNodetool("compactionstats"); - tool.assertOnCleanExit(); - stdout = tool.getStdout(); - assertThat(stdout).contains("pending tasks: 0"); + waitForNumberOfPendingTasks(0, "compactionstats"); } @Test @@ -149,13 +144,27 @@ public void testCompactionStatsVtable() long bytesTotal = 123456; TimeUUID compactionId = nextTimeUUID(); List sstables = IntStream.range(0, 10) - .mapToObj(i -> MockSchema.sstable(i, i * 10L, i * 10L + 9, cfs)) - .collect(Collectors.toList()); + .mapToObj(i -> MockSchema.sstable(i, i * 10L, i * 10L + 9, cfs)) + .collect(Collectors.toList()); + String targetDirectory = "/some/dir/" + cfs.metadata.keyspace + '/' + cfs.metadata.name + '-' + 
cfs.metadata.id.asUUID(); CompactionInfo.Holder compactionHolder = new CompactionInfo.Holder() { public CompactionInfo getCompactionInfo() { - return new CompactionInfo(cfs.metadata(), OperationType.COMPACTION, bytesCompacted, bytesTotal, compactionId, sstables); + return new CompactionInfo(cfs.metadata(), OperationType.COMPACTION, bytesCompacted, bytesTotal, compactionId, sstables, targetDirectory); + }
+
+ public boolean isGlobal()
+ {
+ return false;
+ }
+ };
+
+ CompactionInfo.Holder nonCompactionHolder = new CompactionInfo.Holder()
+ {
+ public CompactionInfo getCompactionInfo()
+ {
+ return new CompactionInfo(cfs.metadata(), OperationType.CLEANUP, bytesCompacted, bytesTotal, compactionId, sstables);
+ }
+
+ public boolean isGlobal() @@ -165,21 +174,23 @@ };
 CompactionManager.instance.active.beginCompaction(compactionHolder);
- ToolRunner.ToolResult tool = ToolRunner.invokeNodetool("compactionstats", "-V");
- tool.assertOnCleanExit();
- String stdout = tool.getStdout();
- assertThat(stdout).contains("pending tasks: 1");
- Assertions.assertThat(stdout).containsPattern("keyspace\\s+table\\s+task id\\s+completion ratio\\s+kind\\s+progress\\s+sstables\\s+total\\s+unit");
- String expectedStatsPattern = String.format("%s\\s+%s\\s+%s\\s+%.2f%%\\s+%s\\s+%s\\s+%s\\s+%s\\s+%s",
- CQLTester.KEYSPACE, currentTable(), compactionId, (double) bytesCompacted / bytesTotal * 100,
- OperationType.COMPACTION, bytesCompacted, sstables.size(), bytesTotal, CompactionInfo.Unit.BYTES);
+ CompactionManager.instance.active.beginCompaction(nonCompactionHolder);
+ String stdout = waitForNumberOfPendingTasks(2, "compactionstats", "-V");
+ Assertions.assertThat(stdout).containsPattern("keyspace\\s+table\\s+task id\\s+completion ratio\\s+kind\\s+progress\\s+sstables\\s+total\\s+unit\\s+target directory");
+ String expectedStatsPattern = String.format("%s\\s+%s\\s+%s\\s+%.2f%%\\s+%s\\s+%s\\s+%s\\s+%s\\s+%s\\s+%s",
+ CQLTester.KEYSPACE, currentTable(), compactionId, (double) bytesCompacted / bytesTotal * 100,
+ OperationType.COMPACTION, bytesCompacted, sstables.size(), bytesTotal, CompactionInfo.Unit.BYTES,
+ targetDirectory);
 Assertions.assertThat(stdout).containsPattern(expectedStatsPattern);
+ String expectedStatsPatternForNonCompaction = String.format("%s\\s+%s\\s+%s\\s+%.2f%%\\s+%s\\s+%s\\s+%s\\s+%s\\s+%s",
+ CQLTester.KEYSPACE, currentTable(), compactionId, (double) bytesCompacted / bytesTotal * 100,
+ OperationType.CLEANUP, bytesCompacted, sstables.size(), bytesTotal, CompactionInfo.Unit.BYTES);
+ Assertions.assertThat(stdout).containsPattern(expectedStatsPatternForNonCompaction);
+
 CompactionManager.instance.active.finishCompaction(compactionHolder);
- tool = ToolRunner.invokeNodetool("compactionstats", "-V");
- tool.assertOnCleanExit();
- stdout = tool.getStdout();
- assertThat(stdout).contains("pending tasks: 0");
+ CompactionManager.instance.active.finishCompaction(nonCompactionHolder);
+ waitForNumberOfPendingTasks(0, "compactionstats", "-V");
 }
 
 @Test
@@ -192,8 +203,8 @@ public void testCompactionStatsHumanReadable()
 long bytesTotal = 123456;
 TimeUUID compactionId = nextTimeUUID();
 List sstables = IntStream.range(0, 10)
- .mapToObj(i -> MockSchema.sstable(i, i * 10L, i * 10L + 9, cfs))
- .collect(Collectors.toList());
+ .mapToObj(i -> MockSchema.sstable(i, i * 10L, i * 10L + 9, cfs))
+ .collect(Collectors.toList());
 CompactionInfo.Holder compactionHolder = new CompactionInfo.Holder()
 {
 public CompactionInfo getCompactionInfo()
@@ -208,21 +219,15 @@ public boolean isGlobal()
 };
CompactionManager.instance.active.beginCompaction(compactionHolder); - ToolRunner.ToolResult tool = ToolRunner.invokeNodetool("compactionstats", "--human-readable"); - tool.assertOnCleanExit(); - String stdout = tool.getStdout(); - assertThat(stdout).contains("pending tasks: 1"); + String stdout = waitForNumberOfPendingTasks(1, "compactionstats", "--human-readable"); Assertions.assertThat(stdout).containsPattern("id\\s+compaction type\\s+keyspace\\s+table\\s+completed\\s+total\\s+unit\\s+progress"); String expectedStatsPattern = String.format("%s\\s+%s\\s+%s\\s+%s\\s+%s\\s+%s\\s+%s\\s+%.2f%%", - compactionId, OperationType.COMPACTION, CQLTester.KEYSPACE, currentTable(), "123 bytes", "120.56 KiB", - CompactionInfo.Unit.BYTES, (double) bytesCompacted / bytesTotal * 100); + compactionId, OperationType.COMPACTION, CQLTester.KEYSPACE, currentTable(), "123 bytes", "120.56 KiB", + CompactionInfo.Unit.BYTES, (double) bytesCompacted / bytesTotal * 100); Assertions.assertThat(stdout).containsPattern(expectedStatsPattern); CompactionManager.instance.active.finishCompaction(compactionHolder); - tool = ToolRunner.invokeNodetool("compactionstats", "--human-readable"); - tool.assertOnCleanExit(); - stdout = tool.getStdout(); - assertThat(stdout).contains("pending tasks: 0"); + waitForNumberOfPendingTasks(0, "compactionstats", "--human-readable"); } @Test @@ -235,13 +240,27 @@ public void testCompactionStatsVtableHumanReadable() long bytesTotal = 123456; TimeUUID compactionId = nextTimeUUID(); List sstables = IntStream.range(0, 10) - .mapToObj(i -> MockSchema.sstable(i, i * 10L, i * 10L + 9, cfs)) - .collect(Collectors.toList()); + .mapToObj(i -> MockSchema.sstable(i, i * 10L, i * 10L + 9, cfs)) + .collect(Collectors.toList()); + String targetDirectory = "/some/dir/" + cfs.metadata.keyspace + '/' + cfs.metadata.name + '-' + cfs.metadata.id.asUUID(); CompactionInfo.Holder compactionHolder = new CompactionInfo.Holder() { public CompactionInfo getCompactionInfo() { - return new CompactionInfo(cfs.metadata(), OperationType.COMPACTION, bytesCompacted, bytesTotal, compactionId, sstables); + return new CompactionInfo(cfs.metadata(), OperationType.COMPACTION, bytesCompacted, bytesTotal, compactionId, sstables, targetDirectory); + } + + public boolean isGlobal() + { + return false; + } + }; + + CompactionInfo.Holder nonCompactionHolder = new CompactionInfo.Holder() + { + public CompactionInfo getCompactionInfo() + { + return new CompactionInfo(cfs.metadata(), OperationType.CLEANUP, bytesCompacted, bytesTotal, compactionId, sstables); } public boolean isGlobal() @@ -251,20 +270,35 @@ public boolean isGlobal() }; CompactionManager.instance.active.beginCompaction(compactionHolder); - ToolRunner.ToolResult tool = ToolRunner.invokeNodetool("compactionstats", "--vtable", "--human-readable"); - tool.assertOnCleanExit(); - String stdout = tool.getStdout(); - assertThat(stdout).contains("pending tasks: 1"); - Assertions.assertThat(stdout).containsPattern("keyspace\\s+table\\s+task id\\s+completion ratio\\s+kind\\s+progress\\s+sstables\\s+total\\s+unit"); - String expectedStatsPattern = String.format("%s\\s+%s\\s+%s\\s+%.2f%%\\s+%s\\s+%s\\s+%s\\s+%s\\s+%s", - CQLTester.KEYSPACE, currentTable(), compactionId, (double) bytesCompacted / bytesTotal * 100, - OperationType.COMPACTION, "123 bytes", sstables.size(), "120.56 KiB", CompactionInfo.Unit.BYTES); + CompactionManager.instance.active.beginCompaction(nonCompactionHolder); + String stdout = waitForNumberOfPendingTasks(2, "compactionstats", "--vtable", "--human-readable"); + 
Assertions.assertThat(stdout).containsPattern("keyspace\\s+table\\s+task id\\s+completion ratio\\s+kind\\s+progress\\s+sstables\\s+total\\s+unit\\s+target directory"); + String expectedStatsPattern = String.format("%s\\s+%s\\s+%s\\s+%.2f%%\\s+%s\\s+%s\\s+%s\\s+%s\\s+%s\\s+%s", + CQLTester.KEYSPACE, currentTable(), compactionId, (double) bytesCompacted / bytesTotal * 100, + OperationType.COMPACTION, "123 bytes", sstables.size(), "120.56 KiB", CompactionInfo.Unit.BYTES, + targetDirectory); Assertions.assertThat(stdout).containsPattern(expectedStatsPattern); + String expectedStatsPatternForNonCompaction = String.format("%s\\s+%s\\s+%s\\s+%.2f%%\\s+%s\\s+%s\\s+%s\\s+%s\\s+%s", + CQLTester.KEYSPACE, currentTable(), compactionId, (double) bytesCompacted / bytesTotal * 100, + OperationType.CLEANUP, "123 bytes", sstables.size(), "120.56 KiB", CompactionInfo.Unit.BYTES); + Assertions.assertThat(stdout).containsPattern(expectedStatsPatternForNonCompaction); CompactionManager.instance.active.finishCompaction(compactionHolder); - tool = ToolRunner.invokeNodetool("compactionstats", "--vtable", "--human-readable"); - tool.assertOnCleanExit(); - stdout = tool.getStdout(); - assertThat(stdout).contains("pending tasks: 0"); + CompactionManager.instance.active.finishCompaction(nonCompactionHolder); + waitForNumberOfPendingTasks(0, "compactionstats", "--vtable", "--human-readable"); + } + + private String waitForNumberOfPendingTasks(int pendingTasksToWaitFor, String... args) + { + AtomicReference stdout = new AtomicReference<>(); + Awaitility.await().until(() -> { + ToolRunner.ToolResult tool = ToolRunner.invokeNodetool(args); + tool.assertOnCleanExit(); + String output = tool.getStdout(); + stdout.set(output); + return output.contains("pending tasks: " + pendingTasksToWaitFor); + }); + + return stdout.get(); } -} +} \ No newline at end of file From 1c714e43e6bad82ca24e095385a24fe9b33dd4f4 Mon Sep 17 00:00:00 2001 From: Josh McKenzie Date: Mon, 22 Aug 2022 14:16:30 -0400 Subject: [PATCH 052/159] Log anticompaction cancellation at INFO level Patch by Marcus Eriksson; reviewed by Caleb Rackliffe, David Capwell, and Josh McKenzie for CASSANDRA-17841 Co-authored-by: Marcus Eriksson Co-authored-by: Josh McKenzie --- .../cassandra/db/compaction/CompactionManager.java | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/src/java/org/apache/cassandra/db/compaction/CompactionManager.java b/src/java/org/apache/cassandra/db/compaction/CompactionManager.java index c843adfda0fc..5906ac294af5 100644 --- a/src/java/org/apache/cassandra/db/compaction/CompactionManager.java +++ b/src/java/org/apache/cassandra/db/compaction/CompactionManager.java @@ -1756,10 +1756,17 @@ else if (transChecker.test(token)) } catch (Throwable e) { - if (e instanceof CompactionInterruptedException && isCancelled.getAsBoolean()) + if (e instanceof CompactionInterruptedException) { - logger.info("Anticompaction has been canceled for session {}", pendingRepair); - logger.trace(e.getMessage(), e); + if (isCancelled.getAsBoolean()) + { + logger.info("Anticompaction has been canceled for session {}", pendingRepair); + logger.trace(e.getMessage(), e); + } + else + { + logger.info("Anticompaction for session {} has been stopped by request.", pendingRepair); + } } else { From 0e855c4b7c157b7ba63bb7377bc441260d76556f Mon Sep 17 00:00:00 2001 From: Josh McKenzie Date: Mon, 22 Aug 2022 14:28:50 -0400 Subject: [PATCH 053/159] Fix potential out of range exception on column index downsampling Patch by Marcus Eriksson; reviewed by 
Josh McKenzie, Jon Meredith, and Caleb Rackliffe for CASSANDRA-17839 Co-authored-by: Marcus Eriksson Co-authored-by: Josh McKenzie --- CHANGES.txt | 1 + .../io/sstable/format/big/BigTableWriter.java | 14 +++++++++++++- .../io/sstable/metadata/MetadataCollector.java | 5 +++-- 3 files changed, 17 insertions(+), 3 deletions(-) diff --git a/CHANGES.txt b/CHANGES.txt index 43b68541db6a..84975ef1b932 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -1,4 +1,5 @@ 4.2 + * Fix potential out of range exception on column index downsampling (CASSANDRA-17839) * Introduce target directory to vtable output for sstable_tasks and for compactionstats (CASSANDRA-13010) * Read/Write/Truncate throw RequestFailure in a race condition with callback timeouts, should return Timeout instead (CASSANDRA-17828) * Add ability to log load profiles at fixed intervals (CASSANDRA-17821) diff --git a/src/java/org/apache/cassandra/io/sstable/format/big/BigTableWriter.java b/src/java/org/apache/cassandra/io/sstable/format/big/BigTableWriter.java index e8dff32fbc9a..0adb9df2277c 100644 --- a/src/java/org/apache/cassandra/io/sstable/format/big/BigTableWriter.java +++ b/src/java/org/apache/cassandra/io/sstable/format/big/BigTableWriter.java @@ -356,7 +356,19 @@ public SSTableReader openEarly() ifile = iwriter.builder.bufferSize(indexBufferSize).complete(boundary.indexLength); if (compression) dbuilder.withCompressionMetadata(((CompressedSequentialWriter) dataFile).open(boundary.dataLength)); - int dataBufferSize = optimizationStrategy.bufferSize(stats.estimatedPartitionSize.percentile(DatabaseDescriptor.getDiskOptimizationEstimatePercentile())); + + EstimatedHistogram partitionSizeHistogram = stats.estimatedPartitionSize; + + if (partitionSizeHistogram.isOverflowed()) + { + logger.warn("Estimated partition size histogram for '{}' is overflowed ({} values greater than {}). " + + "Clearing the overflow bucket to allow for degraded mean and percentile calculations...", + descriptor, partitionSizeHistogram.overflowCount(), partitionSizeHistogram.getLargestBucketOffset()); + + partitionSizeHistogram.clearOverflow(); + } + + int dataBufferSize = optimizationStrategy.bufferSize(partitionSizeHistogram.percentile(DatabaseDescriptor.getDiskOptimizationEstimatePercentile())); dfile = dbuilder.bufferSize(dataBufferSize).complete(boundary.dataLength); invalidateCacheAtBoundary(dfile); sstable = SSTableReader.internalOpen(descriptor, diff --git a/src/java/org/apache/cassandra/io/sstable/metadata/MetadataCollector.java b/src/java/org/apache/cassandra/io/sstable/metadata/MetadataCollector.java index 1375331ce567..4786a1cbbc37 100644 --- a/src/java/org/apache/cassandra/io/sstable/metadata/MetadataCollector.java +++ b/src/java/org/apache/cassandra/io/sstable/metadata/MetadataCollector.java @@ -58,8 +58,9 @@ static EstimatedHistogram defaultCellPerPartitionCountHistogram() static EstimatedHistogram defaultPartitionSizeHistogram() { - // EH of 150 can track a max value of 1697806495183, i.e., > 1.5PB - return new EstimatedHistogram(150); + // EH of 155 can track a max value of 3520571548412 i.e. 
3.5TB + return new EstimatedHistogram(155); + } static TombstoneHistogram defaultTombstoneDropTimeHistogram() From d0b9532f2b87a17a0508d0637556f2f3e8d0fd94 Mon Sep 17 00:00:00 2001 From: Josh McKenzie Date: Mon, 22 Aug 2022 15:04:19 -0400 Subject: [PATCH 054/159] Add the ability for operators to loosen the definition of "empty" for edge cases Patch by David Capwell; reviewed by Josh McKenzie, Yifan Cai, and Sam Tunnicliffe for CASSANDRA-17842 Co-authored-by: David Capwell Co-authored-by: Josh McKenzie --- CHANGES.txt | 1 + NEWS.txt | 18 +++++++++++++----- .../config/CassandraRelevantProperties.java | 3 +++ .../apache/cassandra/gms/EndpointState.java | 15 ++++++++++++++- .../org/apache/cassandra/gms/Gossiper.java | 13 +++++++++++++ .../apache/cassandra/gms/GossiperMBean.java | 3 +++ 6 files changed, 47 insertions(+), 6 deletions(-) diff --git a/CHANGES.txt b/CHANGES.txt index 84975ef1b932..dee8a5e74176 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -1,4 +1,5 @@ 4.2 + * Add the ability for operators to loosen the definition of "empty" for edge cases (CASSANDRA-17842) * Fix potential out of range exception on column index downsampling (CASSANDRA-17839) * Introduce target directory to vtable output for sstable_tasks and for compactionstats (CASSANDRA-13010) * Read/Write/Truncate throw RequestFailure in a race condition with callback timeouts, should return Timeout instead (CASSANDRA-17828) diff --git a/NEWS.txt b/NEWS.txt index fe87f0cd7858..b488acbf201b 100644 --- a/NEWS.txt +++ b/NEWS.txt @@ -77,12 +77,20 @@ New features - It is possible to list ephemeral snapshots by nodetool listsnaphots command when flag "-e" is specified. - Added a new flag to `nodetool profileload` and JMX endpoint to set up recurring profile load generation on specified intervals (see CASSANDRA-17821) + - Added a new property, gossiper.loose_empty_enabled, to allow for a looser definition of "empty" when + considering the heartbeat state of another node in Gossip. This should only be used by knowledgeable + operators in the following scenarios: + + Currently "empty" w/regards to heartbeat state in Gossip is very specific to a single edge case (i.e. in + isEmptyWithoutStatus() our usage of hbState() + applicationState), however there are other failure cases which + block host replacements and require intrusive workarounds and human intervention to recover from when you + have something in hbState() you don't expect. See CASSANDRA-17842 for further details. Upgrading --------- - Ephemeral marker files for snapshots done by repairs are not created anymore, - there is a dedicated flag in snapshot manifest instead. On upgrade of a node to version 4.2, on node's start, in case there - are such ephemeral snapshots on disk, they will be deleted (same behaviour as before) and any new ephemeral snapshots + there is a dedicated flag in snapshot manifest instead. On upgrade of a node to version 4.2, on node's start, in case there + are such ephemeral snapshots on disk, they will be deleted (same behaviour as before) and any new ephemeral snapshots will stop to create ephemeral marker files as flag in a snapshot manifest was introduced instead. Deprecation @@ -427,7 +435,7 @@ Upgrading - Native protocol v5 is promoted from beta in this release. The wire format has changed significantly and users should take care to ensure client drivers are upgraded to a version with support for the final v5 format, if currently connecting over v5-beta. 
(CASSANDRA-15299, CASSANDRA-14973) - - Cassandra removed support for the OldNetworkTopologyStrategy. Before upgrading you will need to change the + - Cassandra removed support for the OldNetworkTopologyStrategy. Before upgrading you will need to change the replication strategy for the keyspaces using this strategy to the NetworkTopologyStrategy. (CASSANDRA-13990) - Sstables for tables using with a frozen UDT written by C* 3.0 appear as corrupted. @@ -625,7 +633,7 @@ Upgrading reason, a opt-in system property has been added to disable the fix: -Dcassandra.unsafe.disable-serial-reads-linearizability=true Use this flag at your own risk as it revert SERIAL reads to the incorrect behavior of - previous versions. See CASSANDRA-12126 for details. + previous versions. See CASSANDRA-12126 for details. - SASI's `max_compaction_flush_memory_in_mb` setting was previously getting interpreted in bytes. From 3.11.8 it is correctly interpreted in megabytes, but prior to 3.11.10 previous configurations of this setting will lead to nodes OOM during compaction. From 3.11.10 previous configurations will be detected as incorrect, @@ -722,7 +730,7 @@ Compact Storage (only when upgrading from 3.X or any version lower than 3.0.15) Starting version 5.0, COMPACT STORAGE will no longer be supported. 'ALTER ... DROP COMPACT STORAGE' statement makes Compact Tables CQL-compatible, exposing internal structure of Thrift/Compact Tables. You can find more details - on exposed internal structure under: + on exposed internal structure under: http://cassandra.apache.org/doc/latest/cql/appendices.html#appendix-c-dropping-compact-storage For uninterrupted cluster upgrades, drivers now support 'NO_COMPACT' startup option. diff --git a/src/java/org/apache/cassandra/config/CassandraRelevantProperties.java b/src/java/org/apache/cassandra/config/CassandraRelevantProperties.java index 00c2f4cd28be..81f9a64418d0 100644 --- a/src/java/org/apache/cassandra/config/CassandraRelevantProperties.java +++ b/src/java/org/apache/cassandra/config/CassandraRelevantProperties.java @@ -295,6 +295,9 @@ public enum CassandraRelevantProperties // for specific tests ORG_APACHE_CASSANDRA_CONF_CASSANDRA_RELEVANT_PROPERTIES_TEST("org.apache.cassandra.conf.CassandraRelevantPropertiesTest"), ORG_APACHE_CASSANDRA_DB_VIRTUAL_SYSTEM_PROPERTIES_TABLE_TEST("org.apache.cassandra.db.virtual.SystemPropertiesTableTest"), + + // Loosen the definition of "empty" for gossip state, for use during host replacements if things go awry + LOOSE_DEF_OF_EMPTY_ENABLED(Config.PROPERTY_PREFIX + "gossiper.loose_empty_enabled"); ; diff --git a/src/java/org/apache/cassandra/gms/EndpointState.java b/src/java/org/apache/cassandra/gms/EndpointState.java index c60a4793bd82..69684e4b6795 100644 --- a/src/java/org/apache/cassandra/gms/EndpointState.java +++ b/src/java/org/apache/cassandra/gms/EndpointState.java @@ -29,6 +29,7 @@ import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import org.apache.cassandra.config.CassandraRelevantProperties; import org.apache.cassandra.db.TypeSizes; import org.apache.cassandra.io.IVersionedSerializer; import org.apache.cassandra.io.util.DataInputPlus; @@ -46,6 +47,8 @@ public class EndpointState { protected static final Logger logger = LoggerFactory.getLogger(EndpointState.class); + static volatile boolean LOOSE_DEF_OF_EMPTY_ENABLED = CassandraRelevantProperties.LOOSE_DEF_OF_EMPTY_ENABLED.getBoolean(); + public final static IVersionedSerializer serializer = new EndpointStateSerializer(); public final static IVersionedSerializer nullableSerializer = 
diff --git a/src/java/org/apache/cassandra/gms/EndpointState.java b/src/java/org/apache/cassandra/gms/EndpointState.java
index c60a4793bd82..69684e4b6795 100644
--- a/src/java/org/apache/cassandra/gms/EndpointState.java
+++ b/src/java/org/apache/cassandra/gms/EndpointState.java
@@ -29,6 +29,7 @@
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;

+import org.apache.cassandra.config.CassandraRelevantProperties;
 import org.apache.cassandra.db.TypeSizes;
 import org.apache.cassandra.io.IVersionedSerializer;
 import org.apache.cassandra.io.util.DataInputPlus;
@@ -46,6 +47,8 @@ public class EndpointState
 {
     protected static final Logger logger = LoggerFactory.getLogger(EndpointState.class);

+    static volatile boolean LOOSE_DEF_OF_EMPTY_ENABLED = CassandraRelevantProperties.LOOSE_DEF_OF_EMPTY_ENABLED.getBoolean();
+
     public final static IVersionedSerializer serializer = new EndpointStateSerializer();
     public final static IVersionedSerializer nullableSerializer = NullableSerializer.wrap(serializer);
@@ -202,7 +205,17 @@ void markDead()
     public boolean isEmptyWithoutStatus()
     {
         Map state = applicationState.get();
-        return hbState.isEmpty() && !(state.containsKey(ApplicationState.STATUS_WITH_PORT) || state.containsKey(ApplicationState.STATUS));
+        boolean hasStatus = state.containsKey(ApplicationState.STATUS_WITH_PORT) || state.containsKey(ApplicationState.STATUS);
+        return hbState.isEmpty() && !hasStatus
+               // In the very specific case where hbState.isEmpty and STATUS is missing, it is known to be safe to "fake"
+               // the data, as this happens when the gossip state isn't coming from the node but instead from a peer who
+               // restarted and is missing the node's state.
+               //
+               // When hbState is not empty, then the node gossiped an empty STATUS; this happens during bootstrap and it's not
+               // possible to tell if this is ok or not (we can't really tell if the node is dead or having networking issues).
+               // For these cases, allow an external actor to verify and inform Cassandra that it is safe - this is done by
+               // updating the LOOSE_DEF_OF_EMPTY_ENABLED field.
+               || (LOOSE_DEF_OF_EMPTY_ENABLED && !hasStatus);
     }

     public boolean isRpcReady()
diff --git a/src/java/org/apache/cassandra/gms/Gossiper.java b/src/java/org/apache/cassandra/gms/Gossiper.java
index 1d6a597c43c7..d0fab0cac50f 100644
--- a/src/java/org/apache/cassandra/gms/Gossiper.java
+++ b/src/java/org/apache/cassandra/gms/Gossiper.java
@@ -2513,6 +2513,19 @@ private CassandraVersion computeMinVersion()
         return minVersion;
     }

+    @Override
+    public boolean getLooseEmptyEnabled()
+    {
+        return EndpointState.LOOSE_DEF_OF_EMPTY_ENABLED;
+    }
+
+    @Override
+    public void setLooseEmptyEnabled(boolean enabled)
+    {
+        logger.info("Setting loose definition of empty to {}", enabled);
+        EndpointState.LOOSE_DEF_OF_EMPTY_ENABLED = enabled;
+    }
+
     public void unsafeSetEnabled()
     {
         scheduledGossipTask = new NotScheduledFuture<>();
diff --git a/src/java/org/apache/cassandra/gms/GossiperMBean.java b/src/java/org/apache/cassandra/gms/GossiperMBean.java
index 47d7207ef86e..2d59e37f2d56 100644
--- a/src/java/org/apache/cassandra/gms/GossiperMBean.java
+++ b/src/java/org/apache/cassandra/gms/GossiperMBean.java
@@ -38,4 +38,7 @@ public interface GossiperMBean
     /** Returns each node's database release version */
     public Map> getReleaseVersionsWithPort();

+    public boolean getLooseEmptyEnabled();
+
+    public void setLooseEmptyEnabled(boolean enabled);
 }

From 17810295ca3b05b011a0ff7061d27435b531ea32 Mon Sep 17 00:00:00 2001
From: Stefan Miklosovic
Date: Tue, 23 Aug 2022 19:53:16 +0200
Subject: [PATCH 055/159] fix StandaloneUpgraderOnSStablesTest

This is a follow-up to CASSANDRA-13010, where a bug was introduced in
Upgrader which made StandaloneUpgraderOnSStablesTest flaky.
patch by Stefan Miklosovic; reviewed by Brandon Williams for CASSANDRA-17849 --- src/java/org/apache/cassandra/db/compaction/Upgrader.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/java/org/apache/cassandra/db/compaction/Upgrader.java b/src/java/org/apache/cassandra/db/compaction/Upgrader.java index d1a367711835..2bfb30b4e400 100644 --- a/src/java/org/apache/cassandra/db/compaction/Upgrader.java +++ b/src/java/org/apache/cassandra/db/compaction/Upgrader.java @@ -93,8 +93,8 @@ public void upgrade(boolean keepOriginals) CompactionIterator iter = new CompactionIterator(transaction.opType(), scanners.scanners, controller, nowInSec, nextTimeUUID())) { writer.switchWriter(createCompactionWriter(sstable.getSSTableMetadata())); + iter.setTargetDirectory(writer.currentWriter().getFilename()); while (iter.hasNext()) - iter.setTargetDirectory(writer.currentWriter().getFilename()); writer.append(iter.next()); writer.finish(); From dbadd08585e78fbfc0c2f26575fd6ce68e8d7ba2 Mon Sep 17 00:00:00 2001 From: Ekaterina Dimitrova Date: Thu, 18 Aug 2022 15:14:56 -0400 Subject: [PATCH 056/159] Update ASM(9.1 to 9.3), Mockito(1.10.10 to 1.12.13) and ByteBuddy(3.2.4 to 4.7.0) patch by Ekaterina Dimitrova; reviewed by Brandon Williams for CASSANDRA-17835 --- CHANGES.txt | 1 + build.xml | 6 +++--- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/CHANGES.txt b/CHANGES.txt index dee8a5e74176..1375729e7614 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -1,4 +1,5 @@ 4.2 + * Update ASM(9.1 to 9.3), Mockito(1.10.10 to 1.12.13) and ByteBuddy(3.2.4 to 4.7.0) (CASSANDRA-17835) * Add the ability for operators to loosen the definition of "empty" for edge cases (CASSANDRA-17842) * Fix potential out of range exception on column index downsampling (CASSANDRA-17839) * Introduce target directory to vtable output for sstable_tasks and for compactionstats (CASSANDRA-13010) diff --git a/build.xml b/build.xml index 32b3fec87bce..ba9b81d4bc1d 100644 --- a/build.xml +++ b/build.xml @@ -134,9 +134,9 @@ - + - + @@ -550,7 +550,7 @@ - + From 1e27ffc6aded5b54b0207b114b154cffbc64dda5 Mon Sep 17 00:00:00 2001 From: Abe Ratnofsky Date: Wed, 24 Aug 2022 13:23:09 -0700 Subject: [PATCH 057/159] Remove dependency on Maven Ant Tasks patch by Abe Ratnofsky; reviewed by David Capwell, Michael Semb Wever for CASSANDRA-17750 --- .build/build-resolver.xml | 5 +- .build/cassandra-build-deps-template.xml | 127 +++ .build/cassandra-deps-template.xml | 344 ++++++++ .build/parent-pom-template.xml | 1029 ++++++++++++++++++++++ CHANGES.txt | 1 + build.xml | 492 +---------- 6 files changed, 1533 insertions(+), 465 deletions(-) create mode 100644 .build/cassandra-build-deps-template.xml create mode 100644 .build/cassandra-deps-template.xml create mode 100644 .build/parent-pom-template.xml diff --git a/.build/build-resolver.xml b/.build/build-resolver.xml index f17ad233969a..84311ade1715 100644 --- a/.build/build-resolver.xml +++ b/.build/build-resolver.xml @@ -175,7 +175,8 @@ - + + @@ -188,7 +189,7 @@ - + diff --git a/.build/cassandra-build-deps-template.xml b/.build/cassandra-build-deps-template.xml new file mode 100644 index 000000000000..727da9179a5c --- /dev/null +++ b/.build/cassandra-build-deps-template.xml @@ -0,0 +1,127 @@ + + + + 4.0.0 + + cassandra-parent + org.apache.cassandra + @version@ + @final.name@-parent.pom + + org.apache.cassandra + cassandra-build-deps + @version@ + + + junit + junit + + + commons-io + commons-io + + + org.mockito + mockito-core + + + org.ow2.asm + asm + + + org.ow2.asm + asm-tree + + + 
org.ow2.asm + asm-commons + + + org.ow2.asm + asm-util + + + com.google.jimfs + jimfs + + + com.puppycrawl.tools + checkstyle + + + org.quicktheories + quicktheories + + + org.reflections + reflections + + + com.google.code.java-allocation-instrumenter + java-allocation-instrumenter + + + org.apache.cassandra + dtest-api + + + org.openjdk.jmh + jmh-core + + + org.openjdk.jmh + jmh-generator-annprocess + + + net.ju-n.compile-command-annotations + compile-command-annotations + + + org.apache.ant + ant-junit + + + org.apache.cassandra + harry-core + + + org.junit + junit-bom + pom + + + org.awaitility + awaitility + + + org.hamcrest + hamcrest + + + org.jacoco + org.jacoco.agent + + + org.jacoco + org.jacoco.ant + + + com.fasterxml.jackson.dataformat + jackson-dataformat-yaml + + + diff --git a/.build/cassandra-deps-template.xml b/.build/cassandra-deps-template.xml new file mode 100644 index 000000000000..513fcbc5aa02 --- /dev/null +++ b/.build/cassandra-deps-template.xml @@ -0,0 +1,344 @@ + + + + 4.0.0 + + cassandra-parent + org.apache.cassandra + @version@ + @final.name@-parent.pom + + org.apache.cassandra + cassandra-all + @version@ + Apache Cassandra + The Apache Cassandra Project develops a highly scalable second-generation distributed database, bringing together Dynamo's fully distributed design and Bigtable's ColumnFamily-based data model. + https://cassandra.apache.org + 2009 + + + The Apache Software License, Version 2.0 + https://www.apache.org/licenses/LICENSE-2.0.txt + + + + scm:https://gitbox.apache.org/repos/asf/cassandra.git + scm:https://gitbox.apache.org/repos/asf/cassandra.git + https://gitbox.apache.org/repos/asf?p=cassandra.git;a=tree + + + + org.xerial.snappy + snappy-java + + + org.lz4 + lz4-java + + + com.ning + compress-lzf + + + com.google.guava + guava + + + commons-cli + commons-cli + + + commons-codec + commons-codec + + + org.apache.commons + commons-lang3 + + + org.apache.commons + commons-math3 + + + org.antlr + antlr + + + org.antlr + ST4 + + + org.antlr + antlr-runtime + + + org.slf4j + slf4j-api + + + org.slf4j + log4j-over-slf4j + + + org.slf4j + jcl-over-slf4j + + + com.fasterxml.jackson.core + jackson-core + + + com.fasterxml.jackson.core + jackson-databind + + + com.fasterxml.jackson.core + jackson-annotations + + + com.fasterxml.jackson.datatype + jackson-datatype-jsr310 + + + com.googlecode.json-simple + json-simple + + + com.boundary + high-scale-lib + + + org.yaml + snakeyaml + + + org.mindrot + jbcrypt + + + io.airlift + airline + + + io.dropwizard.metrics + metrics-core + + + io.dropwizard.metrics + metrics-jvm + + + io.dropwizard.metrics + metrics-logback + + + com.addthis.metrics + reporter-config3 + + + com.clearspring.analytics + stream + + + ch.qos.logback + logback-core + + + ch.qos.logback + logback-classic + + + org.apache.hadoop + hadoop-core + + + org.apache.hadoop + hadoop-minicluster + + + com.datastax.cassandra + cassandra-driver-core + shaded + + + net.java.dev.jna + jna + + + com.github.jbellis + jamm + + + io.netty + netty-bom + pom + + + io.netty + netty-all + + + net.openhft + chronicle-queue + + + net.openhft + chronicle-core + + + net.openhft + chronicle-bytes + + + net.openhft + chronicle-wire + + + net.openhft + chronicle-threads + + + org.fusesource + sigar + + + org.eclipse.jdt.core.compiler + ecj + + + org.caffinitas.ohc + ohc-core + + + org.caffinitas.ohc + ohc-core-j8 + + + com.github.ben-manes.caffeine + caffeine + + + org.jctools + jctools-core + + + org.ow2.asm + asm + + + com.carrotsearch + hppc + + + 
org.gridkit.jvmtool + sjk-cli + 0.14 + + + org.gridkit.jvmtool + sjk-core + + + org.gridkit.jvmtool + sjk-stacktrace + + + org.gridkit.jvmtool + mxdump + + + org.gridkit.lab + jvm-attach-api + + + com.beust + jcommander + + + org.gridkit.jvmtool + sjk-json + + + com.github.luben + zstd-jni + + + org.psjava + psjava + + + io.netty + netty-tcnative-boringssl-static + + + javax.inject + javax.inject + + + com.google.j2objc + j2objc-annotations + + + org.hdrhistogram + HdrHistogram + + + de.jflex + jflex + + + com.github.rholder + snowball-stemmer + + + com.googlecode.concurrent-trees + concurrent-trees + + + com.google.code.findbugs + jsr305 + + + net.ju-n.compile-command-annotations + compile-command-annotations + + + org.assertj + assertj-core + + + org.jboss.byteman + byteman-install + + + org.jboss.byteman + byteman + + + org.jboss.byteman + byteman-submit + + + org.jboss.byteman + byteman-bmunit + + + com.github.seancfoley + ipaddress + + + diff --git a/.build/parent-pom-template.xml b/.build/parent-pom-template.xml new file mode 100644 index 000000000000..73a6431408b1 --- /dev/null +++ b/.build/parent-pom-template.xml @@ -0,0 +1,1029 @@ + + + + 4.0.0 + + apache + org.apache + 22 + + org.apache.cassandra + cassandra-parent + @version@ + pom + Apache Cassandra + The Apache Cassandra Project develops a highly scalable second-generation distributed database, bringing together Dynamo's fully distributed design and Bigtable's ColumnFamily-based data model. + https://cassandra.apache.org + 2009 + + + The Apache Software License, Version 2.0 + https://www.apache.org/licenses/LICENSE-2.0.txt + + + + 1.12.13 + 4.0.6 + 0.5.1 + + + @asm.version@ + @jamm.version@ + @allocation-instrumenter.version@ + @ecj.version@ + @jacoco.version@ + @jflex.version@ + + + + adelapena + Andres de la Peña + + + alakshman + Avinash Lakshman + + + aleksey + Aleksey Yeschenko + + + amorton + Aaron Morton + + + aweisberg + Ariel Weisberg + + + bdeggleston + Blake Eggleston + + + benedict + Benedict Elliott Smith + + + benjamin + Benjamin Lerer + + + blambov + Branimir Lambov + + + brandonwilliams + Brandon Williams + + + carl + Carl Yeksigian + + + dbrosius + David Brosiusd + + + dikang + Dikang Gu + + + eevans + Eric Evans + + + edimitrova + Ekaterina Dimitrova + + + gdusbabek + Gary Dusbabek + + + goffinet + Chris Goffinet + + + ifesdjeen + Alex Petrov + + + jaakko + Laine Jaakko Olavi + + + jake + T Jake Luciani + + + jasonbrown + Jason Brown + + + jbellis + Jonathan Ellis + + + jfarrell + Jake Farrell + + + jjirsa + Jeff Jirsa + + + jkni + Joel Knighton + + + jmckenzie + Josh McKenzie + + + johan + Johan Oskarsson + + + junrao + Jun Rao + + + jzhuang + Jay Zhuang + + + kohlisankalp + Sankalp Kohli + + + marcuse + Marcus Eriksson + + + mck + Michael Semb Wever + + + mishail + Mikhail Stepura + + + mshuler + Michael Shuler + + + paulo + Paulo Motta + + + pmalik + Prashant Malik + + + rstupp + Robert Stupp + + + scode + Peter Schuller + + + beobal + Sam Tunnicliffe + + + slebresne + Sylvain Lebresne + + + stefania + Stefania Alborghetti + + + tylerhobbs + Tyler Hobbs + + + vijay + Vijay Parthasarathy + + + xedin + Pavel Yaskevich + + + yukim + Yuki Morishita + + + zznate + Nate McCall + + + + scm:https://gitbox.apache.org/repos/asf/cassandra.git + scm:https://gitbox.apache.org/repos/asf/cassandra.git + https://gitbox.apache.org/repos/asf?p=cassandra.git;a=tree + + + + + + org.xerial.snappy + snappy-java + 1.1.8.4 + + + org.lz4 + lz4-java + 1.8.0 + + + com.ning + compress-lzf + 0.8.4 + provided + + + com.github.luben + 
zstd-jni + 1.5.0-4 + + + com.google.guava + guava + 27.0-jre + + + jsr305 + com.google.code.findbugs + + + animal-sniffer-annotations + org.codehaus.mojo + + + listenablefuture + com.google.guava + + + failureaccess + com.google.guava + + + checker-qual + org.checkerframework + + + error_prone_annotations + com.google.errorprone + + + + + com.google.jimfs + jimfs + 1.1 + + + org.hdrhistogram + HdrHistogram + 2.1.9 + + + commons-cli + commons-cli + 1.1 + + + commons-codec + commons-codec + 1.9 + + + commons-io + commons-io + 2.6 + + + org.apache.commons + commons-lang3 + 3.11 + + + org.apache.commons + commons-math3 + 3.2 + + + org.antlr + antlr + 3.5.2 + provided + + + stringtemplate + org.antlr + + + + + org.antlr + ST4 + 4.0.8 + + + org.antlr + antlr-runtime + 3.5.2 + + + stringtemplate + org.antlr + + + + + org.slf4j + slf4j-api + 1.7.25 + + + org.slf4j + log4j-over-slf4j + 1.7.25 + + + org.slf4j + jcl-over-slf4j + 1.7.25 + + + ch.qos.logback + logback-core + 1.2.9 + + + ch.qos.logback + logback-classic + 1.2.9 + + + com.fasterxml.jackson.core + jackson-core + 2.13.2 + + + com.fasterxml.jackson.core + jackson-databind + 2.13.2.2 + + + com.fasterxml.jackson.core + jackson-annotations + 2.13.2 + + + com.fasterxml.jackson.datatype + jackson-datatype-jsr310 + 2.13.2 + + + com.fasterxml.jackson.dataformat + jackson-dataformat-yaml + 2.13.2 + test + + + snakeyaml + org.yaml + + + + + com.googlecode.json-simple + json-simple + 1.1 + + + com.boundary + high-scale-lib + 1.0.6 + + + com.github.jbellis + jamm + ${jamm.version} + + + org.yaml + snakeyaml + 1.26 + + + junit + junit + 4.12 + test + + + hamcrest-core + org.hamcrest + + + + + org.mockito + mockito-core + 3.2.4 + test + + + org.quicktheories + quicktheories + 0.26 + test + + + com.google.code.java-allocation-instrumenter + java-allocation-instrumenter + ${allocation-instrumenter.version} + test + + + guava + com.google.guava + + + + + org.apache.cassandra + harry-core + 0.0.1 + test + + + org.reflections + reflections + 0.10.2 + test + + + org.apache.cassandra + dtest-api + 0.0.13 + test + + + com.puppycrawl.tools + checkstyle + 8.40 + test + + + org.apache.hadoop + hadoop-core + 1.0.3 + provided + + + servlet-api + org.mortbay.jetty + + + commons-logging + commons-logging + + + commons-lang + commons-lang + + + core + org.eclipse.jdt + + + ant + ant + + + junit + junit + + + jackson-mapper-asl + org.codehaus.jackson + + + slf4j-api + org.slf4j + + + + + org.apache.hadoop + hadoop-minicluster + 1.0.3 + provided + + + asm + asm + + + jackson-mapper-asl + org.codehaus.jackson + + + slf4j-api + org.slf4j + + + + + net.java.dev.jna + jna + 5.9.0 + + + org.jacoco + org.jacoco.agent + ${jacoco.version} + test + + + org.jacoco + org.jacoco.ant + ${jacoco.version} + test + + + asm + org.ow2.asm + + + + + org.jboss.byteman + byteman-install + ${byteman.version} + provided + + + org.jboss.byteman + byteman + ${byteman.version} + provided + + + org.jboss.byteman + byteman-submit + ${byteman.version} + provided + + + org.jboss.byteman + byteman-bmunit + ${byteman.version} + provided + + + net.bytebuddy + byte-buddy + ${bytebuddy.version} + + + net.bytebuddy + byte-buddy-agent + ${bytebuddy.version} + + + org.openjdk.jmh + jmh-core + 1.21 + test + + + org.openjdk.jmh + jmh-generator-annprocess + 1.21 + test + + + org.apache.ant + ant-junit + 1.10.12 + test + + + org.apache.cassandra + cassandra-all + 4.1-alpha2-SNAPSHOT + + + io.dropwizard.metrics + metrics-core + 3.1.5 + + + io.dropwizard.metrics + metrics-jvm + 3.1.5 + + + io.dropwizard.metrics + 
metrics-logback + 3.1.5 + + + com.addthis.metrics + reporter-config3 + 3.0.3 + + + hibernate-validator + org.hibernate + + + + + org.mindrot + jbcrypt + 0.4 + + + io.airlift + airline + 0.8 + + + jsr305 + com.google.code.findbugs + + + + + io.netty + netty-bom + 4.1.58.Final + pom + provided + + + io.netty + netty-all + 4.1.58.Final + + + io.netty + netty-tcnative-boringssl-static + 2.0.36.Final + + + net.openhft + chronicle-queue + 5.20.123 + + + tools + com.sun + + + + + net.openhft + chronicle-core + 2.20.126 + + + chronicle-analytics + net.openhft + + + annotations + org.jetbrains + + + + + net.openhft + chronicle-bytes + 2.20.111 + + + annotations + org.jetbrains + + + + + net.openhft + chronicle-wire + 2.20.117 + + + compiler + net.openhft + + + + + net.openhft + chronicle-threads + 2.20.111 + + + affinity + net.openhft + + + jna + net.java.dev.jna + + + jna-platform + net.java.dev.jna + + + + + com.google.code.findbugs + jsr305 + 2.0.2 + + + com.clearspring.analytics + stream + 2.5.2 + + + fastutil + it.unimi.dsi + + + + + com.datastax.cassandra + cassandra-driver-core + 3.11.0 + shaded + + + netty-buffer + io.netty + + + netty-codec + io.netty + + + netty-handler + io.netty + + + netty-transport + io.netty + + + slf4j-api + org.slf4j + + + jnr-ffi + com.github.jnr + + + jnr-posix + com.github.jnr + + + + + org.eclipse.jdt.core.compiler + ecj + ${ecj.version} + + + org.caffinitas.ohc + ohc-core + ${ohc.version} + + + slf4j-api + org.slf4j + + + + + org.caffinitas.ohc + ohc-core-j8 + ${ohc.version} + + + net.ju-n.compile-command-annotations + compile-command-annotations + 1.2.0 + provided + + + org.fusesource + sigar + 1.6.4 + + + log4j + log4j + + + + + com.carrotsearch + hppc + 0.8.1 + + + de.jflex + jflex + ${jflex.version} + + + ant + org.apache.ant + + + + + com.github.rholder + snowball-stemmer + 1.3.0.581.1 + + + com.googlecode.concurrent-trees + concurrent-trees + 2.4.0 + + + com.github.ben-manes.caffeine + caffeine + 2.9.2 + + + org.jctools + jctools-core + 3.1.0 + + + org.ow2.asm + asm + ${asm.version} + + + org.ow2.asm + asm-tree + ${asm.version} + test + + + org.ow2.asm + asm-commons + ${asm.version} + test + + + org.ow2.asm + asm-util + ${asm.version} + test + + + org.gridkit.jvmtool + sjk-cli + 0.14 + + + org.gridkit.jvmtool + sjk-core + 0.14 + + + sjk-hflame + org.gridkit.jvmtool + + + sjk-hflame + org.perfkit.sjk.parsers + + + sjk-jfr-standalone + org.perfkit.sjk.parsers + + + sjk-nps + org.perfkit.sjk.parsers + + + sjk-jfr5 + org.perfkit.sjk.parsers + + + sjk-jfr6 + org.perfkit.sjk.parsers + + + + + org.gridkit.jvmtool + sjk-stacktrace + 0.14 + + + org.gridkit.jvmtool + mxdump + 0.14 + + + org.gridkit.lab + jvm-attach-api + 1.5 + + + org.gridkit.jvmtool + sjk-json + 0.14 + + + com.beust + jcommander + 1.30 + + + org.psjava + psjava + 0.1.19 + + + javax.inject + javax.inject + 1 + + + com.google.j2objc + j2objc-annotations + 1.3 + + + org.junit + junit-bom + 5.6.0 + pom + test + + + org.assertj + assertj-core + 3.15.0 + provided + + + org.awaitility + awaitility + 4.0.3 + test + + + hamcrest + org.hamcrest + + + + + org.hamcrest + hamcrest + 2.2 + test + + + com.github.seancfoley + ipaddress + 5.3.3 + + + + diff --git a/CHANGES.txt b/CHANGES.txt index 76f26023911a..72b4e478fd94 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -1,4 +1,5 @@ 4.2 + * Remove dependency on Maven Ant Tasks (CASSANDRA-17750) * Update ASM(9.1 to 9.3), Mockito(1.10.10 to 1.12.13) and ByteBuddy(3.2.4 to 4.7.0) (CASSANDRA-17835) * Add the ability for operators to loosen the definition of "empty" for 
edge cases (CASSANDRA-17842) * Fix potential out of range exception on column index downsampling (CASSANDRA-17839) diff --git a/build.xml b/build.xml index ba9b81d4bc1d..525486d816e4 100644 --- a/build.xml +++ b/build.xml @@ -15,7 +15,6 @@ limitations under the License. --> @@ -94,11 +93,6 @@ - - - - @@ -124,37 +118,17 @@ + - - + - - - - - - - - - - - - - - - - - - - - @@ -278,7 +252,6 @@ - @@ -457,426 +430,6 @@ - - - - - - - - Downloading Maven ANT Tasks... - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - @@ -1094,15 +647,28 @@ - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + - + @@ -2277,7 +1843,7 @@ - + @@ -2287,7 +1853,7 @@ classpathref="checkstyle.lib.path"/> - + @@ -2303,7 +1869,7 @@ - + @@ -2321,7 +1887,7 @@ From ea44835bf19c97d2fc8da4e2672d8604d741893d Mon Sep 17 00:00:00 2001 From: Milan Krisko Date: Tue, 23 Aug 2022 22:05:46 -0400 Subject: [PATCH 058/159] Mark antora.yml in trunk as `prelease:true` patch by Milan Krisko; reviewed by Mick Semb Wever for CASSANDRA-17823 --- doc/antora.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/antora.yml b/doc/antora.yml index f9fa1480d186..401cbd43a9f4 100644 --- a/doc/antora.yml +++ b/doc/antora.yml @@ -1,6 +1,7 @@ name: Cassandra version: 'trunk' display_version: 'trunk' +prerelease: true asciidoc: attributes: cass_url: 'http://cassandra.apache.org/' From 1e2b60821327c158cba1c11d98eea68531178893 Mon Sep 17 00:00:00 2001 From: Stefan Miklosovic Date: Thu, 11 Aug 2022 16:39:04 +0200 Subject: [PATCH 059/159] Prevent a user from manually removing ephemeral snapshots patch by Stefan Miklosovic; reviewed by Paulo Motta for CASSANDRA-17757 --- CHANGES.txt | 1 + .../org/apache/cassandra/db/Keyspace.java | 14 ---- .../service/SnapshotVerbHandler.java | 2 +- .../cassandra/service/StorageService.java | 19 ++++- .../service/snapshot/SnapshotLoader.java | 20 ++++- .../service/snapshot/SnapshotManager.java | 2 +- .../service/snapshot/TableSnapshot.java | 16 ++++ .../test/EphemeralSnapshotTest.java | 74 ++++++++++++++++--- .../cassandra/db/SystemKeyspaceTest.java | 9 ++- .../service/snapshot/SnapshotLoaderTest.java | 34 +++++++++ 10 files changed, 153 insertions(+), 38 deletions(-) diff --git a/CHANGES.txt b/CHANGES.txt index 1e6cbcb0dd38..38da575202f3 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -1,4 +1,5 @@ 4.2 + * Prevent a user from manually removing ephemeral snapshots (CASSANDRA-17757) * Remove dependency on Maven Ant Tasks (CASSANDRA-17750) * Update ASM(9.1 to 9.3), Mockito(1.10.10 to 1.12.13) and ByteBuddy(3.2.4 to 4.7.0) (CASSANDRA-17835) * Add the ability for operators to loosen the definition of "empty" for edge cases (CASSANDRA-17842) diff --git a/src/java/org/apache/cassandra/db/Keyspace.java b/src/java/org/apache/cassandra/db/Keyspace.java index d6db700519b4..9dc4d4c189df 100644 --- 
a/src/java/org/apache/cassandra/db/Keyspace.java +++ b/src/java/org/apache/cassandra/db/Keyspace.java @@ -302,20 +302,6 @@ public boolean snapshotExists(String snapshotName) return false; } - /** - * Clear all the snapshots for a given keyspace. - * - * @param snapshotName the user supplied snapshot name. It empty or null, - * all the snapshots will be cleaned - */ - public static void clearSnapshot(String snapshotName, String keyspace) - { - RateLimiter clearSnapshotRateLimiter = DatabaseDescriptor.getSnapshotRateLimiter(); - - List tableDirectories = Directories.getKSChildDirectories(keyspace); - Directories.clearSnapshot(snapshotName, tableDirectories, clearSnapshotRateLimiter); - } - /** * @return A list of open SSTableReaders */ diff --git a/src/java/org/apache/cassandra/service/SnapshotVerbHandler.java b/src/java/org/apache/cassandra/service/SnapshotVerbHandler.java index 850c98207991..ecf16fed924e 100644 --- a/src/java/org/apache/cassandra/service/SnapshotVerbHandler.java +++ b/src/java/org/apache/cassandra/service/SnapshotVerbHandler.java @@ -37,7 +37,7 @@ public void doVerb(Message message) SnapshotCommand command = message.payload; if (command.clear_snapshot) { - Keyspace.clearSnapshot(command.snapshot_name, command.keyspace); + StorageService.instance.clearSnapshot(command.snapshot_name, command.keyspace); } else if (DiagnosticSnapshotService.isDiagnosticSnapshotRequest(command)) { diff --git a/src/java/org/apache/cassandra/service/StorageService.java b/src/java/org/apache/cassandra/service/StorageService.java index 80ffa80a8c57..f254c3422190 100644 --- a/src/java/org/apache/cassandra/service/StorageService.java +++ b/src/java/org/apache/cassandra/service/StorageService.java @@ -4190,7 +4190,7 @@ private Keyspace getValidKeyspace(String keyspaceName) * Remove the snapshot with the given name from the given keyspaces. * If no tag is specified we will remove all snapshots. */ - public void clearSnapshot(String tag, String... keyspaceNames) throws IOException + public void clearSnapshot(String tag, String... keyspaceNames) { if(tag == null) tag = ""; @@ -4208,12 +4208,27 @@ public void clearSnapshot(String tag, String... keyspaceNames) throws IOExceptio } for (String keyspace : keyspaces) - Keyspace.clearSnapshot(tag, keyspace); + clearKeyspaceSnapshot(keyspace, tag); if (logger.isDebugEnabled()) logger.debug("Cleared out snapshot directories"); } + /** + * Clear snapshots for a given keyspace. + * @param keyspace keyspace to remove snapshots for + * @param tag the user supplied snapshot name. 
If empty or null, all the snapshots will be cleaned
+     */
+    private void clearKeyspaceSnapshot(String keyspace, String tag)
+    {
+        Set snapshotsToClear = new SnapshotLoader().loadSnapshots(keyspace)
+                                                   .stream()
+                                                   .filter(TableSnapshot.shouldClearSnapshot(tag))
+                                                   .collect(Collectors.toSet());
+        for (TableSnapshot snapshot : snapshotsToClear)
+            snapshotManager.clearSnapshot(snapshot);
+    }
+
     public Map getSnapshotDetails(Map options)
     {
         boolean skipExpiring = options != null && Boolean.parseBoolean(options.getOrDefault("no_ttl", "false"));
diff --git a/src/java/org/apache/cassandra/service/snapshot/SnapshotLoader.java b/src/java/org/apache/cassandra/service/snapshot/SnapshotLoader.java
index 100606e33728..b93fb56f3125 100644
--- a/src/java/org/apache/cassandra/service/snapshot/SnapshotLoader.java
+++ b/src/java/org/apache/cassandra/service/snapshot/SnapshotLoader.java
@@ -54,7 +54,8 @@ public class SnapshotLoader extends SimpleFileVisitor
 {
     private static final Logger logger = LoggerFactory.getLogger(SnapshotLoader.class);

-    static final Pattern SNAPSHOT_DIR_PATTERN = Pattern.compile("(?\\w+)/(?\\w+)\\-(?[0-9a-f]{32})/snapshots/(?[\\w-]+)$");
+    static final Pattern SNAPSHOT_DIR_PATTERN = Pattern.compile("(?\\w+)/(?\\w+)-(?[0-9a-f]{32})/snapshots/(?.+)$");
+    private static final Pattern UUID_PATTERN = Pattern.compile("([0-9a-f]{8})([0-9a-f]{4})([0-9a-f]{4})([0-9a-f]{4})([0-9a-f]+)");

     private final Collection dataDirectories;
     private final Map snapshots = new HashMap<>();
@@ -79,15 +80,21 @@ public SnapshotLoader(Directories directories)
         this(directories.getCFDirectories().stream().map(File::toPath).collect(Collectors.toList()));
     }

-    public Set loadSnapshots()
+    public Set loadSnapshots(String keyspace)
     {
+        // if we supply a keyspace, the maximum walk depth is one level shorter,
+        // because we start the walk one level down in the directory structure
+        int maxDepth = keyspace == null ? 5 : 4;
         for (Path dataDir : dataDirectories)
         {
+            if (keyspace != null)
+                dataDir = dataDir.resolve(keyspace);
+
             try
             {
                 if (new File(dataDir).exists())
                 {
-                    Files.walkFileTree(dataDir, Collections.EMPTY_SET, 5, this);
+                    Files.walkFileTree(dataDir, Collections.emptySet(), maxDepth, this);
                 }
                 else
                 {
@@ -102,6 +109,11 @@ public Set loadSnapshots()
         return snapshots.values().stream().map(TableSnapshot.Builder::build).collect(Collectors.toSet());
     }

+    public Set loadSnapshots()
+    {
+        return loadSnapshots(null);
+    }
+
     @Override
     public FileVisitResult visitFileFailed(Path file, IOException exc) throws IOException
     {
         // Cassandra can remove some files while traversing the tree,
@@ -159,7 +171,7 @@ private void loadSnapshotFromDir(Matcher snapshotDirMatcher, Path snapshotDir)
     protected static UUID parseUUID(String uuidWithoutDashes) throws IllegalArgumentException
     {
         assert uuidWithoutDashes.length() == 32 && !uuidWithoutDashes.contains("-");
-        String dashedUUID = uuidWithoutDashes.replaceFirst("([0-9a-f]{8})([0-9a-f]{4})([0-9a-f]{4})([0-9a-f]{4})([0-9a-f]+)", "$1-$2-$3-$4-$5");
+        String dashedUUID = UUID_PATTERN.matcher(uuidWithoutDashes).replaceFirst("$1-$2-$3-$4-$5");
         return UUID.fromString(dashedUUID);
     }
 }
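A short usage sketch of the per-keyspace loading added above (illustrative only, not part of the patch; the keyspace name "ks1" is made up):

    import java.util.Set;
    import org.apache.cassandra.service.snapshot.SnapshotLoader;
    import org.apache.cassandra.service.snapshot.TableSnapshot;

    public final class SnapshotLoaderSketch
    {
        public static void main(String[] args)
        {
            // Walks only <dataDir>/ks1; we are one level down in the tree, hence the smaller max depth
            Set<TableSnapshot> ks1Snapshots = new SnapshotLoader().loadSnapshots("ks1");

            // The no-arg overload keeps the old behaviour and scans every keyspace
            Set<TableSnapshot> allSnapshots = new SnapshotLoader().loadSnapshots();

            System.out.println(ks1Snapshots.size() + " of " + allSnapshots.size() + " snapshots are in ks1");
        }
    }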
" + + "Ephemeral snapshots are not removable by a user.", + tag, ts.keyspaceName); + return !ts.isEphemeral() && (clearAll || ts.tag.equals(tag)); + }; + } + } diff --git a/test/distributed/org/apache/cassandra/distributed/test/EphemeralSnapshotTest.java b/test/distributed/org/apache/cassandra/distributed/test/EphemeralSnapshotTest.java index a9e804d071e3..2de8f54694bc 100644 --- a/test/distributed/org/apache/cassandra/distributed/test/EphemeralSnapshotTest.java +++ b/test/distributed/org/apache/cassandra/distributed/test/EphemeralSnapshotTest.java @@ -47,6 +47,7 @@ public class EphemeralSnapshotTest extends TestBaseImpl { private static final String snapshotName = "snapshotname"; + private static final String snapshotName2 = "second-snapshot"; private static final String tableName = "city"; @Test @@ -58,14 +59,7 @@ public void testStartupRemovesEphemeralSnapshotOnEphemeralFlagInManifest() throw { Pair initialisationData = initialise(c); - String tableId = initialisationData.left; - String[] dataDirs = initialisationData.right; - - // rewrite manifest, pretend that it is ephemeral - Path manifestPath = findManifest(dataDirs, tableId); - SnapshotManifest manifest = SnapshotManifest.deserializeFromJsonFile(new File(manifestPath)); - SnapshotManifest manifestWithEphemeralFlag = new SnapshotManifest(manifest.files, null, manifest.createdAt, true); - manifestWithEphemeralFlag.serializeToJsonFile(new File(manifestPath)); + rewriteManifestToEphemeral(initialisationData.left, initialisationData.right); verify(c.get(1)); } @@ -98,6 +92,47 @@ public void testStartupRemovesEphemeralSnapshotOnMarkerFile() throws Exception } } + @Test + public void testEphemeralSnapshotIsNotClearableFromNodetool() throws Exception + { + try (Cluster c = init(builder().withNodes(1) + .withConfig(config -> config.with(GOSSIP, NETWORK, NATIVE_PROTOCOL)) + .start())) + { + IInvokableInstance instance = c.get(1); + + Pair initialisationData = initialise(c); + rewriteManifestToEphemeral(initialisationData.left, initialisationData.right); + + assertTrue(instance.nodetoolResult("listsnapshots", "-e").getStdout().contains(snapshotName)); + instance.nodetoolResult("clearsnapshot", "-t", snapshotName).asserts().success(); + // ephemeral snapshot was not removed as it can not be (from nodetool / user operation) + assertTrue(instance.nodetoolResult("listsnapshots", "-e").getStdout().contains(snapshotName)); + + assertFalse(instance.logs().grep("Skipping deletion of ephemeral snapshot 'snapshotname' in keyspace distributed_test_keyspace. " + + "Ephemeral snapshots are not removable by a user.").getResult().isEmpty()); + } + } + + @Test + public void testClearingAllSnapshotsFromNodetoolWillKeepEphemeralSnaphotsIntact() throws Exception + { + try (Cluster c = init(builder().withNodes(1) + .withConfig(config -> config.with(GOSSIP, NETWORK, NATIVE_PROTOCOL)) + .start())) + { + IInvokableInstance instance = c.get(1); + + Pair initialisationData = initialise(c); + + rewriteManifestToEphemeral(initialisationData.left, initialisationData.right); + + instance.nodetoolResult("clearsnapshot", "--all").asserts().success(); + assertTrue(instance.nodetoolResult("listsnapshots", "-e").getStdout().contains(snapshotName)); + assertFalse(instance.nodetoolResult("listsnapshots", "-e").getStdout().contains(snapshotName2)); + } + } + private Pair initialise(Cluster c) { c.schemaChange(withKeyspace("CREATE TABLE IF NOT EXISTS %s." 
diff --git a/test/distributed/org/apache/cassandra/distributed/test/EphemeralSnapshotTest.java b/test/distributed/org/apache/cassandra/distributed/test/EphemeralSnapshotTest.java
index a9e804d071e3..2de8f54694bc 100644
--- a/test/distributed/org/apache/cassandra/distributed/test/EphemeralSnapshotTest.java
+++ b/test/distributed/org/apache/cassandra/distributed/test/EphemeralSnapshotTest.java
@@ -47,6 +47,7 @@ public class EphemeralSnapshotTest extends TestBaseImpl
 {
     private static final String snapshotName = "snapshotname";
+    private static final String snapshotName2 = "second-snapshot";
     private static final String tableName = "city";

     @Test
@@ -58,14 +59,7 @@ public void testStartupRemovesEphemeralSnapshotOnEphemeralFlagInManifest() throw
     {
         Pair initialisationData = initialise(c);

-        String tableId = initialisationData.left;
-        String[] dataDirs = initialisationData.right;
-
-        // rewrite manifest, pretend that it is ephemeral
-        Path manifestPath = findManifest(dataDirs, tableId);
-        SnapshotManifest manifest = SnapshotManifest.deserializeFromJsonFile(new File(manifestPath));
-        SnapshotManifest manifestWithEphemeralFlag = new SnapshotManifest(manifest.files, null, manifest.createdAt, true);
-        manifestWithEphemeralFlag.serializeToJsonFile(new File(manifestPath));
+        rewriteManifestToEphemeral(initialisationData.left, initialisationData.right);

         verify(c.get(1));
     }
@@ -98,6 +92,47 @@ public void testStartupRemovesEphemeralSnapshotOnMarkerFile() throws Exception
         }
     }

+    @Test
+    public void testEphemeralSnapshotIsNotClearableFromNodetool() throws Exception
+    {
+        try (Cluster c = init(builder().withNodes(1)
+                                       .withConfig(config -> config.with(GOSSIP, NETWORK, NATIVE_PROTOCOL))
+                                       .start()))
+        {
+            IInvokableInstance instance = c.get(1);
+
+            Pair initialisationData = initialise(c);
+            rewriteManifestToEphemeral(initialisationData.left, initialisationData.right);
+
+            assertTrue(instance.nodetoolResult("listsnapshots", "-e").getStdout().contains(snapshotName));
+            instance.nodetoolResult("clearsnapshot", "-t", snapshotName).asserts().success();
+            // ephemeral snapshot was not removed as it can not be (from nodetool / user operation)
+            assertTrue(instance.nodetoolResult("listsnapshots", "-e").getStdout().contains(snapshotName));
+
+            assertFalse(instance.logs().grep("Skipping deletion of ephemeral snapshot 'snapshotname' in keyspace distributed_test_keyspace. " +
+                                             "Ephemeral snapshots are not removable by a user.").getResult().isEmpty());
+        }
+    }
+
+    @Test
+    public void testClearingAllSnapshotsFromNodetoolWillKeepEphemeralSnaphotsIntact() throws Exception
+    {
+        try (Cluster c = init(builder().withNodes(1)
+                                       .withConfig(config -> config.with(GOSSIP, NETWORK, NATIVE_PROTOCOL))
+                                       .start()))
+        {
+            IInvokableInstance instance = c.get(1);
+
+            Pair initialisationData = initialise(c);
+
+            rewriteManifestToEphemeral(initialisationData.left, initialisationData.right);
+
+            instance.nodetoolResult("clearsnapshot", "--all").asserts().success();
+            assertTrue(instance.nodetoolResult("listsnapshots", "-e").getStdout().contains(snapshotName));
+            assertFalse(instance.nodetoolResult("listsnapshots", "-e").getStdout().contains(snapshotName2));
+        }
+    }
+
     private Pair initialise(Cluster c)
     {
         c.schemaChange(withKeyspace("CREATE TABLE IF NOT EXISTS %s." + tableName + " (cityid int PRIMARY KEY, name text)"));
@@ -109,6 +144,11 @@ private Pair initialise(Cluster c)
         assertEquals(0, instance.nodetool("snapshot", "-kt", withKeyspace("%s." + tableName), "-t", snapshotName));
         waitForSnapshot(instance, snapshotName);

+        // take one more snapshot, this one is not ephemeral,
+        // starting Cassandra will clear ephemerals, but it will not affect non-ephemeral snapshots
+        assertEquals(0, instance.nodetool("snapshot", "-kt", withKeyspace("%s." + tableName), "-t", snapshotName2));
+        waitForSnapshot(instance, snapshotName2);
+
         String tableId = instance.callOnInstance((IIsolatedExecutor.SerializableCallable) () -> {
             return Keyspace.open(KEYSPACE).getMetadata().tables.get(tableName).get().id.asUUID().toString().replaceAll("-", "");
         });
@@ -122,17 +162,18 @@ private Pair initialise(Cluster c)
     private void verify(IInvokableInstance instance)
     {
         // by default, we do not see ephemerals
-        assertFalse(instance.nodetoolResult("listsnapshots").getStdout().contains("snapshotname"));
+        assertFalse(instance.nodetoolResult("listsnapshots").getStdout().contains(snapshotName));

         // we see them via -e flag
-        assertTrue(instance.nodetoolResult("listsnapshots", "-e").getStdout().contains("snapshotname"));
+        assertTrue(instance.nodetoolResult("listsnapshots", "-e").getStdout().contains(snapshotName));

         Futures.getUnchecked(instance.shutdown());

-        // startup should remove ephemeral marker file
+        // startup should remove ephemeral snapshot
         instance.startup();

-        assertFalse(instance.nodetoolResult("listsnapshots", "-e").getStdout().contains("snapshotname"));
+        assertFalse(instance.nodetoolResult("listsnapshots", "-e").getStdout().contains(snapshotName));
+        assertTrue(instance.nodetoolResult("listsnapshots", "-e").getStdout().contains(snapshotName2));
     }

     private void waitForSnapshot(IInvokableInstance instance, String snapshotName)
@@ -142,6 +183,15 @@ private void waitForSnapshot(IInvokableInstance instance, String snapshotName)
                .until(() -> instance.nodetoolResult("listsnapshots", "-e").getStdout().contains(snapshotName));
     }

+    private void rewriteManifestToEphemeral(String tableId, String[] dataDirs) throws Exception
+    {
+        // rewrite manifest, pretend that it is ephemeral
+        Path manifestPath = findManifest(dataDirs, tableId);
+        SnapshotManifest manifest = SnapshotManifest.deserializeFromJsonFile(new File(manifestPath));
+        SnapshotManifest manifestWithEphemeralFlag = new SnapshotManifest(manifest.files, null, manifest.createdAt, true);
+        manifestWithEphemeralFlag.serializeToJsonFile(new File(manifestPath));
+    }
+
     private Path findManifest(String[] dataDirs, String tableId)
     {
         for (String dataDir : dataDirs)
diff --git a/test/unit/org/apache/cassandra/db/SystemKeyspaceTest.java b/test/unit/org/apache/cassandra/db/SystemKeyspaceTest.java
index bf59b62a39bb..db58fd6d4968 100644
--- a/test/unit/org/apache/cassandra/db/SystemKeyspaceTest.java
+++ b/test/unit/org/apache/cassandra/db/SystemKeyspaceTest.java
@@ -33,6 +33,7 @@
 import org.apache.cassandra.locator.InetAddressAndPort;
 import org.apache.cassandra.schema.SchemaConstants;
 import org.apache.cassandra.schema.SchemaKeyspace;
+import org.apache.cassandra.service.StorageService;
 import org.apache.cassandra.transport.ProtocolVersion;
 import org.apache.cassandra.utils.ByteBufferUtil;
 import org.apache.cassandra.utils.CassandraVersion;
@@ -105,14 +106,14 @@ public void snapshotSystemKeyspaceIfUpgrading() throws IOException
         // First, check that in the absence of any previous installed version, we don't create snapshots
         for
(ColumnFamilyStore cfs : Keyspace.open(SchemaConstants.SYSTEM_KEYSPACE_NAME).getColumnFamilyStores()) cfs.clearUnsafe(); - Keyspace.clearSnapshot(null, SchemaConstants.SYSTEM_KEYSPACE_NAME); + StorageService.instance.clearSnapshot(null, SchemaConstants.SYSTEM_KEYSPACE_NAME); SystemKeyspace.snapshotOnVersionChange(); assertDeleted(); // now setup system.local as if we're upgrading from a previous version setupReleaseVersion(getOlderVersionString()); - Keyspace.clearSnapshot(null, SchemaConstants.SYSTEM_KEYSPACE_NAME); + StorageService.instance.clearSnapshot(null, SchemaConstants.SYSTEM_KEYSPACE_NAME); assertDeleted(); // Compare versions again & verify that snapshots were created for all tables in the system ks @@ -125,7 +126,7 @@ public void snapshotSystemKeyspaceIfUpgrading() throws IOException // clear out the snapshots & set the previous recorded version equal to the latest, we shouldn't // see any new snapshots created this time. - Keyspace.clearSnapshot(null, SchemaConstants.SYSTEM_KEYSPACE_NAME); + StorageService.instance.clearSnapshot(null, SchemaConstants.SYSTEM_KEYSPACE_NAME); setupReleaseVersion(FBUtilities.getReleaseVersionString()); SystemKeyspace.snapshotOnVersionChange(); @@ -134,7 +135,7 @@ public void snapshotSystemKeyspaceIfUpgrading() throws IOException // 10 files expected. assertDeleted(); - Keyspace.clearSnapshot(null, SchemaConstants.SYSTEM_KEYSPACE_NAME); + StorageService.instance.clearSnapshot(null, SchemaConstants.SYSTEM_KEYSPACE_NAME); } @Test diff --git a/test/unit/org/apache/cassandra/service/snapshot/SnapshotLoaderTest.java b/test/unit/org/apache/cassandra/service/snapshot/SnapshotLoaderTest.java index a4a6ad2bad8d..ede9f18343cf 100644 --- a/test/unit/org/apache/cassandra/service/snapshot/SnapshotLoaderTest.java +++ b/test/unit/org/apache/cassandra/service/snapshot/SnapshotLoaderTest.java @@ -127,6 +127,23 @@ public void testSnapshotsWithoutManifests() throws IOException assertThat(snapshots).contains(new TableSnapshot(KEYSPACE_1, TABLE1_NAME, TABLE1_ID, TAG1, null, null, tag1Files, false)); assertThat(snapshots).contains(new TableSnapshot(KEYSPACE_1, TABLE2_NAME, TABLE2_ID, TAG2, null, null, tag2Files, false)); assertThat(snapshots).contains(new TableSnapshot(KEYSPACE_2, TABLE3_NAME, TABLE3_ID, TAG3, null, null, tag3Files, false)); + + // Verify snapshot loading for a specific keyspace + loader = new SnapshotLoader(Arrays.asList(Paths.get(baseDir.toString(), DATA_DIR_1), + Paths.get(baseDir.toString(), DATA_DIR_2), + Paths.get(baseDir.toString(), DATA_DIR_3))); + + snapshots = loader.loadSnapshots(KEYSPACE_1); + assertThat(snapshots).hasSize(2); + assertThat(snapshots).contains(new TableSnapshot(KEYSPACE_1, TABLE1_NAME, TABLE1_ID, TAG1, null, null, tag1Files, false)); + assertThat(snapshots).contains(new TableSnapshot(KEYSPACE_1, TABLE2_NAME, TABLE2_ID, TAG2, null, null, tag2Files, false)); + + loader = new SnapshotLoader(Arrays.asList(Paths.get(baseDir.toString(), DATA_DIR_1), + Paths.get(baseDir.toString(), DATA_DIR_2), + Paths.get(baseDir.toString(), DATA_DIR_3))); + snapshots = loader.loadSnapshots(KEYSPACE_2); + assertThat(snapshots).hasSize(1); + assertThat(snapshots).contains(new TableSnapshot(KEYSPACE_2, TABLE3_NAME, TABLE3_ID, TAG3, null, null, tag3Files, false)); } @Test @@ -204,6 +221,23 @@ public void testSnapshotsWithManifests() throws IOException assertThat(snapshots).contains(new TableSnapshot(KEYSPACE_1, TABLE1_NAME, TABLE1_ID, TAG1, tag1Ts, null, tag1Files, false)); assertThat(snapshots).contains(new TableSnapshot(KEYSPACE_1, TABLE2_NAME, 
TABLE2_ID, TAG2, tag2Ts, tag2Ts.plusSeconds(tag2Ttl.toSeconds()), tag2Files, false)); assertThat(snapshots).contains(new TableSnapshot(KEYSPACE_2, TABLE3_NAME, TABLE3_ID, TAG3, tag3Ts, null, tag3Files, false)); + + // Verify snapshot loading for a specific keyspace + loader = new SnapshotLoader(Arrays.asList(Paths.get(baseDir.toString(), DATA_DIR_1), + Paths.get(baseDir.toString(), DATA_DIR_2), + Paths.get(baseDir.toString(), DATA_DIR_3))); + + snapshots = loader.loadSnapshots(KEYSPACE_1); + assertThat(snapshots).hasSize(2); + assertThat(snapshots).contains(new TableSnapshot(KEYSPACE_1, TABLE1_NAME, TABLE1_ID, TAG1, tag1Ts, null, tag1Files, false)); + assertThat(snapshots).contains(new TableSnapshot(KEYSPACE_1, TABLE2_NAME, TABLE2_ID, TAG2, tag2Ts, tag2Ts.plusSeconds(tag2Ttl.toSeconds()), tag2Files, false)); + + loader = new SnapshotLoader(Arrays.asList(Paths.get(baseDir.toString(), DATA_DIR_1), + Paths.get(baseDir.toString(), DATA_DIR_2), + Paths.get(baseDir.toString(), DATA_DIR_3))); + snapshots = loader.loadSnapshots(KEYSPACE_2); + assertThat(snapshots).hasSize(1); + assertThat(snapshots).contains(new TableSnapshot(KEYSPACE_2, TABLE3_NAME, TABLE3_ID, TAG3, tag3Ts, null, tag3Files, false)); } @Test From 377e114cb1459895423c292cb0bf7f921fd30e43 Mon Sep 17 00:00:00 2001 From: Josh McKenzie Date: Thu, 25 Aug 2022 15:27:24 -0400 Subject: [PATCH 060/159] Introduce compaction priorities to prevent upgrade compaction inability to finish Patch by Alex Petrov; reviewed by Josh McKenzie and Marcus Eriksson for CASSANDRA-17851 Co-authored-by: Alex Petrov Co-authored-by: Josh McKenzie --- CHANGES.txt | 1 + .../cassandra/db/ColumnFamilyStore.java | 42 +++- .../db/compaction/CompactionManager.java | 131 ++++++---- .../compaction/CompactionStrategyManager.java | 4 +- .../db/compaction/OperationType.java | 60 +++-- .../db/repair/PendingAntiCompaction.java | 7 +- .../distributed/test/PreviewRepairTest.java | 31 +-- .../distributed/test/UpgradeSSTablesTest.java | 223 +++++++++++++++++- .../LongLeveledCompactionStrategyTest.java | 4 +- .../db/compaction/CancelCompactionsTest.java | 19 +- .../db/repair/PendingAntiCompactionTest.java | 27 ++- 11 files changed, 421 insertions(+), 128 deletions(-) diff --git a/CHANGES.txt b/CHANGES.txt index 6d6c736e1ee6..a3548e313d44 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -1,4 +1,5 @@ 4.2 + * Introduce compaction priorities to prevent upgrade compaction inability to finish (CASSANDRA-17851) * Prevent a user from manually removing ephemeral snapshots (CASSANDRA-17757) * Remove dependency on Maven Ant Tasks (CASSANDRA-17750) * Update ASM(9.1 to 9.3), Mockito(1.10.10 to 1.12.13) and ByteBuddy(3.2.4 to 4.7.0) (CASSANDRA-17835) diff --git a/src/java/org/apache/cassandra/db/ColumnFamilyStore.java b/src/java/org/apache/cassandra/db/ColumnFamilyStore.java index de1033ae0c7b..e4b9d781e79f 100644 --- a/src/java/org/apache/cassandra/db/ColumnFamilyStore.java +++ b/src/java/org/apache/cassandra/db/ColumnFamilyStore.java @@ -83,6 +83,7 @@ import org.apache.cassandra.db.commitlog.CommitLog; import org.apache.cassandra.db.commitlog.CommitLogPosition; import org.apache.cassandra.db.compaction.AbstractCompactionStrategy; +import org.apache.cassandra.db.compaction.CompactionInfo; import org.apache.cassandra.db.compaction.CompactionManager; import org.apache.cassandra.db.compaction.CompactionStrategyManager; import org.apache.cassandra.db.compaction.OperationType; @@ -1800,7 +1801,7 @@ public CleanupSummary releaseRepairData(Collection sessions, boolean f return session != null && 
sessions.contains(session); }; return runWithCompactionsDisabled(() -> compactionStrategyManager.releaseRepairData(sessions), - predicate, false, true, true); + predicate, OperationType.STREAM, false, true, true); } else { @@ -2539,7 +2540,7 @@ public void clearUnsafe() cfs.runWithCompactionsDisabled((Callable) () -> { cfs.data.reset(memtableFactory.create(new AtomicReference<>(CommitLogPosition.NONE), cfs.metadata, cfs)); return null; - }, true, false); + }, OperationType.P0, true, false); } } @@ -2628,7 +2629,7 @@ public void run() } }; - runWithCompactionsDisabled(FutureTask.callable(truncateRunnable), true, true); + runWithCompactionsDisabled(FutureTask.callable(truncateRunnable), OperationType.P0, true, true); viewManager.build(); @@ -2659,9 +2660,9 @@ public void unloadCf() FBUtilities.waitOnFuture(dumpMemtable()); } - public V runWithCompactionsDisabled(Callable callable, boolean interruptValidation, boolean interruptViews) + public V runWithCompactionsDisabled(Callable callable, OperationType operationType, boolean interruptValidation, boolean interruptViews) { - return runWithCompactionsDisabled(callable, (sstable) -> true, interruptValidation, interruptViews, true); + return runWithCompactionsDisabled(callable, (sstable) -> true, operationType, interruptValidation, interruptViews, true); } /** @@ -2674,13 +2675,13 @@ public V runWithCompactionsDisabled(Callable callable, boolean interruptV * @param interruptIndexes if we should interrupt compactions on indexes. NOTE: if you set this to true your sstablePredicate * must be able to handle LocalPartitioner sstables! */ - public V runWithCompactionsDisabled(Callable callable, Predicate sstablesPredicate, boolean interruptValidation, boolean interruptViews, boolean interruptIndexes) + public V runWithCompactionsDisabled(Callable callable, Predicate sstablesPredicate, OperationType operationType, boolean interruptValidation, boolean interruptViews, boolean interruptIndexes) { // synchronize so that concurrent invocations don't re-enable compactions partway through unexpectedly, // and so we only run one major compaction at a time synchronized (this) { - logger.trace("Cancelling in-progress compactions for {}", metadata.name); + logger.debug("Cancelling in-progress compactions for {}", metadata.name); Iterable toInterruptFor = interruptIndexes ? concatWithIndexes() : Collections.singleton(this); @@ -2689,9 +2690,24 @@ public V runWithCompactionsDisabled(Callable callable, Predicate toInterruptForMetadata = Iterables.transform(toInterruptFor, ColumnFamilyStore::metadata); + try (CompactionManager.CompactionPauser pause = CompactionManager.instance.pauseGlobalCompaction(); CompactionManager.CompactionPauser pausedStrategies = pauseCompactionStrategies(toInterruptFor)) { + List uninterruptibleTasks = CompactionManager.instance.getCompactionsMatching(toInterruptForMetadata, + (info) -> info.getTaskType().priority <= operationType.priority); + if (!uninterruptibleTasks.isEmpty()) + { + logger.info("Unable to cancel in-progress compactions, since they're running with higher or same priority: {}. 
You can abort these operations using `nodetool stop`.", + uninterruptibleTasks.stream().map((compaction) -> String.format("%s@%s (%s)", + compaction.getCompactionInfo().getTaskType(), + compaction.getCompactionInfo().getTable(), + compaction.getCompactionInfo().getTaskId())) + .collect(Collectors.joining(","))); + return null; + } + // interrupt in-progress compactions CompactionManager.instance.interruptCompactionForCFs(toInterruptFor, sstablesPredicate, interruptValidation); CompactionManager.instance.waitForCessation(toInterruptFor, sstablesPredicate); @@ -2701,7 +2717,9 @@ public V runWithCompactionsDisabled(Callable callable, Predicate T withAllSSTables(final OperationType operationType, Function op) { Callable callable = () -> { assert data.getCompacting().isEmpty() : data.getCompacting(); @@ -2767,10 +2785,12 @@ public LifecycleTransaction markAllCompacting(final OperationType operationType) return modifier; }; - return runWithCompactionsDisabled(callable, false, false); + try (LifecycleTransaction compacting = runWithCompactionsDisabled(callable, operationType, false, false)) + { + return op.apply(compacting); + } } - @Override public String toString() { diff --git a/src/java/org/apache/cassandra/db/compaction/CompactionManager.java b/src/java/org/apache/cassandra/db/compaction/CompactionManager.java index 5906ac294af5..dc22b6712ab0 100644 --- a/src/java/org/apache/cassandra/db/compaction/CompactionManager.java +++ b/src/java/org/apache/cassandra/db/compaction/CompactionManager.java @@ -369,68 +369,70 @@ public BackgroundCompactionCandidate getBackgroundCompactionCandidate(ColumnFami * @throws InterruptedException */ @SuppressWarnings("resource") - private AllSSTableOpStatus parallelAllSSTableOperation(final ColumnFamilyStore cfs, final OneSSTableOperation operation, int jobs, OperationType operationType) throws ExecutionException, InterruptedException + private AllSSTableOpStatus parallelAllSSTableOperation(final ColumnFamilyStore cfs, final OneSSTableOperation operation, int jobs, OperationType operationType) { - logger.info("Starting {} for {}.{}", operationType, cfs.keyspace.getName(), cfs.getTableName()); - List transactions = new ArrayList<>(); - List> futures = new ArrayList<>(); - try (LifecycleTransaction compacting = cfs.markAllCompacting(operationType)) - { - if (compacting == null) - return AllSSTableOpStatus.UNABLE_TO_CANCEL; - - Iterable sstables = Lists.newArrayList(operation.filterSSTables(compacting)); - if (Iterables.isEmpty(sstables)) + return cfs.withAllSSTables(operationType, (compacting) -> { + logger.info("Starting {} for {}.{}", operationType, cfs.keyspace.getName(), cfs.getTableName()); + List transactions = new ArrayList<>(); + List> futures = new ArrayList<>(); + try { - logger.info("No sstables to {} for {}.{}", operationType.name(), cfs.keyspace.getName(), cfs.name); - return AllSSTableOpStatus.SUCCESSFUL; - } + if (compacting == null) + return AllSSTableOpStatus.UNABLE_TO_CANCEL; - for (final SSTableReader sstable : sstables) - { - final LifecycleTransaction txn = compacting.split(singleton(sstable)); - transactions.add(txn); - Callable callable = new Callable() + Iterable sstables = Lists.newArrayList(operation.filterSSTables(compacting)); + if (Iterables.isEmpty(sstables)) { - @Override - public Object call() throws Exception - { - operation.execute(txn); - return this; - } - }; - Future fut = executor.submitIfRunning(callable, "paralell sstable operation"); - if (!fut.isCancelled()) - futures.add(fut); - else - return AllSSTableOpStatus.ABORTED; + 
logger.info("No sstables to {} for {}.{}", operationType.name(), cfs.keyspace.getName(), cfs.name); + return AllSSTableOpStatus.SUCCESSFUL; + } - if (jobs > 0 && futures.size() == jobs) + for (final SSTableReader sstable : sstables) { - Future f = FBUtilities.waitOnFirstFuture(futures); - futures.remove(f); + final LifecycleTransaction txn = compacting.split(singleton(sstable)); + transactions.add(txn); + Callable callable = new Callable() + { + @Override + public Object call() throws Exception + { + operation.execute(txn); + return this; + } + }; + Future fut = executor.submitIfRunning(callable, "paralell sstable operation"); + if (!fut.isCancelled()) + futures.add(fut); + else + return AllSSTableOpStatus.ABORTED; + + if (jobs > 0 && futures.size() == jobs) + { + Future f = FBUtilities.waitOnFirstFuture(futures); + futures.remove(f); + } } - } - FBUtilities.waitOnFutures(futures); - assert compacting.originals().isEmpty(); - logger.info("Finished {} for {}.{} successfully", operationType, cfs.keyspace.getName(), cfs.getTableName()); - return AllSSTableOpStatus.SUCCESSFUL; - } - finally - { - // wait on any unfinished futures to make sure we don't close an ongoing transaction - try - { FBUtilities.waitOnFutures(futures); + assert compacting.originals().isEmpty(); + logger.info("Finished {} for {}.{} successfully", operationType, cfs.keyspace.getName(), cfs.getTableName()); + return AllSSTableOpStatus.SUCCESSFUL; } - catch (Throwable t) + finally { - // these are handled/logged in CompactionExecutor#afterExecute + // wait on any unfinished futures to make sure we don't close an ongoing transaction + try + { + FBUtilities.waitOnFutures(futures); + } + catch (Throwable t) + { + // these are handled/logged in CompactionExecutor#afterExecute + } + Throwable fail = Throwables.close(null, transactions); + if (fail != null) + logger.error("Failed to cleanup lifecycle transactions ({} for {}.{})", operationType, cfs.keyspace.getName(), cfs.getTableName(), fail); } - Throwable fail = Throwables.close(null, transactions); - if (fail != null) - logger.error("Failed to cleanup lifecycle transactions ({} for {}.{})", operationType, cfs.keyspace.getName(), cfs.getTableName(), fail); - } + }); } private static interface OneSSTableOperation @@ -914,11 +916,17 @@ public void performMaximal(final ColumnFamilyStore cfStore, boolean splitOutput) @SuppressWarnings("resource") // the tasks are executed in parallel on the executor, making sure that they get closed public List> submitMaximal(final ColumnFamilyStore cfStore, final int gcBefore, boolean splitOutput) + { + return submitMaximal(cfStore, gcBefore, splitOutput, OperationType.MAJOR_COMPACTION); + } + + @SuppressWarnings("resource") + public List> submitMaximal(final ColumnFamilyStore cfStore, final int gcBefore, boolean splitOutput, OperationType operationType) { // here we compute the task off the compaction executor, so having that present doesn't // confuse runWithCompactionsDisabled -- i.e., we don't want to deadlock ourselves, waiting // for ourselves to finish/acknowledge cancellation before continuing. 
- CompactionTasks tasks = cfStore.getCompactionStrategyManager().getMaximalTasks(gcBefore, splitOutput); + CompactionTasks tasks = cfStore.getCompactionStrategyManager().getMaximalTasks(gcBefore, splitOutput, operationType); if (tasks.isEmpty()) return Collections.emptyList(); @@ -963,6 +971,7 @@ public void forceCompaction(ColumnFamilyStore cfStore, Supplier getCompactionsMatching(Iterable columnFamilies, Predicate predicate) + { + Preconditions.checkArgument(columnFamilies != null, "Attempted to getCompactionsMatching in CompactionManager with no columnFamilies specified."); + + List matched = new ArrayList<>(); + // consider all in-progress compactions + for (Holder holder : active.getCompactions()) + { + CompactionInfo info = holder.getCompactionInfo(); + if (info.getTableMetadata() == null || Iterables.contains(columnFamilies, info.getTableMetadata())) + { + if (predicate.test(info)) + matched.add(holder); + } + } + return matched; + } + /** * Try to stop all of the compactions for given ColumnFamilies. * diff --git a/src/java/org/apache/cassandra/db/compaction/CompactionStrategyManager.java b/src/java/org/apache/cassandra/db/compaction/CompactionStrategyManager.java index ca67ddb0ea6f..808ea9ecd6eb 100644 --- a/src/java/org/apache/cassandra/db/compaction/CompactionStrategyManager.java +++ b/src/java/org/apache/cassandra/db/compaction/CompactionStrategyManager.java @@ -991,7 +991,7 @@ private void validateForCompaction(Iterable input) } } - public CompactionTasks getMaximalTasks(final int gcBefore, final boolean splitOutput) + public CompactionTasks getMaximalTasks(final int gcBefore, final boolean splitOutput, OperationType operationType) { maybeReloadDiskBoundaries(); // runWithCompactionsDisabled cancels active compactions and disables them, then we are able @@ -1012,7 +1012,7 @@ public CompactionTasks getMaximalTasks(final int gcBefore, final boolean splitOu readLock.unlock(); } return CompactionTasks.create(tasks); - }, false, false); + }, operationType, false, false); } /** diff --git a/src/java/org/apache/cassandra/db/compaction/OperationType.java b/src/java/org/apache/cassandra/db/compaction/OperationType.java index e957e42c9df5..a15693fe83a8 100644 --- a/src/java/org/apache/cassandra/db/compaction/OperationType.java +++ b/src/java/org/apache/cassandra/db/compaction/OperationType.java @@ -20,35 +20,51 @@ public enum OperationType { /** Each modification here should be also applied to {@link org.apache.cassandra.tools.nodetool.Stop#compactionType} */ - COMPACTION("Compaction"), - VALIDATION("Validation"), - KEY_CACHE_SAVE("Key cache save"), - ROW_CACHE_SAVE("Row cache save"), - COUNTER_CACHE_SAVE("Counter cache save"), - CLEANUP("Cleanup"), - SCRUB("Scrub"), - UPGRADE_SSTABLES("Upgrade sstables"), - INDEX_BUILD("Secondary index build"), - /** Compaction for tombstone removal */ - TOMBSTONE_COMPACTION("Tombstone Compaction"), - UNKNOWN("Unknown compaction type"), - ANTICOMPACTION("Anticompaction after repair"), - VERIFY("Verify"), - FLUSH("Flush"), - STREAM("Stream"), - WRITE("Write"), - VIEW_BUILD("View build"), - INDEX_SUMMARY("Index summary redistribution"), - RELOCATE("Relocate sstables to correct disk"), - GARBAGE_COLLECT("Remove deleted data"); + P0("Cancel all operations", 0), + + // Automation or operator-driven tasks + CLEANUP("Cleanup", 1), + SCRUB("Scrub", 1), + UPGRADE_SSTABLES("Upgrade sstables", 1), + VERIFY("Verify", 1), + MAJOR_COMPACTION("Major compaction", 1), + RELOCATE("Relocate sstables to correct disk", 1), + GARBAGE_COLLECT("Remove deleted data", 1), + + 
// Internal SSTable writing
+ FLUSH("Flush", 1),
+ WRITE("Write", 1),
+
+ ANTICOMPACTION("Anticompaction after repair", 2),
+ VALIDATION("Validation", 3),
+
+ INDEX_BUILD("Secondary index build", 4),
+ VIEW_BUILD("View build", 4),
+
+ COMPACTION("Compaction", 5),
+ TOMBSTONE_COMPACTION("Tombstone Compaction", 5), // Compaction for tombstone removal
+ UNKNOWN("Unknown compaction type", 5),
+
+ STREAM("Stream", 6),
+ KEY_CACHE_SAVE("Key cache save", 6),
+ ROW_CACHE_SAVE("Row cache save", 6),
+ COUNTER_CACHE_SAVE("Counter cache save", 6),
+ INDEX_SUMMARY("Index summary redistribution", 6);

public final String type;
public final String fileName;

- OperationType(String type)
+ // As of now, priority only comes into play when interrupting tasks, to let operator-driven tasks take precedence.
+ // Operation types with a smaller number are allowed to cancel ones with larger numbers.
+ //
+ // Submitted tasks may be prioritised differently when forming a queue, if/when CASSANDRA-11218 is implemented.
+ public final int priority;
+
+ OperationType(String type, int priority)
{
this.type = type;
this.fileName = type.toLowerCase().replace(" ", "");
+ this.priority = priority;
}

public static OperationType fromFileName(String fileName)
diff --git a/src/java/org/apache/cassandra/db/repair/PendingAntiCompaction.java b/src/java/org/apache/cassandra/db/repair/PendingAntiCompaction.java
index af9888a3f105..a993bac5ee0e 100644
--- a/src/java/org/apache/cassandra/db/repair/PendingAntiCompaction.java
+++ b/src/java/org/apache/cassandra/db/repair/PendingAntiCompaction.java
@@ -214,6 +214,11 @@ private AcquireResult acquireTuple()
return null;
}

+ protected AcquireResult acquireSSTables()
+ {
+ return cfs.runWithCompactionsDisabled(this::acquireTuple, predicate, OperationType.ANTICOMPACTION, false, false, false);
+ }
+
public AcquireResult call()
{
logger.debug("acquiring sstables for pending anti compaction on session {}", sessionID);
@@ -231,7 +236,7 @@ public AcquireResult call()
{
// Note that anticompactions are not disabled when running this. This is safe since runWithCompactionsDisabled
// is synchronized - acquireTuple and predicate can only be run by a single thread (for the given cfs).
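// (Hedged sketch, not part of the patch: with OperationType now threaded through
// runWithCompactionsDisabled, the cancellation decision can be thought of as
//
//     static boolean mayInterrupt(OperationType interrupter, OperationType running)
//     {
//         return interrupter.priority < running.priority; // smaller number wins; ties are refused
//     }
//
// so ANTICOMPACTION at priority 2 may stop a COMPACTION at 5 but not a VERIFY at 1. The helper
// name and framing are illustrative; only the priority field and its rule come from the patch.)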
- return cfs.runWithCompactionsDisabled(this::acquireTuple, predicate, false, false, false); + return acquireSSTables(); } catch (SSTableAcquisitionException e) { diff --git a/test/distributed/org/apache/cassandra/distributed/test/PreviewRepairTest.java b/test/distributed/org/apache/cassandra/distributed/test/PreviewRepairTest.java index 90e29f2eeac5..a0b643f0d309 100644 --- a/test/distributed/org/apache/cassandra/distributed/test/PreviewRepairTest.java +++ b/test/distributed/org/apache/cassandra/distributed/test/PreviewRepairTest.java @@ -73,9 +73,7 @@ import static org.apache.cassandra.distributed.api.Feature.NETWORK; import static org.apache.cassandra.distributed.api.IMessageFilters.Matcher; import static org.apache.cassandra.distributed.impl.Instance.deserializeMessage; -import static org.apache.cassandra.distributed.test.PreviewRepairTest.DelayFirstRepairTypeMessageFilter.finalizePropose; import static org.apache.cassandra.distributed.test.PreviewRepairTest.DelayFirstRepairTypeMessageFilter.validationRequest; -import static org.apache.cassandra.net.Verb.FINALIZE_PROPOSE_MSG; import static org.apache.cassandra.net.Verb.VALIDATION_REQ; import static org.apache.cassandra.service.StorageService.instance; import static org.apache.cassandra.utils.concurrent.Condition.newOneTimeCondition; @@ -187,10 +185,14 @@ public void testFinishingIncRepairDuringPreview() throws IOException, Interrupte previewRepairStarted.await(); // this needs to finish before the preview repair is unpaused on node2 cluster.get(1).callOnInstance(repair(options(false, false))); + RepairResult irResult = cluster.get(1).callOnInstance(repair(options(false, false))); continuePreviewRepair.signalAll(); RepairResult rs = rsFuture.get(); - assertFalse(rs.success); // preview repair should have failed + assertFalse(rs.success); // preview repair was started before IR, but has lower priority, so its task will get cancelled assertFalse(rs.wasInconsistent); // and no mismatches should have been reported + + assertTrue(irResult.success); // IR was started after preview repair, but has a higher priority, so it'll be allowed to finish + assertFalse(irResult.wasInconsistent); } finally { @@ -226,34 +228,21 @@ public void testConcurrentIncRepairDuringPreview() throws IOException, Interrupt .messagesMatching(validationRequest(previewRepairStarted, continuePreviewRepair)) .drop(); - Condition irRepairStarted = newOneTimeCondition(); - Condition continueIrRepair = newOneTimeCondition(); - // this blocks the IR from committing, so we can reenable the preview - cluster.filters() - .outbound() - .verbs(FINALIZE_PROPOSE_MSG.id) - .from(1).to(2) - .messagesMatching(finalizePropose(irRepairStarted, continueIrRepair)) - .drop(); - Future previewResult = cluster.get(1).asyncCallsOnInstance(repair(options(true, false))).call(); previewRepairStarted.await(); - // trigger IR and wait till its ready to commit + // trigger IR and wait till it's ready to commit Future irResult = cluster.get(1).asyncCallsOnInstance(repair(options(false, false))).call(); - irRepairStarted.await(); + RepairResult ir = irResult.get(); + assertTrue(ir.success); // IR was submitted after preview repair has acquired sstables, but has higher priority + assertFalse(ir.wasInconsistent); // not preview, so we don't care about preview notification // unblock preview repair and wait for it to complete continuePreviewRepair.signalAll(); RepairResult rs = previewResult.get(); - assertFalse(rs.success); // preview repair should have failed + assertFalse(rs.success); // preview repair 
was started earlier than the IR session, but has lower priority, so it is cancelled
assertFalse(rs.wasInconsistent); // and no mismatches should have been reported
-
- continueIrRepair.signalAll();
- RepairResult ir = irResult.get();
- assertTrue(ir.success);
- assertFalse(ir.wasInconsistent); // not preview, so we don't care about preview notification
}
}

diff --git a/test/distributed/org/apache/cassandra/distributed/test/UpgradeSSTablesTest.java b/test/distributed/org/apache/cassandra/distributed/test/UpgradeSSTablesTest.java
index c599f17e87b1..445e34988566 100644
--- a/test/distributed/org/apache/cassandra/distributed/test/UpgradeSSTablesTest.java
+++ b/test/distributed/org/apache/cassandra/distributed/test/UpgradeSSTablesTest.java
@@ -19,22 +19,209 @@
package org.apache.cassandra.distributed.test;

import java.util.Set;
+import java.util.concurrent.Callable;
+import java.util.concurrent.CompletableFuture;
+import java.util.concurrent.Future;
+import java.util.concurrent.TimeUnit;

import org.junit.Assert;
import org.junit.Test;

-import org.apache.cassandra.config.DatabaseDescriptor;
+import net.bytebuddy.ByteBuddy;
+import net.bytebuddy.dynamic.loading.ClassLoadingStrategy;
+import net.bytebuddy.implementation.MethodDelegation;
+import net.bytebuddy.implementation.bind.annotation.SuperCall;
import org.apache.cassandra.db.ColumnFamilyStore;
import org.apache.cassandra.db.Keyspace;
+import org.apache.cassandra.db.compaction.ActiveCompactions;
+import org.apache.cassandra.db.compaction.CompactionInfo;
+import org.apache.cassandra.db.compaction.CompactionManager;
+import org.apache.cassandra.db.compaction.OperationType;
import org.apache.cassandra.distributed.api.ConsistencyLevel;
import org.apache.cassandra.distributed.api.ICluster;
import org.apache.cassandra.distributed.api.IInvokableInstance;
import org.apache.cassandra.distributed.api.LogAction;
import org.apache.cassandra.io.sstable.Component;
import org.apache.cassandra.io.sstable.format.SSTableReader;
+import org.apache.cassandra.utils.FBUtilities;
+import org.apache.cassandra.utils.concurrent.CountDownLatch;
+
+import static net.bytebuddy.matcher.ElementMatchers.named;
+import static org.apache.cassandra.utils.concurrent.CountDownLatch.newCountDownLatch;

public class UpgradeSSTablesTest extends TestBaseImpl
{
+ @Test
+ public void upgradeSSTablesInterruptsOngoingCompaction() throws Throwable
+ {
+ try (ICluster<IInvokableInstance> cluster = init(builder().withNodes(1).start()))
+ {
+ cluster.schemaChange("CREATE TABLE " + KEYSPACE + ".tbl (pk int, ck int, v text, PRIMARY KEY (pk, ck));");
+ cluster.get(1).acceptsOnInstance((String ks) -> {
+ ColumnFamilyStore cfs = Keyspace.open(ks).getColumnFamilyStore("tbl");
+ cfs.disableAutoCompaction();
+ CompactionManager.instance.setMaximumCompactorThreads(1);
+ CompactionManager.instance.setCoreCompactorThreads(1);
+ }).accept(KEYSPACE);
+
+ String blob = "blob";
+ for (int i = 0; i < 6; i++)
+ blob += blob;
+
+ for (int cnt = 0; cnt < 5; cnt++)
+ {
+ for (int i = 0; i < 100; i++)
+ {
+ cluster.coordinator(1).execute("INSERT INTO " + KEYSPACE + ".tbl (pk, ck, v) VALUES (?,?,?)",
+ ConsistencyLevel.QUORUM, (cnt * 1000) + i, i, blob);
+ }
+ cluster.get(1).nodetool("flush", KEYSPACE, "tbl");
+ }
+
+ LogAction logAction = cluster.get(1).logs();
+ logAction.mark();
+ Future<?> future = cluster.get(1).asyncAcceptsOnInstance((String ks) -> {
+ ColumnFamilyStore cfs = Keyspace.open(ks).getColumnFamilyStore("tbl");
+ CompactionManager.instance.submitMaximal(cfs, FBUtilities.nowInSeconds(), false, OperationType.COMPACTION);
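// (Editor's note: the maximal compaction submitted above is tagged COMPACTION, priority 5,
// so the priority-1 upgradesstables invoked next is allowed to interrupt it.)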
+ }).apply(KEYSPACE);
+ Assert.assertEquals(0, cluster.get(1).nodetool("upgradesstables", "-a", KEYSPACE, "tbl"));
+ future.get();
+ Assert.assertFalse(logAction.grep("Compaction interrupted").getResult().isEmpty());
+ }
+ }
+
+ @Test
+ public void compactionDoesNotCancelUpgradeSSTables() throws Throwable
+ {
+ try (ICluster<IInvokableInstance> cluster = init(builder().withNodes(1).start()))
+ {
+ cluster.schemaChange("CREATE TABLE " + KEYSPACE + ".tbl (pk int, ck int, v text, PRIMARY KEY (pk, ck));");
+ cluster.get(1).acceptsOnInstance((String ks) -> {
+ ColumnFamilyStore cfs = Keyspace.open(ks).getColumnFamilyStore("tbl");
+ cfs.disableAutoCompaction();
+ CompactionManager.instance.setMaximumCompactorThreads(1);
+ CompactionManager.instance.setCoreCompactorThreads(1);
+ }).accept(KEYSPACE);
+
+ String blob = "blob";
+ for (int i = 0; i < 6; i++)
+ blob += blob;
+
+ for (int cnt = 0; cnt < 5; cnt++)
+ {
+ for (int i = 0; i < 100; i++)
+ {
+ cluster.coordinator(1).execute("INSERT INTO " + KEYSPACE + ".tbl (pk, ck, v) VALUES (?,?,?)",
+ ConsistencyLevel.QUORUM, (cnt * 1000) + i, i, blob);
+ }
+ cluster.get(1).nodetool("flush", KEYSPACE, "tbl");
+ }
+
+ LogAction logAction = cluster.get(1).logs();
+ logAction.mark();
+ Assert.assertEquals(0, cluster.get(1).nodetool("upgradesstables", "-a", KEYSPACE, "tbl"));
+ Assert.assertFalse(logAction.watchFor("Compacting").getResult().isEmpty());
+
+ cluster.get(1).acceptsOnInstance((String ks) -> {
+ ColumnFamilyStore cfs = Keyspace.open(ks).getColumnFamilyStore("tbl");
+ FBUtilities.allOf(CompactionManager.instance.submitMaximal(cfs, FBUtilities.nowInSeconds(), false, OperationType.COMPACTION))
+ .awaitUninterruptibly(1, TimeUnit.MINUTES);
+
+ }).accept(KEYSPACE);
+ Assert.assertTrue(logAction.grep("Compaction interrupted").getResult().isEmpty());
+ Assert.assertFalse(logAction.grep("Finished Upgrade sstables").getResult().isEmpty());
+ Assert.assertFalse(logAction.grep("Compacted (.*) 5 sstables to").getResult().isEmpty());
+ }
+ }
+
+ @Test
+ public void cleanupDoesNotInterruptUpgradeSSTables() throws Throwable
+ {
+ try (ICluster<IInvokableInstance> cluster = init(builder().withNodes(1).withInstanceInitializer(BB::install).start()))
+ {
+ cluster.schemaChange("CREATE TABLE " + KEYSPACE + ".tbl (pk int, ck int, v text, PRIMARY KEY (pk, ck));");
+
+ cluster.get(1).acceptsOnInstance((String ks) -> {
+ ColumnFamilyStore cfs = Keyspace.open(ks).getColumnFamilyStore("tbl");
+ cfs.disableAutoCompaction();
+ }).accept(KEYSPACE);
+
+ String blob = "blob";
+ for (int i = 0; i < 6; i++)
+ blob += blob;
+
+ for (int i = 0; i < 10000; i++)
+ {
+ cluster.coordinator(1).execute("INSERT INTO " + KEYSPACE + ".tbl (pk, ck, v) VALUES (?,?,?)",
+ ConsistencyLevel.QUORUM, i, i, blob);
+ }
+
+ cluster.get(1).nodetool("flush", KEYSPACE, "tbl");
+
+ LogAction logAction = cluster.get(1).logs();
+ logAction.mark();
+
+ // Start upgradesstables - use BB (ByteBuddy) to pause once inside ActiveCompactions.beginCompaction
+ Thread upgradeThread = new Thread(() -> {
+ cluster.get(1).nodetool("upgradesstables", "-a", KEYSPACE, "tbl");
+ });
+ upgradeThread.start();
+ Assert.assertTrue(cluster.get(1).callOnInstance(() -> BB.starting.awaitUninterruptibly(1, TimeUnit.MINUTES)));
+
+ // Start a scrub and make sure that it fails; the log check later verifies this was
+ // because it could not cancel the active upgrade sstables operation
+ Assert.assertNotEquals(0, cluster.get(1).nodetool("scrub", KEYSPACE, "tbl"));
+
+ // Now resume the upgrade sstables so the test can shut down
+ cluster.get(1).runOnInstance(() -> {
+ BB.start.decrement();
+ });
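// (Editor's note: SCRUB and UPGRADE_SSTABLES share priority 1, and a task may only cancel
// strictly lower-priority work, so the scrub above exits non-zero; the "higher or same
// priority" log line asserted below is the refusal path.)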
upgradeThread.join(); + + Assert.assertFalse(logAction.grep("Unable to cancel in-progress compactions, since they're running with higher or same priority: Upgrade sstables").getResult().isEmpty()); + Assert.assertFalse(logAction.grep("Starting Scrub for ").getResult().isEmpty()); + Assert.assertFalse(logAction.grep("Finished Upgrade sstables for distributed_test_keyspace.tbl successfully").getResult().isEmpty()); + } + } + + @Test + public void truncateWhileUpgrading() throws Throwable + { + try (ICluster cluster = init(builder().withNodes(1).start())) + { + cluster.schemaChange(withKeyspace("CREATE TABLE %s.tbl (pk int, ck int, v text, PRIMARY KEY (pk, ck)) ")); + cluster.get(1).acceptsOnInstance((String ks) -> { + ColumnFamilyStore cfs = Keyspace.open(ks).getColumnFamilyStore("tbl"); + cfs.disableAutoCompaction(); + CompactionManager.instance.setMaximumCompactorThreads(1); + CompactionManager.instance.setCoreCompactorThreads(1); + }).accept(KEYSPACE); + + String blob = "blob"; + for (int i = 0; i < 10; i++) + blob += blob; + + for (int i = 0; i < 500; i++) + { + cluster.coordinator(1).execute(withKeyspace("INSERT INTO %s.tbl (pk, ck, v) VALUES (?,?,?)"), + ConsistencyLevel.QUORUM, i, i, blob); + if (i > 0 && i % 100 == 0) + cluster.get(1).nodetool("flush", KEYSPACE, "tbl"); + } + + LogAction logAction = cluster.get(1).logs(); + logAction.mark(); + + Future upgrade = CompletableFuture.runAsync(() -> { + cluster.get(1).nodetool("upgradesstables", "-a", KEYSPACE, "tbl"); + }); + + cluster.schemaChange(withKeyspace("TRUNCATE %s.tbl")); + upgrade.get(); + Assert.assertFalse(logAction.grep("Compaction interrupted").getResult().isEmpty()); + } + } + @Test public void rewriteSSTablesTest() throws Throwable { @@ -116,4 +303,38 @@ public void rewriteSSTablesTest() throws Throwable } } } + + public static class BB + { + // Will be initialized in the context of the instance class loader + static CountDownLatch starting = newCountDownLatch(1); + static CountDownLatch start = newCountDownLatch(1); + + public static void install(ClassLoader classLoader, Integer num) + { + new ByteBuddy().rebase(ActiveCompactions.class) + .method(named("beginCompaction")) + .intercept(MethodDelegation.to(BB.class)) + .make() + .load(classLoader, ClassLoadingStrategy.Default.INJECTION); + } + + @SuppressWarnings("unused") + public static void beginCompaction(CompactionInfo.Holder ci, @SuperCall Callable zuperCall) + { + try + { + zuperCall.call(); + if (ci.getCompactionInfo().getTaskType() == OperationType.UPGRADE_SSTABLES) + { + starting.decrement(); + Assert.assertTrue(start.awaitUninterruptibly(1, TimeUnit.MINUTES)); + } + } + catch (Exception e) + { + throw new RuntimeException(e); + } + } + } } diff --git a/test/long/org/apache/cassandra/db/compaction/LongLeveledCompactionStrategyTest.java b/test/long/org/apache/cassandra/db/compaction/LongLeveledCompactionStrategyTest.java index 733e46fd8d88..a780cf1e263b 100644 --- a/test/long/org/apache/cassandra/db/compaction/LongLeveledCompactionStrategyTest.java +++ b/test/long/org/apache/cassandra/db/compaction/LongLeveledCompactionStrategyTest.java @@ -198,9 +198,7 @@ public Void call() throws Exception } return null; } - }, true, true); - - + }, OperationType.COMPACTION, true, true); } @Test diff --git a/test/unit/org/apache/cassandra/db/compaction/CancelCompactionsTest.java b/test/unit/org/apache/cassandra/db/compaction/CancelCompactionsTest.java index 67421ba6d238..51da0c443124 100644 --- a/test/unit/org/apache/cassandra/db/compaction/CancelCompactionsTest.java +++ 
b/test/unit/org/apache/cassandra/db/compaction/CancelCompactionsTest.java @@ -86,14 +86,16 @@ public void cancelTest() throws InterruptedException assertEquals(1, activeCompactions.size()); assertEquals(activeCompactions.get(0).getCompactionInfo().getSSTables(), toMarkCompacting); // predicate requires the non-compacting sstables, should not cancel the one currently compacting: - cfs.runWithCompactionsDisabled(() -> null, (sstable) -> !toMarkCompacting.contains(sstable), false, false, true); + cfs.runWithCompactionsDisabled(() -> null, (sstable) -> !toMarkCompacting.contains(sstable), + OperationType.P0, false, false, true); assertEquals(1, activeCompactions.size()); assertFalse(activeCompactions.get(0).isStopRequested()); // predicate requires the compacting ones - make sure stop is requested and that when we abort that // compaction we actually run the callable (countdown the latch) CountDownLatch cdl = new CountDownLatch(1); - Thread t = new Thread(() -> cfs.runWithCompactionsDisabled(() -> { cdl.countDown(); return null; }, toMarkCompacting::contains, false, false, true)); + Thread t = new Thread(() -> cfs.runWithCompactionsDisabled(() -> { cdl.countDown(); return null; }, toMarkCompacting::contains, + OperationType.P0, false, false, true)); t.start(); while (!activeCompactions.get(0).isStopRequested()) Thread.sleep(100); @@ -139,13 +141,16 @@ public void multipleCompactionsCancelTest() throws InterruptedException expectedSSTables.add(new HashSet<>(sstables.subList(6, 9))); assertEquals(compactingSSTables, expectedSSTables); - cfs.runWithCompactionsDisabled(() -> null, (sstable) -> false, false, false, true); + cfs.runWithCompactionsDisabled(() -> null, (sstable) -> false, + OperationType.P0, false, false, true); assertEquals(2, activeCompactions.size()); assertTrue(activeCompactions.stream().noneMatch(CompactionInfo.Holder::isStopRequested)); CountDownLatch cdl = new CountDownLatch(1); // start a compaction which only needs the sstables where first token is > 50 - these are the sstables compacted by tcts.get(1) - Thread t = new Thread(() -> cfs.runWithCompactionsDisabled(() -> { cdl.countDown(); return null; }, (sstable) -> first(sstable) > 50, false, false, true)); + Thread t = new Thread(() -> cfs.runWithCompactionsDisabled(() -> { cdl.countDown(); return null; }, + (sstable) -> first(sstable) > 50, + OperationType.P0, false, false, true)); t.start(); activeCompactions = getActiveCompactionsForTable(cfs); assertEquals(2, activeCompactions.size()); @@ -333,7 +338,8 @@ public boolean hasNext() } } assertTrue(foundCompaction); - cfs.runWithCompactionsDisabled(() -> {compactionsStopped.countDown(); return null;}, (sstable) -> true, false, false, true); + cfs.runWithCompactionsDisabled(() -> { compactionsStopped.countDown(); return null; }, + (sstable) -> true, OperationType.P0, false, false, true); // wait for the runWithCompactionsDisabled callable compactionsStopped.await(); assertEquals(1, getActiveCompactionsForTable(cfs).size()); @@ -430,7 +436,8 @@ public void test2iCancellation() throws Throwable Set sstables = new HashSet<>(); try (LifecycleTransaction txn = idx.getTracker().tryModify(idx.getLiveSSTables(), OperationType.COMPACTION)) { - getCurrentColumnFamilyStore().runWithCompactionsDisabled(() -> true, (sstable) -> { sstables.add(sstable); return true;}, false, false, false); + getCurrentColumnFamilyStore().runWithCompactionsDisabled(() -> true, (sstable) -> { sstables.add(sstable); return true;}, + OperationType.P0, false, false, false); } // the predicate only gets 
compacting sstables, and we are only compacting the 2i sstables - with interruptIndexes = false we should see no sstables here assertTrue(sstables.isEmpty()); diff --git a/test/unit/org/apache/cassandra/db/repair/PendingAntiCompactionTest.java b/test/unit/org/apache/cassandra/db/repair/PendingAntiCompactionTest.java index a559478b87f2..c95b2dbb0e25 100644 --- a/test/unit/org/apache/cassandra/db/repair/PendingAntiCompactionTest.java +++ b/test/unit/org/apache/cassandra/db/repair/PendingAntiCompactionTest.java @@ -41,17 +41,14 @@ import com.google.common.util.concurrent.ListenableFuture; import com.google.common.util.concurrent.MoreExecutors; -import org.apache.cassandra.Util; -import org.apache.cassandra.concurrent.ExecutorPlus; -import org.apache.cassandra.concurrent.FutureTask; -import org.apache.cassandra.concurrent.ImmediateExecutor; -import org.apache.cassandra.utils.TimeUUID; -import org.apache.cassandra.utils.concurrent.Future; import org.junit.Assert; import org.junit.Test; import org.slf4j.Logger; import org.slf4j.LoggerFactory; +import org.apache.cassandra.concurrent.ExecutorPlus; +import org.apache.cassandra.concurrent.FutureTask; +import org.apache.cassandra.concurrent.ImmediateExecutor; import org.apache.cassandra.config.DatabaseDescriptor; import org.apache.cassandra.cql3.QueryProcessor; import org.apache.cassandra.db.ColumnFamilyStore; @@ -78,7 +75,10 @@ import org.apache.cassandra.schema.TableId; import org.apache.cassandra.service.ActiveRepairService; import org.apache.cassandra.streaming.PreviewKind; +import org.apache.cassandra.Util; import org.apache.cassandra.utils.ByteBufferUtil; +import org.apache.cassandra.utils.concurrent.Future; +import org.apache.cassandra.utils.TimeUUID; import org.apache.cassandra.utils.WrappedRunnable; import org.apache.cassandra.utils.concurrent.Transactional; @@ -664,15 +664,24 @@ public boolean isGlobal() { @Override public boolean apply(SSTableReader sstable) + { + return true; + } + }; + + CompactionManager.instance.active.beginCompaction(holder); + PendingAntiCompaction.AcquisitionCallable acquisitionCallable = new PendingAntiCompaction.AcquisitionCallable(cfs, nextTimeUUID(), 10, 1, acp) + { + protected PendingAntiCompaction.AcquireResult acquireSSTables() { cdl.countDown(); if (cdl.getCount() > 0) throw new PendingAntiCompaction.SSTableAcquisitionException("blah"); - return true; + else + CompactionManager.instance.active.finishCompaction(holder); + return super.acquireSSTables(); } }; - CompactionManager.instance.active.beginCompaction(holder); - PendingAntiCompaction.AcquisitionCallable acquisitionCallable = new PendingAntiCompaction.AcquisitionCallable(cfs, nextTimeUUID(), 10, 1, acp); Future f = es.submit(acquisitionCallable); cdl.await(); assertNotNull(f.get()); From b6d8e2ce6bac816279ce0490fd6292aa7e4124b6 Mon Sep 17 00:00:00 2001 From: Stefan Miklosovic Date: Wed, 7 Sep 2022 16:23:01 +0200 Subject: [PATCH 061/159] Make disabling auto snapshot on selected tables possible patch by Tommy Stendahl; reviewed by Stefan Miklosovic and Aleksey Yeschenko for CASSANDRA-10383 Co-authored-by: Tommy Stendahl Co-authored-by: Stefan Miklosovic --- CHANGES.txt | 1 + NEWS.txt | 5 + pylib/cqlshlib/cql3handling.py | 3 + pylib/cqlshlib/test/test_cqlsh_completion.py | 8 +- pylib/cqlshlib/test/test_cqlsh_output.py | 1 + .../statements/schema/TableAttributes.java | 3 + .../cassandra/db/ColumnFamilyStore.java | 11 +- .../cassandra/schema/SchemaKeyspace.java | 54 +++--- .../cassandra/schema/TableMetadata.java | 6 + 
.../apache/cassandra/schema/TableParams.java | 18 +-
 .../test/AllowAutoSnapshotTest.java | 156 ++++++++++++++++++
 .../statements/DescribeStatementTest.java | 2 +
 .../cassandra/db/SchemaCQLHelperTest.java | 1 +
 .../cassandra/schema/SchemaKeyspaceTest.java | 40 +++++
 14 files changed, 281 insertions(+), 28 deletions(-)
 create mode 100644 test/distributed/org/apache/cassandra/distributed/test/AllowAutoSnapshotTest.java

diff --git a/CHANGES.txt b/CHANGES.txt
index c36017f15785..3328974dad12 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,4 +1,5 @@
 4.2
+ * Make disabling auto snapshot on selected tables possible (CASSANDRA-10383)
 * Introduce compaction priorities to prevent upgrade compaction inability to finish (CASSANDRA-17851)
 * Prevent a user from manually removing ephemeral snapshots (CASSANDRA-17757)
 * Remove dependency on Maven Ant Tasks (CASSANDRA-17750)
diff --git a/NEWS.txt b/NEWS.txt
index b488acbf201b..9457c343bb5f 100644
--- a/NEWS.txt
+++ b/NEWS.txt
@@ -85,6 +85,9 @@ New features
 isEmptyWithoutStatus() our usage of hbState() + applicationState), however there are other failure cases which
 block host replacements and require intrusive workarounds and human intervention to recover from when you
 have something in hbState() you don't expect. See CASSANDRA-17842 for further details.
+ - Added a new CQL table property 'allow_auto_snapshot', which is true by default. When it is set to false and 'auto_snapshot: true'
+ is set in cassandra.yaml, no snapshot will be taken when a table is truncated or dropped. When auto_snapshot in
+ cassandra.yaml is set to false, the newly added table property has no effect.

Upgrading
---------
@@ -92,6 +95,8 @@ Upgrading
 there is a dedicated flag in snapshot manifest instead. On upgrade of a node to version 4.2, on node's start, in case there are such ephemeral
 snapshots on disk, they will be deleted (same behaviour as before) and any new ephemeral snapshots will stop to create
 ephemeral marker files as flag in a snapshot manifest was introduced instead.
+ - A new table property called 'allow_auto_snapshot' was introduced (see section 'New features'). Hence, an upgraded
+ node will be on a new schema version; do a rolling upgrade of all nodes in the cluster to converge to one schema version.
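As a hedged illustration of the interaction described above, the effective check mirrors the
ColumnFamilyStore.isAutoSnapshotEnabled() method added later in this patch (the excerpt below is
lifted from that change; only its standalone presentation here is the editor's):

    public boolean isAutoSnapshotEnabled()
    {
        // Snapshot on DROP or TRUNCATE only when both the node-wide yaml setting
        // and the per-table property allow it.
        return metadata().params.allowAutoSnapshot && DatabaseDescriptor.isAutoSnapshot();
    }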
Deprecation ----------- diff --git a/pylib/cqlshlib/cql3handling.py b/pylib/cqlshlib/cql3handling.py index 7e123bd67a89..0e14676ad52c 100644 --- a/pylib/cqlshlib/cql3handling.py +++ b/pylib/cqlshlib/cql3handling.py @@ -43,6 +43,7 @@ def __str__(self): class Cql3ParsingRuleSet(CqlParsingRuleSet): columnfamily_layout_options = ( + ('allow_auto_snapshot', None), ('bloom_filter_fp_chance', None), ('comment', None), ('gc_grace_seconds', None), @@ -514,6 +515,8 @@ def cf_prop_val_completer(ctxt, cass): return [Hint('')] if this_opt in ('read_repair'): return [Hint('<\'none\'|\'blocking\'>')] + if this_opt == 'allow_auto_snapshot': + return [Hint('')] return [Hint('')] diff --git a/pylib/cqlshlib/test/test_cqlsh_completion.py b/pylib/cqlshlib/test/test_cqlsh_completion.py index 06f7664712b7..8a2679dbde77 100644 --- a/pylib/cqlshlib/test/test_cqlsh_completion.py +++ b/pylib/cqlshlib/test/test_cqlsh_completion.py @@ -616,7 +616,8 @@ def create_columnfamily_table_template(self, name): self.trycompletions(prefix + ' new_table (col_a int PRIMARY KEY) W', immediate='ITH ') self.trycompletions(prefix + ' new_table (col_a int PRIMARY KEY) WITH ', - choices=['bloom_filter_fp_chance', 'compaction', + choices=['allow_auto_snapshot', + 'bloom_filter_fp_chance', 'compaction', 'compression', 'default_time_to_live', 'gc_grace_seconds', 'max_index_interval', @@ -625,7 +626,8 @@ def create_columnfamily_table_template(self, name): 'COMPACT', 'caching', 'comment', 'min_index_interval', 'speculative_retry', 'additional_write_policy', 'cdc', 'read_repair']) self.trycompletions(prefix + ' new_table (col_a int PRIMARY KEY) WITH ', - choices=['bloom_filter_fp_chance', 'compaction', + choices=['allow_auto_snapshot', + 'bloom_filter_fp_chance', 'compaction', 'compression', 'default_time_to_live', 'gc_grace_seconds', 'max_index_interval', @@ -673,7 +675,7 @@ def create_columnfamily_table_template(self, name): choices=[';', 'AND']) self.trycompletions(prefix + " new_table (col_a int PRIMARY KEY) WITH compaction = " + "{'class': 'SizeTieredCompactionStrategy'} AND ", - choices=['bloom_filter_fp_chance', 'compaction', + choices=['allow_auto_snapshot', 'bloom_filter_fp_chance', 'compaction', 'compression', 'default_time_to_live', 'gc_grace_seconds', 'max_index_interval', diff --git a/pylib/cqlshlib/test/test_cqlsh_output.py b/pylib/cqlshlib/test/test_cqlsh_output.py index c430f46766cd..e54bf242342b 100644 --- a/pylib/cqlshlib/test/test_cqlsh_output.py +++ b/pylib/cqlshlib/test/test_cqlsh_output.py @@ -652,6 +652,7 @@ def test_describe_columnfamily_output(self): varcharcol text, varintcol varint ) WITH additional_write_policy = '99p' + AND allow_auto_snapshot = true AND bloom_filter_fp_chance = 0.01 AND caching = {'keys': 'ALL', 'rows_per_partition': 'NONE'} AND cdc = false diff --git a/src/java/org/apache/cassandra/cql3/statements/schema/TableAttributes.java b/src/java/org/apache/cassandra/cql3/statements/schema/TableAttributes.java index fd31e43ec0b8..21cf2c984c5b 100644 --- a/src/java/org/apache/cassandra/cql3/statements/schema/TableAttributes.java +++ b/src/java/org/apache/cassandra/cql3/statements/schema/TableAttributes.java @@ -98,6 +98,9 @@ public static Set allKeywords() private TableParams build(TableParams.Builder builder) { + if (hasOption(Option.ALLOW_AUTO_SNAPSHOT)) + builder.allowAutoSnapshot(getBoolean(Option.ALLOW_AUTO_SNAPSHOT.toString(), true)); + if (hasOption(Option.BLOOM_FILTER_FP_CHANCE)) builder.bloomFilterFpChance(getDouble(Option.BLOOM_FILTER_FP_CHANCE)); diff --git 
a/src/java/org/apache/cassandra/db/ColumnFamilyStore.java b/src/java/org/apache/cassandra/db/ColumnFamilyStore.java index e4b9d781e79f..118069b38d48 100644 --- a/src/java/org/apache/cassandra/db/ColumnFamilyStore.java +++ b/src/java/org/apache/cassandra/db/ColumnFamilyStore.java @@ -2581,7 +2581,7 @@ private void truncateBlocking(boolean noSnapshot) if (!noSnapshot && ((keyspace.getMetadata().params.durableWrites && !memtableWritesAreDurable()) // need to clear dirty regions - || DatabaseDescriptor.isAutoSnapshot())) // need sstable for snapshot + || isAutoSnapshotEnabled())) { replayAfter = forceBlockingFlush(FlushReason.TRUNCATE); viewManager.forceBlockingFlush(FlushReason.TRUNCATE); @@ -2614,7 +2614,7 @@ public void run() "Stopping parent sessions {} due to truncation of tableId="+metadata.id); data.notifyTruncated(truncatedAt); - if (!noSnapshot && DatabaseDescriptor.isAutoSnapshot()) + if (!noSnapshot && isAutoSnapshotEnabled()) snapshot(Keyspace.getTimestampedSnapshotNameWithPrefix(name, SNAPSHOT_TRUNCATE_PREFIX), DatabaseDescriptor.getAutoSnapshotTtl()); discardSSTables(truncatedAt); @@ -3073,6 +3073,11 @@ public boolean isKeyCacheEnabled() return metadata().params.caching.cacheKeys() && CacheService.instance.keyCache.getCapacity() > 0; } + public boolean isAutoSnapshotEnabled() + { + return metadata().params.allowAutoSnapshot && DatabaseDescriptor.isAutoSnapshot(); + } + /** * Discard all SSTables that were created before given timestamp. * @@ -3207,7 +3212,7 @@ void onTableDropped() CompactionManager.instance.interruptCompactionForCFs(concatWithIndexes(), (sstable) -> true, true); - if (DatabaseDescriptor.isAutoSnapshot()) + if (isAutoSnapshotEnabled()) snapshot(Keyspace.getTimestampedSnapshotNameWithPrefix(name, ColumnFamilyStore.SNAPSHOT_DROP_PREFIX), DatabaseDescriptor.getAutoSnapshotTtl()); CommitLog.instance.forceRecycleAllSegments(Collections.singleton(metadata.id)); diff --git a/src/java/org/apache/cassandra/schema/SchemaKeyspace.java b/src/java/org/apache/cassandra/schema/SchemaKeyspace.java index 33d2b7d6c8f9..cbcba70caa5c 100644 --- a/src/java/org/apache/cassandra/schema/SchemaKeyspace.java +++ b/src/java/org/apache/cassandra/schema/SchemaKeyspace.java @@ -100,6 +100,7 @@ private SchemaKeyspace() "CREATE TABLE %s (" + "keyspace_name text," + "table_name text," + + "allow_auto_snapshot boolean," + "bloom_filter_fp_chance double," + "caching frozen>," + "comment text," @@ -168,6 +169,7 @@ private SchemaKeyspace() + "base_table_id uuid," + "base_table_name text," + "where_clause text," + + "allow_auto_snapshot boolean," + "bloom_filter_fp_chance double," + "caching frozen>," + "comment text," @@ -563,6 +565,11 @@ private static void addTableParamsToRowBuilder(TableParams params, Row.SimpleBui // in mixed operation with pre-4.1 versioned node during upgrades. 
if (params.memtable != MemtableParams.DEFAULT) builder.add("memtable", params.memtable.configurationKey()); + + // As above, only add the allow_auto_snapshot column if the value is not default (true) and + // auto-snapshotting is enabled, to avoid RTE in pre-4.2 versioned node during upgrades + if (!params.allowAutoSnapshot) + builder.add("allow_auto_snapshot", false); } private static void addAlterTableToSchemaMutation(TableMetadata oldTable, TableMetadata newTable, Mutation.SimpleBuilder builder) @@ -954,27 +961,32 @@ private static TableMetadata fetchTable(String keyspaceName, String tableName, T @VisibleForTesting static TableParams createTableParamsFromRow(UntypedResultSet.Row row) { - return TableParams.builder() - .bloomFilterFpChance(row.getDouble("bloom_filter_fp_chance")) - .caching(CachingParams.fromMap(row.getFrozenTextMap("caching"))) - .comment(row.getString("comment")) - .compaction(CompactionParams.fromMap(row.getFrozenTextMap("compaction"))) - .compression(CompressionParams.fromMap(row.getFrozenTextMap("compression"))) - .memtable(MemtableParams.get(row.has("memtable") ? row.getString("memtable") : null)) // memtable column was introduced in 4.1 - .defaultTimeToLive(row.getInt("default_time_to_live")) - .extensions(row.getFrozenMap("extensions", UTF8Type.instance, BytesType.instance)) - .gcGraceSeconds(row.getInt("gc_grace_seconds")) - .maxIndexInterval(row.getInt("max_index_interval")) - .memtableFlushPeriodInMs(row.getInt("memtable_flush_period_in_ms")) - .minIndexInterval(row.getInt("min_index_interval")) - .crcCheckChance(row.getDouble("crc_check_chance")) - .speculativeRetry(SpeculativeRetryPolicy.fromString(row.getString("speculative_retry"))) - .additionalWritePolicy(row.has("additional_write_policy") ? - SpeculativeRetryPolicy.fromString(row.getString("additional_write_policy")) : - SpeculativeRetryPolicy.fromString("99PERCENTILE")) - .cdc(row.has("cdc") && row.getBoolean("cdc")) - .readRepair(getReadRepairStrategy(row)) - .build(); + TableParams.Builder builder = TableParams.builder() + .bloomFilterFpChance(row.getDouble("bloom_filter_fp_chance")) + .caching(CachingParams.fromMap(row.getFrozenTextMap("caching"))) + .comment(row.getString("comment")) + .compaction(CompactionParams.fromMap(row.getFrozenTextMap("compaction"))) + .compression(CompressionParams.fromMap(row.getFrozenTextMap("compression"))) + .memtable(MemtableParams.get(row.has("memtable") ? row.getString("memtable") : null)) // memtable column was introduced in 4.1 + .defaultTimeToLive(row.getInt("default_time_to_live")) + .extensions(row.getFrozenMap("extensions", UTF8Type.instance, BytesType.instance)) + .gcGraceSeconds(row.getInt("gc_grace_seconds")) + .maxIndexInterval(row.getInt("max_index_interval")) + .memtableFlushPeriodInMs(row.getInt("memtable_flush_period_in_ms")) + .minIndexInterval(row.getInt("min_index_interval")) + .crcCheckChance(row.getDouble("crc_check_chance")) + .speculativeRetry(SpeculativeRetryPolicy.fromString(row.getString("speculative_retry"))) + .additionalWritePolicy(row.has("additional_write_policy") ? 
+ SpeculativeRetryPolicy.fromString(row.getString("additional_write_policy")) : + SpeculativeRetryPolicy.fromString("99PERCENTILE")) + .cdc(row.has("cdc") && row.getBoolean("cdc")) + .readRepair(getReadRepairStrategy(row)); + + // allow_auto_snapshot column was introduced in 4.2 + if (row.has("allow_auto_snapshot")) + builder.allowAutoSnapshot(row.getBoolean("allow_auto_snapshot")); + + return builder.build(); } private static List fetchColumns(String keyspace, String table, Types types) diff --git a/src/java/org/apache/cassandra/schema/TableMetadata.java b/src/java/org/apache/cassandra/schema/TableMetadata.java index 2e9d507ba9d9..98fb0540c7af 100644 --- a/src/java/org/apache/cassandra/schema/TableMetadata.java +++ b/src/java/org/apache/cassandra/schema/TableMetadata.java @@ -777,6 +777,12 @@ public Builder params(TableParams val) return this; } + public Builder allowAutoSnapshot(boolean val) + { + params.allowAutoSnapshot(val); + return this; + } + public Builder bloomFilterFpChance(double val) { params.bloomFilterFpChance(val); diff --git a/src/java/org/apache/cassandra/schema/TableParams.java b/src/java/org/apache/cassandra/schema/TableParams.java index 440729c4ef67..920bb829fc78 100644 --- a/src/java/org/apache/cassandra/schema/TableParams.java +++ b/src/java/org/apache/cassandra/schema/TableParams.java @@ -41,6 +41,7 @@ public final class TableParams { public enum Option { + ALLOW_AUTO_SNAPSHOT, BLOOM_FILTER_FP_CHANCE, CACHING, COMMENT, @@ -67,6 +68,7 @@ public String toString() } public final String comment; + public final boolean allowAutoSnapshot; public final double bloomFilterFpChance; public final double crcCheckChance; public final int gcGraceSeconds; @@ -87,6 +89,7 @@ public String toString() private TableParams(Builder builder) { comment = builder.comment; + allowAutoSnapshot = builder.allowAutoSnapshot; bloomFilterFpChance = builder.bloomFilterFpChance == null ? 
builder.compaction.defaultBloomFilterFbChance() : builder.bloomFilterFpChance; @@ -114,7 +117,8 @@ public static Builder builder() public static Builder builder(TableParams params) { - return new Builder().bloomFilterFpChance(params.bloomFilterFpChance) + return new Builder().allowAutoSnapshot(params.allowAutoSnapshot) + .bloomFilterFpChance(params.bloomFilterFpChance) .caching(params.caching) .comment(params.comment) .compaction(params.compaction) @@ -204,6 +208,7 @@ public boolean equals(Object o) TableParams p = (TableParams) o; return comment.equals(p.comment) + && allowAutoSnapshot == p.allowAutoSnapshot && bloomFilterFpChance == p.bloomFilterFpChance && crcCheckChance == p.crcCheckChance && gcGraceSeconds == p.gcGraceSeconds @@ -225,6 +230,7 @@ public boolean equals(Object o) public int hashCode() { return Objects.hashCode(comment, + allowAutoSnapshot, bloomFilterFpChance, crcCheckChance, gcGraceSeconds, @@ -247,6 +253,7 @@ public String toString() { return MoreObjects.toStringHelper(this) .add(Option.COMMENT.toString(), comment) + .add(Option.ALLOW_AUTO_SNAPSHOT.toString(), allowAutoSnapshot) .add(Option.BLOOM_FILTER_FP_CHANCE.toString(), bloomFilterFpChance) .add(Option.CRC_CHECK_CHANCE.toString(), crcCheckChance) .add(Option.GC_GRACE_SECONDS.toString(), gcGraceSeconds) @@ -269,6 +276,8 @@ public void appendCqlTo(CqlBuilder builder, boolean isView) { // option names should be in alphabetical order builder.append("additional_write_policy = ").appendWithSingleQuotes(additionalWritePolicy.toString()) + .newLine() + .append("AND allow_auto_snapshot = ").append(allowAutoSnapshot) .newLine() .append("AND bloom_filter_fp_chance = ").append(bloomFilterFpChance) .newLine() @@ -315,6 +324,7 @@ public void appendCqlTo(CqlBuilder builder, boolean isView) public static final class Builder { private String comment = ""; + private boolean allowAutoSnapshot = true; private Double bloomFilterFpChance; private double crcCheckChance = 1.0; private int gcGraceSeconds = 864000; // 10 days @@ -347,6 +357,12 @@ public Builder comment(String val) return this; } + public Builder allowAutoSnapshot(boolean val) + { + allowAutoSnapshot = val; + return this; + } + public Builder bloomFilterFpChance(double val) { bloomFilterFpChance = val; diff --git a/test/distributed/org/apache/cassandra/distributed/test/AllowAutoSnapshotTest.java b/test/distributed/org/apache/cassandra/distributed/test/AllowAutoSnapshotTest.java new file mode 100644 index 000000000000..7a94905abb15 --- /dev/null +++ b/test/distributed/org/apache/cassandra/distributed/test/AllowAutoSnapshotTest.java @@ -0,0 +1,156 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.cassandra.distributed.test; + +import java.io.IOException; +import java.util.Collections; +import java.util.function.Predicate; +import java.util.stream.Stream; + +import org.junit.Test; + +import org.apache.cassandra.distributed.Cluster; +import org.apache.cassandra.distributed.api.IIsolatedExecutor; +import org.apache.cassandra.service.StorageService; + +import static org.apache.cassandra.distributed.Cluster.build; +import static org.apache.cassandra.distributed.api.ConsistencyLevel.ALL; +import static org.apache.cassandra.distributed.api.Feature.GOSSIP; +import static org.awaitility.Awaitility.await; + +public class AllowAutoSnapshotTest extends TestBaseImpl +{ + @Test + public void testAllowAutoSnapshotOnAutoSnapshotEnabled() throws Exception + { + try (Cluster c = getCluster(true)) + { + c.schemaChange(withKeyspace("CREATE TABLE %s.test_table (a text primary key, b int) WITH allow_auto_snapshot = true")); + c.schemaChange(withKeyspace("DROP TABLE %s.test_table;")); + + checkSnapshots(c, true, "test_table", "dropped"); + + c.schemaChange(withKeyspace("CREATE TABLE %s.test_table2 (a text primary key, b int) WITH allow_auto_snapshot = false")); + c.schemaChange(withKeyspace("DROP TABLE %s.test_table2;")); + + checkSnapshots(c, false, "test_table2", "dropped"); + } + } + + @Test + public void testAllowAutoSnapshotOnAutoSnapshotDisabled() throws Exception + { + try (Cluster c = getCluster(false)) + { + c.schemaChange(withKeyspace("CREATE TABLE %s.test_table (a text primary key, b int) WITH allow_auto_snapshot = true")); + c.schemaChange(withKeyspace("DROP TABLE %s.test_table;")); + + checkSnapshots(c, false, "test_table", "dropped"); + + c.schemaChange(withKeyspace("CREATE TABLE %s.test_table2 (a text primary key, b int) WITH allow_auto_snapshot = false")); + c.schemaChange(withKeyspace("DROP TABLE %s.test_table2;")); + + checkSnapshots(c, false, "test_table2", "dropped"); + } + } + + @Test + public void testDisableAndEnableAllowAutoSnapshot() throws Exception + { + try (Cluster c = getCluster(true)) + { + c.schemaChange(withKeyspace("CREATE TABLE %s.test_table (a text primary key, b int) WITH allow_auto_snapshot = true")); + + c.schemaChange(withKeyspace("ALTER TABLE %s.test_table WITH allow_auto_snapshot = false")); + c.schemaChange(withKeyspace("ALTER TABLE %s.test_table WITH allow_auto_snapshot = true")); + + c.schemaChange(withKeyspace("DROP TABLE %s.test_table;")); + + checkSnapshots(c, true, "test_table", "dropped"); + } + } + + @Test + public void testTruncateAllowAutoSnapshot() throws Exception + { + try (Cluster c = getCluster(true)) + { + // allow_auto_snapshot = true + c.schemaChange(withKeyspace("CREATE TABLE %s.test_table (a text primary key, b int) WITH allow_auto_snapshot = true")); + + c.coordinator(1).execute(withKeyspace("INSERT INTO %s.test_table (a, b) VALUES ('a', 1);"), ALL); + + c.schemaChange(withKeyspace("TRUNCATE TABLE %s.test_table;")); + + checkSnapshots(c, true, "test_table", "truncated"); + + // allow_auto_snapshot = false + c.schemaChange(withKeyspace("CREATE TABLE %s.test_table2 (a text primary key, b int) WITH allow_auto_snapshot = false")); + + c.coordinator(1).execute(withKeyspace("INSERT INTO %s.test_table2 (a, b) VALUES ('a', 1);"), ALL); + + c.schemaChange(withKeyspace("TRUNCATE TABLE %s.test_table2;")); + + checkSnapshots(c, false, "test_table2", "truncated"); + } + } + + @Test + public void testMaterializedViewAllowAutoSnapshot() throws Exception + { + try (Cluster c = getCluster(true)) + { + // materialized view 
allow_auto_snapshot = false + c.schemaChange(withKeyspace("CREATE TABLE %s.t (k int, c1 int, c2 int, v1 int, v2 int, PRIMARY KEY (k, c1, c2))")); + c.schemaChange(withKeyspace("CREATE MATERIALIZED VIEW %s.mv1 AS SELECT * FROM t WHERE k IS NOT NULL AND c1 IS NOT NULL AND c2 IS NOT NULL PRIMARY KEY (c1, k, c2) WITH allow_auto_snapshot = false")); + + c.coordinator(1).execute(withKeyspace("INSERT INTO %s.t (k, c1, c2, v1, v2) VALUES (1, 2, 3, 4, 5);"), ALL); + + c.schemaChange(withKeyspace("DROP MATERIALIZED VIEW %s.mv1;")); + + checkSnapshots(c, false, "mv1", "dropped"); + + // materialized view allow_auto_snapshot = true + c.schemaChange(withKeyspace("CREATE MATERIALIZED VIEW %s.mv1 AS SELECT * FROM t WHERE k IS NOT NULL AND c1 IS NOT NULL AND c2 IS NOT NULL PRIMARY KEY (c1, k, c2) WITH allow_auto_snapshot = true")); + + c.schemaChange(withKeyspace("DROP MATERIALIZED VIEW %s.mv1;")); + checkSnapshots(c, true, "mv1", "dropped"); + } + } + + private Cluster getCluster(boolean autoSnapshotEnabled) throws IOException + { + return init(build(2).withConfig(c -> c.with(GOSSIP) + .set("auto_snapshot", autoSnapshotEnabled) + .set("materialized_views_enabled", true)).start()); + } + + private void checkSnapshots(Cluster cluster, boolean shouldContain, String table, String snapshotPrefix) + { + for (int i = 1; i < cluster.size() + 1; i++) + { + final int node = i; // has to be effectively final for the usage in "until" method + await().until(() -> cluster.get(node).appliesOnInstance((IIsolatedExecutor.SerializableTriFunction) (shouldContainSnapshot, tableName, prefix) -> { + Stream stream = StorageService.instance.getSnapshotDetails(Collections.emptyMap()).keySet().stream(); + Predicate predicate = tag -> tag.startsWith(prefix + '-') && tag.endsWith('-' + tableName); + return shouldContainSnapshot ? 
stream.anyMatch(predicate) : stream.noneMatch(predicate); + }).apply(shouldContain, table, snapshotPrefix)); + } + } +} diff --git a/test/unit/org/apache/cassandra/cql3/statements/DescribeStatementTest.java b/test/unit/org/apache/cassandra/cql3/statements/DescribeStatementTest.java index 9ea04b112e0a..83b320c5c34d 100644 --- a/test/unit/org/apache/cassandra/cql3/statements/DescribeStatementTest.java +++ b/test/unit/org/apache/cassandra/cql3/statements/DescribeStatementTest.java @@ -839,6 +839,7 @@ private static String testTableOutput() private static String tableParametersCql() { return "additional_write_policy = '99p'\n" + + " AND allow_auto_snapshot = true\n" + " AND bloom_filter_fp_chance = 0.01\n" + " AND caching = {'keys': 'ALL', 'rows_per_partition': 'NONE'}\n" + " AND cdc = false\n" + @@ -860,6 +861,7 @@ private static String tableParametersCql() private static String mvParametersCql() { return "additional_write_policy = '99p'\n" + + " AND allow_auto_snapshot = true\n" + " AND bloom_filter_fp_chance = 0.01\n" + " AND caching = {'keys': 'ALL', 'rows_per_partition': 'NONE'}\n" + " AND cdc = false\n" + diff --git a/test/unit/org/apache/cassandra/db/SchemaCQLHelperTest.java b/test/unit/org/apache/cassandra/db/SchemaCQLHelperTest.java index 63ef86110177..ef62acc160af 100644 --- a/test/unit/org/apache/cassandra/db/SchemaCQLHelperTest.java +++ b/test/unit/org/apache/cassandra/db/SchemaCQLHelperTest.java @@ -297,6 +297,7 @@ public void testCfmOptionsCQL() assertThat(SchemaCQLHelper.getTableMetadataAsCQL(cfs.metadata(), cfs.keyspace.getMetadata()), containsString("CLUSTERING ORDER BY (cl1 ASC)\n" + " AND additional_write_policy = 'ALWAYS'\n" + + " AND allow_auto_snapshot = true\n" + " AND bloom_filter_fp_chance = 1.0\n" + " AND caching = {'keys': 'ALL', 'rows_per_partition': 'NONE'}\n" + " AND cdc = false\n" + diff --git a/test/unit/org/apache/cassandra/schema/SchemaKeyspaceTest.java b/test/unit/org/apache/cassandra/schema/SchemaKeyspaceTest.java index e80773047acb..77071d6d0e6e 100644 --- a/test/unit/org/apache/cassandra/schema/SchemaKeyspaceTest.java +++ b/test/unit/org/apache/cassandra/schema/SchemaKeyspaceTest.java @@ -24,6 +24,7 @@ import java.util.Collections; import java.util.HashSet; import java.util.Set; +import java.util.UUID; import java.util.concurrent.CyclicBarrier; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; @@ -33,11 +34,13 @@ import com.google.common.collect.ImmutableMap; import org.junit.Assert; +import org.junit.Assume; import org.junit.BeforeClass; import org.junit.Test; import org.junit.runner.RunWith; import org.apache.cassandra.SchemaLoader; +import org.apache.cassandra.config.DatabaseDescriptor; import org.apache.cassandra.cql3.QueryProcessor; import org.apache.cassandra.cql3.UntypedResultSet; import org.apache.cassandra.cql3.statements.schema.CreateTableStatement; @@ -61,6 +64,7 @@ import static org.apache.cassandra.cql3.QueryProcessor.executeOnceInternal; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertTrue; +import static org.junit.Assert.assertFalse; @RunWith(BMUnitRunner.class) public class SchemaKeyspaceTest @@ -181,6 +185,42 @@ public void testReadRepair() } + @Test + public void testAutoSnapshotEnabledOnTable() + { + Assume.assumeTrue(DatabaseDescriptor.isAutoSnapshot()); + String keyspaceName = "AutoSnapshot"; + String tableName = "table1"; + + createTable(keyspaceName, "CREATE TABLE " + tableName + " (a text primary key, b int) WITH allow_auto_snapshot = true"); + + ColumnFamilyStore 
cfs = Keyspace.open(keyspaceName).getColumnFamilyStore(tableName);
+
+ assertTrue(cfs.isAutoSnapshotEnabled());
+
+ SchemaTestUtil.announceTableDrop(keyspaceName, tableName);
+
+ assertFalse(cfs.listSnapshots().isEmpty());
+ }
+
+ @Test
+ public void testAutoSnapshotDisabledOnTable()
+ {
+ Assume.assumeTrue(DatabaseDescriptor.isAutoSnapshot());
+ String keyspaceName = "AutoSnapshot";
+ String tableName = "table2";
+
+ createTable(keyspaceName, "CREATE TABLE " + tableName + " (a text primary key, b int) WITH allow_auto_snapshot = false");
+
+ ColumnFamilyStore cfs = Keyspace.open(keyspaceName).getColumnFamilyStore(tableName);
+
+ assertFalse(cfs.isAutoSnapshotEnabled());
+
+ SchemaTestUtil.announceTableDrop(keyspaceName, tableName);
+
+ assertTrue(cfs.listSnapshots().isEmpty());
+ }
+
private static void updateTable(String keyspace, TableMetadata oldTable, TableMetadata newTable)
{
KeyspaceMetadata ksm = Schema.instance.getKeyspaceInstance(keyspace).getMetadata();

From d6aee7e08c658db9d394a6b7e3e27791b4d6854f Mon Sep 17 00:00:00 2001
From: Caleb Rackliffe
Date: Thu, 1 Sep 2022 11:47:22 -0500
Subject: [PATCH 062/159] Optionally avoid hint transfer during decommission

patch by Caleb Rackliffe; reviewed by Jon Meredith and Stefan Miklosovic for CASSANDRA-17808
---
 CHANGES.txt | 1 +
 conf/cassandra.yaml | 5 +++
 .../org/apache/cassandra/config/Config.java | 1 +
 .../cassandra/config/DatabaseDescriptor.java | 10 +++++
 .../hints/HintsDispatchExecutor.java | 20 ++++++---
 .../cassandra/service/StorageService.java | 28 ++++++++++--
 .../service/StorageServiceMBean.java | 3 ++
 .../test/HintedHandoffAddRemoveNodesTest.java | 45 ++++++++++++++++++-
 8 files changed, 102 insertions(+), 11 deletions(-)

diff --git a/CHANGES.txt b/CHANGES.txt
index 3328974dad12..e4566fa72395 100644
--- a/CHANGES.txt
+++ b/CHANGES.txt
@@ -1,4 +1,5 @@
 4.2
+ * Optionally avoid hint transfer during decommission (CASSANDRA-17808)
 * Make disabling auto snapshot on selected tables possible (CASSANDRA-10383)
 * Introduce compaction priorities to prevent upgrade compaction inability to finish (CASSANDRA-17851)
 * Prevent a user from manually removing ephemeral snapshots (CASSANDRA-17757)
diff --git a/conf/cassandra.yaml b/conf/cassandra.yaml
index 55c8b5a75686..93a581b058c8 100644
--- a/conf/cassandra.yaml
+++ b/conf/cassandra.yaml
@@ -99,6 +99,11 @@ max_hints_file_size: 128MiB
# Disable the option in order to preserve those hints on the disk.
auto_hints_cleanup_enabled: false

+# Enable/disable transferring hints to a peer during decommission. Even when enabled, this does not guarantee
+# consistency for logged batches, and it may delay decommission when coupled with a strict hinted_handoff_throttle.
+# Default: true
+# transfer_hints_on_decommission: true
+
# Compression to apply to the hint files. If omitted, hints files
# will be written uncompressed. LZ4, Snappy, and Deflate compressors
# are supported.
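As a hedged condensation of what this flag controls at decommission time (the snippet below is
assembled from the StorageService change later in this patch; the surrounding control flow is
abbreviated):

    if (DatabaseDescriptor.getTransferHintsOnDecommission())
    {
        setMode(Mode.LEAVING, "streaming hints to other nodes", true);
        hintsSuccess = streamHints();              // hand stored hints off to a peer
    }
    else
    {
        setMode(Mode.LEAVING, "pausing dispatch and deleting hints", true);
        DatabaseDescriptor.setHintedHandoffEnabled(false);
        HintsService.instance.pauseDispatch();
        HintsService.instance.deleteAllHints();    // drop the hints instead of transferring them
    }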
diff --git a/src/java/org/apache/cassandra/config/Config.java b/src/java/org/apache/cassandra/config/Config.java
index 028179475975..c3a406b455d2 100644
--- a/src/java/org/apache/cassandra/config/Config.java
+++ b/src/java/org/apache/cassandra/config/Config.java
@@ -428,6 +428,7 @@ public MemtableOptions()
public ParameterizedClass hints_compression;
public volatile boolean auto_hints_cleanup_enabled = false;
+ public volatile boolean transfer_hints_on_decommission = true;

public volatile boolean incremental_backups = false;
public boolean trickle_fsync = false;
diff --git a/src/java/org/apache/cassandra/config/DatabaseDescriptor.java b/src/java/org/apache/cassandra/config/DatabaseDescriptor.java
index 42b5e27fbaa1..482e95fa7560 100644
--- a/src/java/org/apache/cassandra/config/DatabaseDescriptor.java
+++ b/src/java/org/apache/cassandra/config/DatabaseDescriptor.java
@@ -3157,6 +3157,16 @@ public static void setAutoHintsCleanupEnabled(boolean value)
conf.auto_hints_cleanup_enabled = value;
}

+ public static boolean getTransferHintsOnDecommission()
+ {
+ return conf.transfer_hints_on_decommission;
+ }
+
+ public static void setTransferHintsOnDecommission(boolean enabled)
+ {
+ conf.transfer_hints_on_decommission = enabled;
+ }
+
public static boolean isIncrementalBackupsEnabled()
{
return conf.incremental_backups;
diff --git a/src/java/org/apache/cassandra/hints/HintsDispatchExecutor.java b/src/java/org/apache/cassandra/hints/HintsDispatchExecutor.java
index 0f34db61610e..540f5bd85dc7 100644
--- a/src/java/org/apache/cassandra/hints/HintsDispatchExecutor.java
+++ b/src/java/org/apache/cassandra/hints/HintsDispatchExecutor.java
@@ -192,7 +192,7 @@ public void run()
private boolean transfer(UUID hostId)
{
catalog.stores()
- .map(store -> new DispatchHintsTask(store, hostId))
+ .map(store -> new DispatchHintsTask(store, hostId, true))
.forEach(Runnable::run);

return !catalog.hasFiles();
@@ -205,21 +205,27 @@ private final class DispatchHintsTask implements Runnable
private final UUID hostId;
private final RateLimiter rateLimiter;

- DispatchHintsTask(HintsStore store, UUID hostId)
+ DispatchHintsTask(HintsStore store, UUID hostId, boolean isTransfer)
{
this.store = store;
this.hostId = hostId;

- // rate limit is in bytes per second. Uses Double.MAX_VALUE if disabled (set to 0 in cassandra.yaml).
- // max rate is scaled by the number of nodes in the cluster (CASSANDRA-5272).
- // the goal is to bound maximum hints traffic going towards a particular node from the rest of the cluster,
- // not total outgoing hints traffic from this node - this is why the rate limiter is not shared between
+ // Rate limit is in bytes per second. Uses Double.MAX_VALUE if disabled (set to 0 in cassandra.yaml).
+ // Max rate is scaled by the number of nodes in the cluster (CASSANDRA-5272), unless we are transferring
+ // hints during decommission rather than dispatching them to their final destination.
+ // The goal is to bound maximum hints traffic going towards a particular node from the rest of the cluster,
+ // not total outgoing hints traffic from this node. This is why the rate limiter is not shared between
// all the dispatch tasks (as there will be at most one dispatch task for a particular host id at a time).
- int nodesCount = Math.max(1, StorageService.instance.getTokenMetadata().getAllEndpoints().size() - 1);
+ int nodesCount = isTransfer ?
1 : Math.max(1, StorageService.instance.getTokenMetadata().getAllEndpoints().size() - 1); double throttleInBytes = DatabaseDescriptor.getHintedHandoffThrottleInKiB() * 1024.0 / nodesCount; this.rateLimiter = RateLimiter.create(throttleInBytes == 0 ? Double.MAX_VALUE : throttleInBytes); } + DispatchHintsTask(HintsStore store, UUID hostId) + { + this(store, hostId, false); + } + public void run() { try diff --git a/src/java/org/apache/cassandra/service/StorageService.java b/src/java/org/apache/cassandra/service/StorageService.java index f254c3422190..47fad4a7e09b 100644 --- a/src/java/org/apache/cassandra/service/StorageService.java +++ b/src/java/org/apache/cassandra/service/StorageService.java @@ -312,7 +312,7 @@ public Collection> getPrimaryRangesWithinDC(String keyspace) /* the probability for tracing any particular request, 0 disables tracing and 1 enables for all */ private double traceProbability = 0.0; - private enum Mode { STARTING, NORMAL, JOINING, LEAVING, DECOMMISSIONED, MOVING, DRAINING, DRAINED } + public enum Mode { STARTING, NORMAL, JOINING, LEAVING, DECOMMISSIONED, MOVING, DRAINING, DRAINED } private volatile Mode operationMode = Mode.STARTING; /* Used for tracking drain progress */ @@ -4980,9 +4980,20 @@ private void unbootstrap(Runnable onFinish) throws ExecutionException, Interrupt logger.debug("waiting for batch log processing."); batchlogReplay.get(); - setMode(Mode.LEAVING, "streaming hints to other nodes", true); + Future hintsSuccess = ImmediateFuture.success(null); - Future hintsSuccess = streamHints(); + if (DatabaseDescriptor.getTransferHintsOnDecommission()) + { + setMode(Mode.LEAVING, "streaming hints to other nodes", true); + hintsSuccess = streamHints(); + } + else + { + setMode(Mode.LEAVING, "pausing dispatch and deleting hints", true); + DatabaseDescriptor.setHintedHandoffEnabled(false); + HintsService.instance.pauseDispatch(); + HintsService.instance.deleteAllHints(); + } // wait for the transfer runnables to signal the latch. logger.debug("waiting for stream acks."); @@ -6305,6 +6316,17 @@ public void setHintedHandoffThrottleInKB(int throttleInKB) logger.info("updated hinted_handoff_throttle to {} KiB", throttleInKB); } + public boolean getTransferHintsOnDecommission() + { + return DatabaseDescriptor.getTransferHintsOnDecommission(); + } + + public void setTransferHintsOnDecommission(boolean enabled) + { + DatabaseDescriptor.setTransferHintsOnDecommission(enabled); + logger.info("updated transfer_hints_on_decommission to {}", enabled); + } + @Override public void clearConnectionHistory() { diff --git a/src/java/org/apache/cassandra/service/StorageServiceMBean.java b/src/java/org/apache/cassandra/service/StorageServiceMBean.java index 02485274a385..7e512cdaf7f6 100644 --- a/src/java/org/apache/cassandra/service/StorageServiceMBean.java +++ b/src/java/org/apache/cassandra/service/StorageServiceMBean.java @@ -886,6 +886,9 @@ default int upgradeSSTables(String keyspaceName, boolean excludeCurrentVersion, /** Sets the hinted handoff throttle in KiB per second, per delivery thread. */ public void setHintedHandoffThrottleInKB(int throttleInKB); + public boolean getTransferHintsOnDecommission(); + public void setTransferHintsOnDecommission(boolean enabled); + /** * Resume bootstrap streaming when there is failed data streaming. 
* diff --git a/test/distributed/org/apache/cassandra/distributed/test/HintedHandoffAddRemoveNodesTest.java b/test/distributed/org/apache/cassandra/distributed/test/HintedHandoffAddRemoveNodesTest.java index 5cf1ab66dfc2..add6fdf500d6 100644 --- a/test/distributed/org/apache/cassandra/distributed/test/HintedHandoffAddRemoveNodesTest.java +++ b/test/distributed/org/apache/cassandra/distributed/test/HintedHandoffAddRemoveNodesTest.java @@ -23,18 +23,22 @@ import org.apache.cassandra.distributed.Cluster; import org.apache.cassandra.distributed.action.GossipHelper; +import org.apache.cassandra.distributed.api.ConsistencyLevel; import org.apache.cassandra.distributed.api.IInvokableInstance; import org.apache.cassandra.distributed.api.TokenSupplier; import org.apache.cassandra.distributed.shared.NetworkTopology; -import org.apache.cassandra.distributed.api.ConsistencyLevel; +import org.apache.cassandra.metrics.HintsServiceMetrics; import org.apache.cassandra.metrics.StorageMetrics; import org.apache.cassandra.service.StorageService; import static java.util.concurrent.TimeUnit.SECONDS; +import static org.assertj.core.api.AssertionsForClassTypes.assertThat; import static org.awaitility.Awaitility.await; import static org.junit.Assert.assertEquals; import static org.apache.cassandra.distributed.action.GossipHelper.decommission; +import static org.apache.cassandra.distributed.api.ConsistencyLevel.ALL; +import static org.apache.cassandra.distributed.api.ConsistencyLevel.TWO; import static org.apache.cassandra.distributed.api.Feature.GOSSIP; import static org.apache.cassandra.distributed.api.Feature.NATIVE_PROTOCOL; import static org.apache.cassandra.distributed.api.Feature.NETWORK; @@ -46,6 +50,39 @@ */ public class HintedHandoffAddRemoveNodesTest extends TestBaseImpl { + @SuppressWarnings("Convert2MethodRef") + @Test + public void shouldAvoidHintTransferOnDecommission() throws Exception + { + try (Cluster cluster = init(builder().withNodes(3) + .withConfig(config -> config.set("transfer_hints_on_decommission", false).with(GOSSIP)) + .withoutVNodes() + .start())) + { + cluster.schemaChange(withKeyspace("CREATE TABLE %s.decom_no_hints_test (key int PRIMARY KEY, value int)")); + + cluster.coordinator(1).execute(withKeyspace("INSERT INTO %s.decom_no_hints_test (key, value) VALUES (?, ?)"), ALL, 0, 0); + long hintsBeforeShutdown = countTotalHints(cluster.get(1)); + assertThat(hintsBeforeShutdown).isEqualTo(0); + long hintsDelivered = countHintsDelivered(cluster.get(1)); + assertThat(hintsDelivered).isEqualTo(0); + + // Shutdown node 3 so hints can be written against it. + cluster.get(3).shutdown().get(); + + cluster.coordinator(1).execute(withKeyspace("INSERT INTO %s.decom_no_hints_test (key, value) VALUES (?, ?)"), TWO, 0, 0); + Awaitility.await().until(() -> countTotalHints(cluster.get(1)) > 0); + long hintsAfterShutdown = countTotalHints(cluster.get(1)); + assertThat(hintsAfterShutdown).isEqualTo(1); + + cluster.get(1).nodetoolResult("decommission", "--force").asserts().success(); + long hintsDeliveredByDecom = countHintsDelivered(cluster.get(1)); + String mode = cluster.get(1).callOnInstance(() -> StorageService.instance.getOperationMode()); + assertEquals(StorageService.Mode.DECOMMISSIONED.toString(), mode); + assertThat(hintsDeliveredByDecom).isEqualTo(0); + } + } + /** * Replaces Python dtest {@code hintedhandoff_test.py:TestHintedHandoff.test_hintedhandoff_decom()}. 
*/ @@ -130,6 +167,12 @@ private long countTotalHints(IInvokableInstance instance) return instance.callOnInstance(() -> StorageMetrics.totalHints.getCount()); } + @SuppressWarnings("Convert2MethodRef") + private long countHintsDelivered(IInvokableInstance instance) + { + return instance.callOnInstance(() -> HintsServiceMetrics.hintsSucceeded.getCount()); + } + @SuppressWarnings("SameParameterValue") private void populate(Cluster cluster, String table, int coordinator, int start, int count, ConsistencyLevel cl) { From 51944c5e68bfcee0c2c8e2aeb6b572eae0167965 Mon Sep 17 00:00:00 2001 From: Josh McKenzie Date: Wed, 7 Sep 2022 13:12:22 -0400 Subject: [PATCH 063/159] Make sure preview repairs don't optimise streams unless configured to Patch by Chris Lohfink; reviewed by Josh McKenzie and Marcus Eriksson for CASSANDRA-17865 Co-authored-by: Chris Lohfink Co-authored-by: Josh McKenzie --- CHANGES.txt | 1 + .../repair/messages/RepairOption.java | 20 +++--- .../repair/messages/RepairOptionTest.java | 67 +++++++++++++++++-- 3 files changed, 73 insertions(+), 15 deletions(-) diff --git a/CHANGES.txt b/CHANGES.txt index e4566fa72395..9bf2063066ce 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -1,4 +1,5 @@ 4.2 + * Make sure preview repairs don't optimise streams unless configured to (CASSANDRA-17865) * Optionally avoid hint transfer during decommission (CASSANDRA-17808) * Make disabling auto snapshot on selected tables possible (CASSANDRA-10383) * Introduce compaction priorities to prevent upgrade compaction inability to finish (CASSANDRA-17851) diff --git a/src/java/org/apache/cassandra/repair/messages/RepairOption.java b/src/java/org/apache/cassandra/repair/messages/RepairOption.java index 6bb7fdb61fee..f0508a3e4226 100644 --- a/src/java/org/apache/cassandra/repair/messages/RepairOption.java +++ b/src/java/org/apache/cassandra/repair/messages/RepairOption.java @@ -395,22 +395,20 @@ public boolean isInLocalDCOnly() public boolean optimiseStreams() { - if(optimiseStreams) - return true; - - if (isPullRepair() || isForcedRepair()) + if (isPullRepair()) return false; - if (isIncremental() && DatabaseDescriptor.autoOptimiseIncRepairStreams()) - return true; - - if (isPreview() && DatabaseDescriptor.autoOptimisePreviewRepairStreams()) + if (isPreview()) + { + if (DatabaseDescriptor.autoOptimisePreviewRepairStreams()) + return true; + } + else if (isIncremental() && DatabaseDescriptor.autoOptimiseIncRepairStreams()) return true; - - if (!isIncremental() && DatabaseDescriptor.autoOptimiseFullRepairStreams()) + else if (!isIncremental() && DatabaseDescriptor.autoOptimiseFullRepairStreams()) return true; - return false; + return optimiseStreams; } public boolean ignoreUnreplicatedKeyspaces() diff --git a/test/unit/org/apache/cassandra/repair/messages/RepairOptionTest.java b/test/unit/org/apache/cassandra/repair/messages/RepairOptionTest.java index a6ca084c28e0..0483fcf15c41 100644 --- a/test/unit/org/apache/cassandra/repair/messages/RepairOptionTest.java +++ b/test/unit/org/apache/cassandra/repair/messages/RepairOptionTest.java @@ -23,7 +23,6 @@ import java.util.Map; import java.util.Set; -import org.junit.Assert; import org.junit.Test; import org.apache.cassandra.config.DatabaseDescriptor; @@ -32,6 +31,7 @@ import org.apache.cassandra.dht.Range; import org.apache.cassandra.dht.Token; import org.apache.cassandra.repair.RepairParallelism; +import org.apache.cassandra.streaming.PreviewKind; import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertFalse; @@ -149,17 +149,76 @@ public void 
testForceOption() throws Exception // default value option = RepairOption.parse(options, Murmur3Partitioner.instance); - Assert.assertFalse(option.isForcedRepair()); + assertFalse(option.isForcedRepair()); // explicit true options.put(RepairOption.FORCE_REPAIR_KEY, "true"); option = RepairOption.parse(options, Murmur3Partitioner.instance); - Assert.assertTrue(option.isForcedRepair()); + assertTrue(option.isForcedRepair()); // explicit false options.put(RepairOption.FORCE_REPAIR_KEY, "false"); option = RepairOption.parse(options, Murmur3Partitioner.instance); - Assert.assertFalse(option.isForcedRepair()); + assertFalse(option.isForcedRepair()); + } + + @Test + public void testOptimiseStreams() + { + boolean optFull = DatabaseDescriptor.autoOptimiseFullRepairStreams(); + boolean optInc = DatabaseDescriptor.autoOptimiseIncRepairStreams(); + boolean optPreview = DatabaseDescriptor.autoOptimisePreviewRepairStreams(); + try + { + for (PreviewKind previewKind : PreviewKind.values()) + for (boolean inc : new boolean[] {true, false}) + assertOptimise(previewKind, inc); + } + finally + { + setOptimise(optFull, optInc, optPreview); + } + } + + private void assertHelper(Map options, boolean full, boolean inc, boolean preview, boolean expected) + { + setOptimise(full, inc, preview); + assertEquals(expected, RepairOption.parse(options, Murmur3Partitioner.instance).optimiseStreams()); + } + + private void setOptimise(boolean full, boolean inc, boolean preview) + { + DatabaseDescriptor.setAutoOptimiseFullRepairStreams(full); + DatabaseDescriptor.setAutoOptimiseIncRepairStreams(inc); + DatabaseDescriptor.setAutoOptimisePreviewRepairStreams(preview); + } + + private void assertOptimise(PreviewKind previewKind, boolean incremental) + { + Map options = new HashMap<>(); + options.put(RepairOption.PREVIEW, previewKind.toString()); + options.put(RepairOption.INCREMENTAL_KEY, Boolean.toString(incremental)); + for (boolean a : new boolean[]{ true, false }) + { + for (boolean b : new boolean[]{ true, false }) + { + if (previewKind.isPreview()) + { + assertHelper(options, a, b, true, true); + assertHelper(options, a, b, false, false); + } + else if (incremental) + { + assertHelper(options, a, true, b, true); + assertHelper(options, a, false, b, false); + } + else + { + assertHelper(options, true, a, b, true); + assertHelper(options, false, a, b, false); + } + } + } } private void assertParseThrowsIllegalArgumentExceptionWithMessage(Map optionsToParse, String expectedErrorMessage) From f77e6cd3a020f3ad777c6bd36973ca084a978f1f Mon Sep 17 00:00:00 2001 From: Josh McKenzie Date: Thu, 8 Sep 2022 13:09:57 -0400 Subject: [PATCH 064/159] Remove usage of deprecated javax certificate class Patch by Doug Rohrer; reviewed by Jyothsna Konisa, Francisco Guerrero Hernandez, Josh McKenzie, and Caleb Rackliffe for CASSANDRA-17867 Co-authored-by: Doug Rohrer Co-authored-by: Josh McKenzie --- CHANGES.txt | 1 + src/java/org/apache/cassandra/auth/IAuthenticator.java | 10 ++++++---- .../apache/cassandra/transport/ServerConnection.java | 8 ++++---- 3 files changed, 11 insertions(+), 8 deletions(-) diff --git a/CHANGES.txt b/CHANGES.txt index 224f671e8c2d..36b36b65be16 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -1,4 +1,5 @@ 4.2 + * Remove usage of deprecated javax certificate classes (CASSANDRA-17867) * Make sure preview repairs don't optimise streams unless configured to (CASSANDRA-17865) * Optionally avoid hint transfer during decommission (CASSANDRA-17808) * Make disabling auto snapshot on selected tables possible (CASSANDRA-10383) 
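The diff below replaces the long-deprecated javax.security.cert API with its java.security.cert counterpart: ServerConnection now reads the peer chain via SSLSession#getPeerCertificates() rather than the deprecated getPeerCertificateChain(), and IAuthenticator receives the base Certificate type. A minimal sketch of what the new contract looks like from an authenticator implementer's side (illustrative only, not part of the patch; the CertificateChains helper is an invented name):

    import java.security.cert.Certificate;
    import java.security.cert.X509Certificate;

    // Hypothetical helper showing how an IAuthenticator implementation might
    // consume the new java.security.cert.Certificate[] chain. The entries are
    // expected to be X509Certificate instances, so a defensive instanceof
    // check keeps the code safe if another certificate type ever appears.
    public final class CertificateChains
    {
        private CertificateChains() {}

        // Returns the subject of the leaf (first) certificate, or null when
        // no usable X509 certificate was presented by the client.
        public static String leafSubject(Certificate[] certificates)
        {
            if (certificates == null || certificates.length == 0)
                return null;

            if (certificates[0] instanceof X509Certificate)
                return ((X509Certificate) certificates[0]).getSubjectX500Principal().getName();

            return null;
        }
    }

Passing the base Certificate type keeps the IAuthenticator interface stable even if non-X509 chains ever appear, which is why the updated javadoc below says the entries are expected, not guaranteed, to be X509 instances.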
diff --git a/src/java/org/apache/cassandra/auth/IAuthenticator.java b/src/java/org/apache/cassandra/auth/IAuthenticator.java index 80ea719237b9..9963e4ec6bb6 100644 --- a/src/java/org/apache/cassandra/auth/IAuthenticator.java +++ b/src/java/org/apache/cassandra/auth/IAuthenticator.java @@ -18,11 +18,10 @@ package org.apache.cassandra.auth; import java.net.InetAddress; +import java.security.cert.Certificate; import java.util.Map; import java.util.Set; -import javax.security.cert.X509Certificate; - import org.apache.cassandra.exceptions.AuthenticationException; import org.apache.cassandra.exceptions.ConfigurationException; @@ -73,11 +72,14 @@ public interface IAuthenticator * override this method to gain access to client's certificate chain, if present. * @param clientAddress the IP address of the client whom we wish to authenticate, or null * if an internal client (one not connected over the remote transport). - * @param certificates the peer's X509 Certificate chain, if present. + * @param certificates the peer's Certificate chain, if present. + * It is expected that these will all be instances of {@link java.security.cert.X509Certificate}, + * but we pass them as the base {@link Certificate} in case future implementations leverage + * other certificate types. * @return org.apache.cassandra.auth.IAuthenticator.SaslNegotiator implementation * (see {@link org.apache.cassandra.auth.PasswordAuthenticator.PlainTextSaslAuthenticator}) */ - default SaslNegotiator newSaslNegotiator(InetAddress clientAddress, X509Certificate[] certificates) + default SaslNegotiator newSaslNegotiator(InetAddress clientAddress, Certificate[] certificates) { return newSaslNegotiator(clientAddress); } diff --git a/src/java/org/apache/cassandra/transport/ServerConnection.java b/src/java/org/apache/cassandra/transport/ServerConnection.java index 06e7842e90aa..dd6fa313ae97 100644 --- a/src/java/org/apache/cassandra/transport/ServerConnection.java +++ b/src/java/org/apache/cassandra/transport/ServerConnection.java @@ -17,8 +17,8 @@ */ package org.apache.cassandra.transport; +import java.security.cert.Certificate; import javax.net.ssl.SSLPeerUnverifiedException; -import javax.security.cert.X509Certificate; import org.slf4j.Logger; import org.slf4j.LoggerFactory; @@ -121,11 +121,11 @@ public IAuthenticator.SaslNegotiator getSaslNegotiator(QueryState queryState) return saslNegotiator; } - private X509Certificate[] certificates() + private Certificate[] certificates() { SslHandler sslHandler = (SslHandler) channel().pipeline() .get("ssl"); - X509Certificate[] certificates = null; + Certificate[] certificates = null; if (sslHandler != null) { @@ -133,7 +133,7 @@ private X509Certificate[] certificates() { certificates = sslHandler.engine() .getSession() - .getPeerCertificateChain(); + .getPeerCertificates(); } catch (SSLPeerUnverifiedException e) { From bd2ecb3454649d8c84cf0b1ce8c1e94ed1e06e74 Mon Sep 17 00:00:00 2001 From: Josh McKenzie Date: Thu, 15 Sep 2022 14:22:34 -0400 Subject: [PATCH 065/159] Add metrics around storage usage and compression Patch by Caleb Rackliffe; reviewed by Abe Ratnofsky and Josh McKenzie for CASSANDRA-17898 Co-authored-by: Caleb Rackliffe Co-authored-by: Josh McKenzie --- CHANGES.txt | 1 + .../cassandra/db/lifecycle/Tracker.java | 10 ++ .../sstable/IndexSummaryRedistribution.java | 18 +- .../apache/cassandra/io/sstable/SSTable.java | 10 -- .../io/sstable/format/SSTableReader.java | 29 +++ .../cassandra/metrics/KeyspaceMetrics.java | 24 ++- .../cassandra/metrics/StorageMetrics.java | 20 +++ 
.../cassandra/metrics/TableMetrics.java | 3 + .../cassandra/service/StorageService.java | 8 +- .../service/StorageServiceMBean.java | 3 + .../org/apache/cassandra/tools/NodeProbe.java | 5 + .../apache/cassandra/tools/nodetool/Info.java | 2 + .../test/ClusterStorageUsageTest.java | 165 ++++++++++++++++++ .../distributed/test/NodeToolTest.java | 17 ++ .../cassandra/io/DiskSpaceMetricsTest.java | 33 +++- .../IndexSummaryRedistributionTest.java | 26 ++- .../io/sstable/SSTableRewriterTest.java | 14 +- 17 files changed, 356 insertions(+), 32 deletions(-) create mode 100644 test/distributed/org/apache/cassandra/distributed/test/ClusterStorageUsageTest.java diff --git a/CHANGES.txt b/CHANGES.txt index c59faa6fe117..0439b08b82b0 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -1,4 +1,5 @@ 4.2 + * Add metrics around storage usage and compression (CASSANDRA-17898) * Remove usage of deprecated javax certificate classes (CASSANDRA-17867) * Make sure preview repairs don't optimise streams unless configured to (CASSANDRA-17865) * Optionally avoid hint transfer during decommission (CASSANDRA-17808) diff --git a/src/java/org/apache/cassandra/db/lifecycle/Tracker.java b/src/java/org/apache/cassandra/db/lifecycle/Tracker.java index ab8a74bd1a98..66ecf1c8d8c6 100644 --- a/src/java/org/apache/cassandra/db/lifecycle/Tracker.java +++ b/src/java/org/apache/cassandra/db/lifecycle/Tracker.java @@ -152,6 +152,8 @@ Throwable updateSizeTracking(Iterable oldSSTables, Iterable oldSSTables, Iterable oldSSTables, Iterable oldSSTables, Iterable transactions, long oldSize, long newSize) + private void addHooks(ColumnFamilyStore cfs, Map transactions, long oldSize, long newSize, long oldSizeUncompressed, long newSizeUncompressed) { LifecycleTransaction txn = transactions.get(cfs.metadata.id); txn.runOnCommit(() -> { // The new size will be added in Transactional.commit() as an updated SSTable, more details: CASSANDRA-13738 StorageMetrics.load.dec(oldSize); + StorageMetrics.uncompressedLoad.dec(oldSizeUncompressed); + cfs.metric.liveDiskSpaceUsed.dec(oldSize); + cfs.metric.uncompressedLiveDiskSpaceUsed.dec(oldSizeUncompressed); cfs.metric.totalDiskSpaceUsed.dec(oldSize); }); txn.runOnAbort(() -> { - // the local disk was modified but book keeping couldn't be commited, apply the delta + // the local disk was modified but bookkeeping couldn't be commited, apply the delta long delta = oldSize - newSize; // if new is larger this will be negative, so dec will become a inc + long deltaUncompressed = oldSizeUncompressed - newSizeUncompressed; + StorageMetrics.load.dec(delta); + StorageMetrics.uncompressedLoad.dec(deltaUncompressed); + cfs.metric.liveDiskSpaceUsed.dec(delta); + cfs.metric.uncompressedLiveDiskSpaceUsed.dec(deltaUncompressed); cfs.metric.totalDiskSpaceUsed.dec(delta); }); } diff --git a/src/java/org/apache/cassandra/io/sstable/SSTable.java b/src/java/org/apache/cassandra/io/sstable/SSTable.java index 5194abb10198..3c4f5cd0a9b0 100644 --- a/src/java/org/apache/cassandra/io/sstable/SSTable.java +++ b/src/java/org/apache/cassandra/io/sstable/SSTable.java @@ -320,16 +320,6 @@ public static long estimateRowsFromIndex(RandomAccessReader ifile, Descriptor de return estimatedRows; } - public long bytesOnDisk() - { - long bytes = 0; - for (Component component : components) - { - bytes += new File(descriptor.filenameFor(component)).length(); - } - return bytes; - } - @Override public String toString() { diff --git a/src/java/org/apache/cassandra/io/sstable/format/SSTableReader.java 
b/src/java/org/apache/cassandra/io/sstable/format/SSTableReader.java index f26cf65c93e0..d7dad42b1674 100644 --- a/src/java/org/apache/cassandra/io/sstable/format/SSTableReader.java +++ b/src/java/org/apache/cassandra/io/sstable/format/SSTableReader.java @@ -2360,4 +2360,33 @@ public static void shutdownBlocking(long timeout, TimeUnit unit) throws Interrup ExecutorUtils.shutdownNowAndWait(timeout, unit, syncExecutor); resetTidying(); } + + /** + * @return the physical size on disk of all components for this SSTable in bytes + */ + public long bytesOnDisk() + { + return bytesOnDisk(false); + } + + /** + * @return the total logical/uncompressed size in bytes of all components for this SSTable + */ + public long logicalBytesOnDisk() + { + return bytesOnDisk(true); + } + + private long bytesOnDisk(boolean logical) + { + long bytes = 0; + for (Component component : components) + { + // Only the data file is compressable. + bytes += logical && component == Component.DATA && compression + ? getCompressionMetadata().dataLength + : new File(descriptor.filenameFor(component)).length(); + } + return bytes; + } } diff --git a/src/java/org/apache/cassandra/metrics/KeyspaceMetrics.java b/src/java/org/apache/cassandra/metrics/KeyspaceMetrics.java index 776027e395a1..bba4cd32134f 100644 --- a/src/java/org/apache/cassandra/metrics/KeyspaceMetrics.java +++ b/src/java/org/apache/cassandra/metrics/KeyspaceMetrics.java @@ -59,9 +59,14 @@ public class KeyspaceMetrics public final Gauge pendingFlushes; /** Estimate of number of pending compactios for this CF */ public final Gauge pendingCompactions; - /** Disk space used by SSTables belonging to this CF */ + /** Disk space used by SSTables belonging to tables in this keyspace */ public final Gauge liveDiskSpaceUsed; - /** Total disk space used by SSTables belonging to this CF, including obsolete ones waiting to be GC'd */ + /** Disk space used by SSTables belonging to tables in this keyspace, scaled down by replication factor */ + public final Gauge unreplicatedLiveDiskSpaceUsed; + /** Uncompressed/logical size of SSTables belonging to tables in this keyspace */ + public final Gauge uncompressedLiveDiskSpaceUsed; + /** Uncompressed/logical size of SSTables belonging to tables in this keyspace, scaled down by replication factor */ + public final Gauge unreplicatedUncompressedLiveDiskSpaceUsed; public final Gauge totalDiskSpaceUsed; /** Disk space used by bloom filter */ public final Gauge bloomFilterDiskSpaceUsed; @@ -169,7 +174,7 @@ public class KeyspaceMetrics public final Histogram rowIndexSize; public final MetricNameFactory factory; - private Keyspace keyspace; + private final Keyspace keyspace; /** set containing names of all the metrics stored here, for releasing later */ private Set allMetrics = Sets.newHashSet(); @@ -201,8 +206,15 @@ public KeyspaceMetrics(final Keyspace ks) metric -> metric.memtableSwitchCount.getCount()); pendingCompactions = createKeyspaceGauge("PendingCompactions", metric -> metric.pendingCompactions.getValue()); pendingFlushes = createKeyspaceGauge("PendingFlushes", metric -> metric.pendingFlushes.getCount()); + liveDiskSpaceUsed = createKeyspaceGauge("LiveDiskSpaceUsed", metric -> metric.liveDiskSpaceUsed.getCount()); + uncompressedLiveDiskSpaceUsed = createKeyspaceGauge("UncompressedLiveDiskSpaceUsed", metric -> metric.uncompressedLiveDiskSpaceUsed.getCount()); + unreplicatedLiveDiskSpaceUsed = createKeyspaceGauge("UnreplicatedLiveDiskSpaceUsed", + metric -> metric.liveDiskSpaceUsed.getCount() / 
keyspace.getReplicationStrategy().getReplicationFactor().fullReplicas); + unreplicatedUncompressedLiveDiskSpaceUsed = createKeyspaceGauge("UnreplicatedUncompressedLiveDiskSpaceUsed", + metric -> metric.uncompressedLiveDiskSpaceUsed.getCount() / keyspace.getReplicationStrategy().getReplicationFactor().fullReplicas); totalDiskSpaceUsed = createKeyspaceGauge("TotalDiskSpaceUsed", metric -> metric.totalDiskSpaceUsed.getCount()); + bloomFilterDiskSpaceUsed = createKeyspaceGauge("BloomFilterDiskSpaceUsed", metric -> metric.bloomFilterDiskSpaceUsed.getValue()); bloomFilterOffHeapMemoryUsed = createKeyspaceGauge("BloomFilterOffHeapMemoryUsed", @@ -280,8 +292,10 @@ public void release() /** * Creates a gauge that will sum the current value of a metric for all column families in this keyspace - * @param name - * @param extractor + * + * @param name the name of the metric being created + * @param extractor a function that produces a specified metric value for a given table + * * @return Gauge>Long> that computes sum of MetricValue.getValue() */ private Gauge createKeyspaceGauge(String name, final ToLongFunction extractor) diff --git a/src/java/org/apache/cassandra/metrics/StorageMetrics.java b/src/java/org/apache/cassandra/metrics/StorageMetrics.java index 9399ba67893a..d86a2144f248 100644 --- a/src/java/org/apache/cassandra/metrics/StorageMetrics.java +++ b/src/java/org/apache/cassandra/metrics/StorageMetrics.java @@ -17,7 +17,12 @@ */ package org.apache.cassandra.metrics; +import java.util.function.ToLongFunction; +import java.util.stream.StreamSupport; + import com.codahale.metrics.Counter; +import com.codahale.metrics.Gauge; +import org.apache.cassandra.db.Keyspace; import static org.apache.cassandra.metrics.CassandraMetricsRegistry.Metrics; @@ -29,8 +34,23 @@ public class StorageMetrics private static final MetricNameFactory factory = new DefaultNameFactory("Storage"); public static final Counter load = Metrics.counter(factory.createMetricName("Load")); + public static final Counter uncompressedLoad = Metrics.counter(factory.createMetricName("UncompressedLoad")); + + public static final Gauge unreplicatedLoad = + createSummingGauge("UnreplicatedLoad", metric -> metric.unreplicatedLiveDiskSpaceUsed.getValue()); + public static final Gauge unreplicatedUncompressedLoad = + createSummingGauge("UnreplicatedUncompressedLoad", metric -> metric.unreplicatedUncompressedLiveDiskSpaceUsed.getValue()); + public static final Counter uncaughtExceptions = Metrics.counter(factory.createMetricName("Exceptions")); public static final Counter totalHintsInProgress = Metrics.counter(factory.createMetricName("TotalHintsInProgress")); public static final Counter totalHints = Metrics.counter(factory.createMetricName("TotalHints")); public static final Counter repairExceptions = Metrics.counter(factory.createMetricName("RepairExceptions")); + + private static Gauge createSummingGauge(String name, ToLongFunction extractor) + { + return Metrics.register(factory.createMetricName(name), + () -> StreamSupport.stream(Keyspace.all().spliterator(), false) + .mapToLong(keyspace -> extractor.applyAsLong(keyspace.metric)) + .sum()); + } } diff --git a/src/java/org/apache/cassandra/metrics/TableMetrics.java b/src/java/org/apache/cassandra/metrics/TableMetrics.java index 5e7ab78b29e9..8f3645dd1e63 100644 --- a/src/java/org/apache/cassandra/metrics/TableMetrics.java +++ b/src/java/org/apache/cassandra/metrics/TableMetrics.java @@ -122,6 +122,8 @@ public class TableMetrics public final Gauge oldVersionSSTableCount; /** Disk space used 
by SSTables belonging to this table */ public final Counter liveDiskSpaceUsed; + /** Uncompressed/logical disk space used by SSTables belonging to this table */ + public final Counter uncompressedLiveDiskSpaceUsed; /** Total disk space used by SSTables belonging to this table, including obsolete ones waiting to be GC'd */ public final Counter totalDiskSpaceUsed; /** Size of the smallest compacted partition */ @@ -605,6 +607,7 @@ public Integer getValue() } }); liveDiskSpaceUsed = createTableCounter("LiveDiskSpaceUsed"); + uncompressedLiveDiskSpaceUsed = createTableCounter("UncompressedLiveDiskSpaceUsed"); totalDiskSpaceUsed = createTableCounter("TotalDiskSpaceUsed"); minPartitionSize = createTableGauge("MinPartitionSize", "MinRowSize", new Gauge() { diff --git a/src/java/org/apache/cassandra/service/StorageService.java b/src/java/org/apache/cassandra/service/StorageService.java index 47fad4a7e09b..70f0f1720311 100644 --- a/src/java/org/apache/cassandra/service/StorageService.java +++ b/src/java/org/apache/cassandra/service/StorageService.java @@ -3553,12 +3553,18 @@ public void onRestart(InetAddressAndPort endpoint, EndpointState state) updateNetVersion(endpoint, netVersion); } - + @Override public String getLoadString() { return FileUtils.stringifyFileSize(StorageMetrics.load.getCount()); } + @Override + public String getUncompressedLoadString() + { + return FileUtils.stringifyFileSize(StorageMetrics.uncompressedLoad.getCount()); + } + public Map getLoadMapWithPort() { return getLoadMap(true); diff --git a/src/java/org/apache/cassandra/service/StorageServiceMBean.java b/src/java/org/apache/cassandra/service/StorageServiceMBean.java index 7e512cdaf7f6..d530dd6b3a3f 100644 --- a/src/java/org/apache/cassandra/service/StorageServiceMBean.java +++ b/src/java/org/apache/cassandra/service/StorageServiceMBean.java @@ -209,6 +209,9 @@ public interface StorageServiceMBean extends NotificationEmitter /** Human-readable load value */ public String getLoadString(); + /** Human-readable uncompressed load value */ + public String getUncompressedLoadString(); + /** Human-readable load value. Keys are IP addresses. 
*/ @Deprecated public Map getLoadMap(); public Map getLoadMapWithPort(); diff --git a/src/java/org/apache/cassandra/tools/NodeProbe.java b/src/java/org/apache/cassandra/tools/NodeProbe.java index daf4eb251834..60edad3d1fe8 100644 --- a/src/java/org/apache/cassandra/tools/NodeProbe.java +++ b/src/java/org/apache/cassandra/tools/NodeProbe.java @@ -770,6 +770,11 @@ public String getLoadString() return ssProxy.getLoadString(); } + public String getUncompressedLoadString() + { + return ssProxy.getUncompressedLoadString(); + } + public String getReleaseVersion() { return ssProxy.getReleaseVersion(); diff --git a/src/java/org/apache/cassandra/tools/nodetool/Info.java b/src/java/org/apache/cassandra/tools/nodetool/Info.java index 69661571a9eb..90c9e9d8bd88 100644 --- a/src/java/org/apache/cassandra/tools/nodetool/Info.java +++ b/src/java/org/apache/cassandra/tools/nodetool/Info.java @@ -51,6 +51,8 @@ public void execute(NodeProbe probe) out.printf("%-23s: %s%n", "Gossip active", gossipInitialized); out.printf("%-23s: %s%n", "Native Transport active", probe.isNativeTransportRunning()); out.printf("%-23s: %s%n", "Load", probe.getLoadString()); + out.printf("%-23s: %s%n", "Uncompressed load", probe.getUncompressedLoadString()); + if (gossipInitialized) out.printf("%-23s: %s%n", "Generation No", probe.getCurrentGenerationNumber()); else diff --git a/test/distributed/org/apache/cassandra/distributed/test/ClusterStorageUsageTest.java b/test/distributed/org/apache/cassandra/distributed/test/ClusterStorageUsageTest.java new file mode 100644 index 000000000000..9c379d1d5051 --- /dev/null +++ b/test/distributed/org/apache/cassandra/distributed/test/ClusterStorageUsageTest.java @@ -0,0 +1,165 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.cassandra.distributed.test; + +import org.junit.Test; + +import org.apache.cassandra.db.ColumnFamilyStore; +import org.apache.cassandra.db.Keyspace; +import org.apache.cassandra.distributed.api.IInvokableInstance; +import org.apache.cassandra.distributed.api.IIsolatedExecutor.SerializableFunction; +import org.apache.cassandra.metrics.DefaultNameFactory; + +import org.apache.cassandra.distributed.Cluster; +import org.apache.cassandra.distributed.api.ConsistencyLevel; + +import static org.assertj.core.api.Assertions.assertThat; + +/** + * This class verifies the behavior of our global disk usage metrics across different cluster and replication + * configurations. In addition, it verifies that they are properly exposed via the public metric registry. 
+ * + * Disk usage metrics are characterized by how they handle compression and replication: + * + * "compressed" -> indicates raw disk usage + * "uncompressed" -> indicates uncompressed file size (which is equivalent to "compressed" with no compression enabled) + * "replicated" -> includes disk usage for data outside the node's primary range + * "unreplicated" -> indicates disk usage scaled down by replication factor across the entire cluster + */ +public class ClusterStorageUsageTest extends TestBaseImpl +{ + private static final DefaultNameFactory FACTORY = new DefaultNameFactory("Storage"); + private static final int MUTATIONS = 1000; + + @Test + public void testNoReplication() throws Throwable + { + // With a replication factor of 1 for our only user keyspace, system keyspaces using local replication, and + // empty distributed system tables, replicated and unreplicated versions of our compressed and uncompressed + // metrics should be equivalent. + + try (Cluster cluster = init(builder().withNodes(2).start(), 1)) + { + populateUserKeyspace(cluster); + verifyLoadMetricsWithoutReplication(cluster.get(1)); + verifyLoadMetricsWithoutReplication(cluster.get(2)); + } + } + + private void verifyLoadMetricsWithoutReplication(IInvokableInstance node) + { + long compressedLoad = getLoad(node); + long uncompressedLoad = getUncompressedLoad(node); + assertThat(compressedLoad).isEqualTo(getUnreplicatedLoad(node)); + assertThat(uncompressedLoad).isEqualTo(getUnreplicatedUncompressedLoad(node)); + assertThat(uncompressedLoad).isGreaterThan(compressedLoad); + } + + @Test + public void testSimpleReplication() throws Throwable + { + // With a replication factor of 2 for our only user keyspace, disk space used by that keyspace should + // be scaled down by a factor of 2, while contributions from system keyspaces are unaffected. + + try (Cluster cluster = init(builder().withNodes(3).start(), 2)) + { + populateUserKeyspace(cluster); + + verifyLoadMetricsWithReplication(cluster.get(1)); + verifyLoadMetricsWithReplication(cluster.get(2)); + verifyLoadMetricsWithReplication(cluster.get(3)); + } + } + + @Test + public void testMultiDatacenterReplication() throws Throwable + { + // With a replication factor of 1 for our only user keyspace in two DCs, disk space used by that keyspace should + // be scaled down by a factor of 2, while contributions from system keyspaces are unaffected. 
+ + try (Cluster cluster = builder().withDC("DC1", 2).withDC("DC2", 2).start()) + { + cluster.schemaChange("CREATE KEYSPACE " + KEYSPACE + " WITH replication = {'class': 'NetworkTopologyStrategy', 'DC1': 1, 'DC2': 1};"); + populateUserKeyspace(cluster); + + verifyLoadMetricsWithReplication(cluster.get(1)); + verifyLoadMetricsWithReplication(cluster.get(2)); + verifyLoadMetricsWithReplication(cluster.get(3)); + verifyLoadMetricsWithReplication(cluster.get(4)); + } + } + + private void populateUserKeyspace(Cluster cluster) + { + cluster.schemaChange(withKeyspace("CREATE TABLE %s.tbl (pk int, v text, PRIMARY KEY (pk));")); + + for (int i = 0; i < MUTATIONS; i++) { + cluster.coordinator(1).execute(withKeyspace("INSERT INTO %s.tbl (pk, v) VALUES (?,?)"), ConsistencyLevel.ALL, i, "compressable"); + } + + cluster.forEach((i) -> i.flush(KEYSPACE)); + } + + private void verifyLoadMetricsWithReplication(IInvokableInstance node) + { + long unreplicatedLoad = getUnreplicatedLoad(node); + long expectedUnreplicatedLoad = computeUnreplicatedMetric(node, table -> table.metric.liveDiskSpaceUsed.getCount()); + assertThat(expectedUnreplicatedLoad).isEqualTo(unreplicatedLoad); + assertThat(getLoad(node)).isGreaterThan(unreplicatedLoad); + + long unreplicatedUncompressedLoad = getUnreplicatedUncompressedLoad(node); + long expectedUnreplicatedUncompressedLoad = computeUnreplicatedMetric(node, table -> table.metric.uncompressedLiveDiskSpaceUsed.getCount()); + assertThat(expectedUnreplicatedUncompressedLoad).isEqualTo(unreplicatedUncompressedLoad); + assertThat(getUncompressedLoad(node)).isGreaterThan(unreplicatedUncompressedLoad); + } + + private long getLoad(IInvokableInstance node) + { + return node.metrics().getCounter(FACTORY.createMetricName("Load").getMetricName()); + } + + private long getUncompressedLoad(IInvokableInstance node1) + { + return node1.metrics().getCounter(FACTORY.createMetricName("UncompressedLoad").getMetricName()); + } + + private long getUnreplicatedLoad(IInvokableInstance node) + { + return (Long) node.metrics().getGauge(FACTORY.createMetricName("UnreplicatedLoad").getMetricName()); + } + + private long getUnreplicatedUncompressedLoad(IInvokableInstance node) + { + return (Long) node.metrics().getGauge(FACTORY.createMetricName("UnreplicatedUncompressedLoad").getMetricName()); + } + + private long computeUnreplicatedMetric(IInvokableInstance node, SerializableFunction metric) + { + return node.callOnInstance(() -> + { + long sum = 0; + + for (Keyspace keyspace : Keyspace.all()) + for (ColumnFamilyStore table : keyspace.getColumnFamilyStores()) + sum += metric.apply(table) / keyspace.getReplicationStrategy().getReplicationFactor().fullReplicas; + + return sum; + }); + } +} \ No newline at end of file diff --git a/test/distributed/org/apache/cassandra/distributed/test/NodeToolTest.java b/test/distributed/org/apache/cassandra/distributed/test/NodeToolTest.java index 9087f9629606..486d9f5618f8 100644 --- a/test/distributed/org/apache/cassandra/distributed/test/NodeToolTest.java +++ b/test/distributed/org/apache/cassandra/distributed/test/NodeToolTest.java @@ -116,4 +116,21 @@ public void testSetCacheCapacityWhenDisabled() throws Throwable ringResult.asserts().stderrContains("is not permitted as this cache is disabled"); } } + + @Test + public void testInfoOutput() throws Throwable + { + try (ICluster cluster = init(builder().withNodes(1).start())) + { + NodeToolResult ringResult = cluster.get(1).nodetoolResult("info"); + ringResult.asserts().stdoutContains("ID"); + 
ringResult.asserts().stdoutContains("Gossip active"); + ringResult.asserts().stdoutContains("Native Transport active"); + ringResult.asserts().stdoutContains("Load"); + ringResult.asserts().stdoutContains("Uncompressed load"); + ringResult.asserts().stdoutContains("Generation"); + ringResult.asserts().stdoutContains("Uptime"); + ringResult.asserts().stdoutContains("Heap Memory"); + } + } } diff --git a/test/unit/org/apache/cassandra/io/DiskSpaceMetricsTest.java b/test/unit/org/apache/cassandra/io/DiskSpaceMetricsTest.java index be8b16276668..f07936593aa0 100644 --- a/test/unit/org/apache/cassandra/io/DiskSpaceMetricsTest.java +++ b/test/unit/org/apache/cassandra/io/DiskSpaceMetricsTest.java @@ -21,6 +21,7 @@ import java.io.IOException; import java.util.List; import java.util.Map; +import java.util.Set; import java.util.concurrent.atomic.AtomicInteger; import com.google.common.collect.ImmutableMap; @@ -38,9 +39,13 @@ import org.apache.cassandra.io.sstable.IndexSummaryManager; import org.apache.cassandra.io.sstable.IndexSummaryRedistribution; import org.apache.cassandra.io.sstable.format.SSTableReader; +import org.apache.cassandra.metrics.StorageMetrics; import org.apache.cassandra.schema.TableId; import org.apache.cassandra.utils.FBUtilities; +import static org.assertj.core.api.Assertions.assertThat; +import static org.junit.Assert.assertEquals; + public class DiskSpaceMetricsTest extends CQLTester { /** @@ -102,18 +107,36 @@ private void insertN(ColumnFamilyStore cfs, int n, long base) throws Throwable private void assertDiskSpaceEqual(ColumnFamilyStore cfs) { + Set liveSSTables = cfs.getTracker().getView().liveSSTables(); long liveDiskSpaceUsed = cfs.metric.liveDiskSpaceUsed.getCount(); - long actual = 0; - for (SSTableReader sstable : cfs.getTracker().getView().liveSSTables()) - actual += sstable.bytesOnDisk(); + long actual = liveSSTables.stream().mapToLong(SSTableReader::bytesOnDisk).sum(); + long uncompressedLiveDiskSpaceUsed = cfs.metric.uncompressedLiveDiskSpaceUsed.getCount(); + long actualUncompressed = liveSSTables.stream().mapToLong(SSTableReader::logicalBytesOnDisk).sum(); + + assertEquals("bytes on disk does not match current metric LiveDiskSpaceUsed", actual, liveDiskSpaceUsed); + assertEquals("bytes on disk does not match current metric UncompressedLiveDiskSpaceUsed", actualUncompressed, uncompressedLiveDiskSpaceUsed); + + // Keyspace-level metrics should be equivalent to table-level metrics, as there is only one table. + assertEquals(cfs.keyspace.metric.liveDiskSpaceUsed.getValue().longValue(), liveDiskSpaceUsed); + assertEquals(cfs.keyspace.metric.uncompressedLiveDiskSpaceUsed.getValue().longValue(), uncompressedLiveDiskSpaceUsed); + assertEquals(cfs.keyspace.metric.unreplicatedLiveDiskSpaceUsed.getValue().longValue(), liveDiskSpaceUsed); + assertEquals(cfs.keyspace.metric.unreplicatedUncompressedLiveDiskSpaceUsed.getValue().longValue(), uncompressedLiveDiskSpaceUsed); + + // Global load metrics should be internally consistent, given there is no replication, but slightly greater + // than table and keyspace-level metrics, given the global versions account for non-user tables. 
+ long globalLoad = StorageMetrics.load.getCount(); + assertEquals(globalLoad, StorageMetrics.unreplicatedLoad.getValue().longValue()); + assertThat(globalLoad).isGreaterThan(liveDiskSpaceUsed); - Assert.assertEquals("bytes on disk does not match current metric liveDiskSpaceUsed", actual, liveDiskSpaceUsed); + long globalUncompressedLoad = StorageMetrics.uncompressedLoad.getCount(); + assertEquals(globalUncompressedLoad, StorageMetrics.unreplicatedUncompressedLoad.getValue().longValue()); + assertThat(globalUncompressedLoad).isGreaterThan(uncompressedLiveDiskSpaceUsed); // totalDiskSpaceUsed is based off SStable delete, which is async: LogTransaction's tidy enqueues in ScheduledExecutors.nonPeriodicTasks // wait for there to be no more pending sstable releases LifecycleTransaction.waitForDeletions(); long totalDiskSpaceUsed = cfs.metric.totalDiskSpaceUsed.getCount(); - Assert.assertEquals("bytes on disk does not match current metric totalDiskSpaceUsed", actual, totalDiskSpaceUsed); + assertEquals("bytes on disk does not match current metric totalDiskSpaceUsed", actual, totalDiskSpaceUsed); } private static void indexDownsampleCancelLastSSTable(ColumnFamilyStore cfs) diff --git a/test/unit/org/apache/cassandra/io/sstable/IndexSummaryRedistributionTest.java b/test/unit/org/apache/cassandra/io/sstable/IndexSummaryRedistributionTest.java index 57e4d4e4bd4c..9c121bf7bfcf 100644 --- a/test/unit/org/apache/cassandra/io/sstable/IndexSummaryRedistributionTest.java +++ b/test/unit/org/apache/cassandra/io/sstable/IndexSummaryRedistributionTest.java @@ -32,6 +32,7 @@ import org.apache.cassandra.db.ColumnFamilyStore; import org.apache.cassandra.db.Keyspace; import org.apache.cassandra.db.RowUpdateBuilder; +import org.apache.cassandra.db.commitlog.CommitLogPosition; import org.apache.cassandra.exceptions.ConfigurationException; import org.apache.cassandra.io.sstable.format.SSTableReader; import org.apache.cassandra.metrics.RestorableMeter; @@ -68,8 +69,12 @@ public void testMetricsLoadAfterRedistribution() throws IOException ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(cfname); int numSSTables = 1; int numRows = 1024 * 10; + long load = StorageMetrics.load.getCount(); StorageMetrics.load.dec(load); // reset the load metric + long uncompressedLoad = StorageMetrics.uncompressedLoad.getCount(); + StorageMetrics.uncompressedLoad.dec(uncompressedLoad); // reset the uncompressed load metric + createSSTables(ksname, cfname, numSSTables, numRows); List sstables = new ArrayList<>(cfs.getLiveSSTables()); @@ -77,32 +82,45 @@ public void testMetricsLoadAfterRedistribution() throws IOException sstable.overrideReadMeter(new RestorableMeter(100.0, 100.0)); long oldSize = 0; + long oldSizeUncompressed = 0; + for (SSTableReader sstable : sstables) { assertEquals(cfs.metadata().params.minIndexInterval, sstable.getEffectiveIndexInterval(), 0.001); oldSize += sstable.bytesOnDisk(); + oldSizeUncompressed += sstable.logicalBytesOnDisk(); } load = StorageMetrics.load.getCount(); long others = load - oldSize; // Other SSTables size, e.g. 
schema and other system SSTables + uncompressedLoad = StorageMetrics.uncompressedLoad.getCount(); + long othersUncompressed = uncompressedLoad - oldSizeUncompressed; + int originalMinIndexInterval = cfs.metadata().params.minIndexInterval; // double the min_index_interval SchemaTestUtil.announceTableUpdate(cfs.metadata().unbuild().minIndexInterval(originalMinIndexInterval * 2).build()); IndexSummaryManager.instance.redistributeSummaries(); long newSize = 0; + long newSizeUncompressed = 0; + for (SSTableReader sstable : cfs.getLiveSSTables()) { assertEquals(cfs.metadata().params.minIndexInterval, sstable.getEffectiveIndexInterval(), 0.001); assertEquals(numRows / cfs.metadata().params.minIndexInterval, sstable.getIndexSummarySize()); newSize += sstable.bytesOnDisk(); + newSizeUncompressed += sstable.logicalBytesOnDisk(); } + newSize += others; load = StorageMetrics.load.getCount(); - // new size we calculate should be almost the same as the load in metrics - assertEquals(newSize, load, newSize / 10); + assertEquals(newSize, load, newSize / 10.0); + + newSizeUncompressed += othersUncompressed; + uncompressedLoad = StorageMetrics.uncompressedLoad.getCount(); + assertEquals(newSizeUncompressed, uncompressedLoad, newSizeUncompressed / 10.0); } private void createSSTables(String ksname, String cfname, int numSSTables, int numRows) @@ -112,7 +130,7 @@ private void createSSTables(String ksname, String cfname, int numSSTables, int n cfs.truncateBlocking(); cfs.disableAutoCompaction(); - ArrayList futures = new ArrayList<>(numSSTables); + ArrayList> futures = new ArrayList<>(numSSTables); ByteBuffer value = ByteBuffer.wrap(new byte[100]); for (int sstable = 0; sstable < numSSTables; sstable++) { @@ -127,7 +145,7 @@ private void createSSTables(String ksname, String cfname, int numSSTables, int n } futures.add(cfs.forceFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS)); } - for (Future future : futures) + for (Future future : futures) { try { diff --git a/test/unit/org/apache/cassandra/io/sstable/SSTableRewriterTest.java b/test/unit/org/apache/cassandra/io/sstable/SSTableRewriterTest.java index c88e0b09f732..363a0001868a 100644 --- a/test/unit/org/apache/cassandra/io/sstable/SSTableRewriterTest.java +++ b/test/unit/org/apache/cassandra/io/sstable/SSTableRewriterTest.java @@ -205,7 +205,9 @@ public void testNumberOfFilesAndSizes() SSTableReader s = writeFile(cfs, 1000); cfs.addSSTable(s); long startStorageMetricsLoad = StorageMetrics.load.getCount(); + long startUncompressedLoad = StorageMetrics.uncompressedLoad.getCount(); long sBytesOnDisk = s.bytesOnDisk(); + long sBytesOnDiskUncompressed = s.logicalBytesOnDisk(); Set compacting = Sets.newHashSet(s); List sstables; @@ -236,11 +238,15 @@ public void testNumberOfFilesAndSizes() LifecycleTransaction.waitForDeletions(); - long sum = 0; - for (SSTableReader x : cfs.getLiveSSTables()) - sum += x.bytesOnDisk(); + long sum = cfs.getLiveSSTables().stream().mapToLong(SSTableReader::bytesOnDisk).sum(); assertEquals(sum, cfs.metric.liveDiskSpaceUsed.getCount()); - assertEquals(startStorageMetricsLoad - sBytesOnDisk + sum, StorageMetrics.load.getCount()); + long endLoad = StorageMetrics.load.getCount(); + assertEquals(startStorageMetricsLoad - sBytesOnDisk + sum, endLoad); + + long uncompressedSum = cfs.getLiveSSTables().stream().mapToLong(t -> t.logicalBytesOnDisk()).sum(); + long endUncompressedLoad = StorageMetrics.uncompressedLoad.getCount(); + assertEquals(startUncompressedLoad - sBytesOnDiskUncompressed + uncompressedSum, endUncompressedLoad); + 
assertEquals(files, sstables.size()); assertEquals(files, cfs.getLiveSSTables().size()); LifecycleTransaction.waitForDeletions(); From ff5f4833aa3e11fcffd6bff1b15597fd5a38b864 Mon Sep 17 00:00:00 2001 From: Josh McKenzie Date: Thu, 15 Sep 2022 13:56:00 -0400 Subject: [PATCH 066/159] Prevent NullPointerException when changing neverPurgeTombstones from true to false Patch by Marcus Eriksson; reviewed by Caleb Rackliffe and Josh McKenzie for CASSANDRA-17897 Co-authored-by: Marcus Eriksson Co-authored-by: Josh McKenzie --- CHANGES.txt | 1 + .../db/compaction/CompactionController.java | 16 +++------ .../compaction/CompactionControllerTest.java | 33 +++++++++++++++++++ 3 files changed, 39 insertions(+), 11 deletions(-) diff --git a/CHANGES.txt b/CHANGES.txt index 0439b08b82b0..10830cedb0b9 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -1,4 +1,5 @@ 4.2 + * Prevent NullPointerException when changing neverPurgeTombstones from true to false (CASSANDRA-17897) * Add metrics around storage usage and compression (CASSANDRA-17898) * Remove usage of deprecated javax certificate classes (CASSANDRA-17867) * Make sure preview repairs don't optimise streams unless configured to (CASSANDRA-17865) diff --git a/src/java/org/apache/cassandra/db/compaction/CompactionController.java b/src/java/org/apache/cassandra/db/compaction/CompactionController.java index 26dcdd39a60e..31b097fc5b5c 100644 --- a/src/java/org/apache/cassandra/db/compaction/CompactionController.java +++ b/src/java/org/apache/cassandra/db/compaction/CompactionController.java @@ -105,14 +105,8 @@ public void maybeRefreshOverlaps() return; } - for (SSTableReader reader : overlappingSSTables) - { - if (reader.isMarkedCompacted()) - { - refreshOverlaps(); - return; - } - } + if (overlappingSSTables == null || overlappingSSTables.stream().anyMatch(SSTableReader::isMarkedCompacted)) + refreshOverlaps(); } private void refreshOverlaps() @@ -160,8 +154,8 @@ public static Set getFullyExpiredSSTables(ColumnFamilyStore cfSto { logger.trace("Checking droppable sstables in {}", cfStore); - if (NEVER_PURGE_TOMBSTONES || compacting == null || cfStore.getNeverPurgeTombstones()) - return Collections.emptySet(); + if (NEVER_PURGE_TOMBSTONES || compacting == null || cfStore.getNeverPurgeTombstones() || overlapping == null) + return Collections.emptySet(); if (cfStore.getCompactionStrategyManager().onlyPurgeRepairedTombstones() && !Iterables.all(compacting, SSTableReader::isRepaired)) return Collections.emptySet(); @@ -243,7 +237,7 @@ public static Set getFullyExpiredSSTables(ColumnFamilyStore cfSto @Override public LongPredicate getPurgeEvaluator(DecoratedKey key) { - if (NEVER_PURGE_TOMBSTONES || !compactingRepaired() || cfs.getNeverPurgeTombstones()) + if (NEVER_PURGE_TOMBSTONES || !compactingRepaired() || cfs.getNeverPurgeTombstones() || overlapIterator == null) return time -> false; overlapIterator.update(key); diff --git a/test/unit/org/apache/cassandra/db/compaction/CompactionControllerTest.java b/test/unit/org/apache/cassandra/db/compaction/CompactionControllerTest.java index 9d81b61ed377..ce9b28ad9efe 100644 --- a/test/unit/org/apache/cassandra/db/compaction/CompactionControllerTest.java +++ b/test/unit/org/apache/cassandra/db/compaction/CompactionControllerTest.java @@ -204,4 +204,37 @@ private void assertPurgeBoundary(LongPredicate evaluator, long boundary) assertFalse(evaluator.test(boundary)); assertTrue(evaluator.test(boundary - 1)); } + + @Test + public void testDisableNeverPurgeTombstones() + { + Keyspace keyspace = Keyspace.open(KEYSPACE); + 
ColumnFamilyStore cfs = keyspace.getColumnFamilyStore(CF2); + cfs.truncateBlocking(); + + DecoratedKey key = Util.dk("k1"); + long timestamp = System.currentTimeMillis(); + applyMutation(cfs.metadata(), key, timestamp); + cfs.forceBlockingFlush(ColumnFamilyStore.FlushReason.UNIT_TESTS); + Set toCompact = Sets.newHashSet(cfs.getLiveSSTables()); + cfs.setNeverPurgeTombstones(true); + applyMutation(cfs.metadata(), key, timestamp + 1); + + try (CompactionController cc = new CompactionController(cfs, toCompact, (int)(System.currentTimeMillis()/1000))) + { + assertFalse(cc.getPurgeEvaluator(key).test(timestamp)); + assertFalse(cc.getPurgeEvaluator(key).test(timestamp + 1)); + assertTrue(cc.getFullyExpiredSSTables().isEmpty()); + + cfs.setNeverPurgeTombstones(false); + assertFalse(cc.getPurgeEvaluator(key).test(timestamp)); + assertFalse(cc.getPurgeEvaluator(key).test(timestamp + 1)); + assertTrue(cc.getFullyExpiredSSTables().isEmpty()); + + cc.maybeRefreshOverlaps(); + assertTrue(cc.getPurgeEvaluator(key).test(timestamp)); + assertFalse(cc.getPurgeEvaluator(key).test(timestamp + 1)); + assertTrue(cc.getFullyExpiredSSTables().isEmpty()); + } + } } From e89b214d069321c8968871b8eb7d51d4dfba7c33 Mon Sep 17 00:00:00 2001 From: Josh McKenzie Date: Tue, 13 Sep 2022 12:48:00 -0400 Subject: [PATCH 067/159] Allow disabling hotness persistence for high sstable counts Patch by Caleb Rackliffe; reviewed by Chris Lohfink and Josh McKenzie for CASSANDRA-17868 Co-authored-by: Caleb Rackliffe Co-authored-by: Josh McKenzie --- CHANGES.txt | 1 + .../org/apache/cassandra/config/Config.java | 2 + .../cassandra/config/DatabaseDescriptor.java | 14 ++++ .../apache/cassandra/db/SystemKeyspace.java | 11 +++- .../io/sstable/format/SSTableReader.java | 25 +++++--- .../cassandra/service/StorageProxy.java | 12 ++++ .../cassandra/service/StorageProxyMBean.java | 3 + .../io/sstable/SSTableReaderTest.java | 64 ++++++++++++++----- 8 files changed, 103 insertions(+), 29 deletions(-) diff --git a/CHANGES.txt b/CHANGES.txt index 10830cedb0b9..49858daa9ab8 100644 --- a/CHANGES.txt +++ b/CHANGES.txt @@ -1,4 +1,5 @@ 4.2 + * Allow disabling hotness persistence for high sstable counts (CASSANDRA-17868) * Prevent NullPointerException when changing neverPurgeTombstones from true to false (CASSANDRA-17897) * Add metrics around storage usage and compression (CASSANDRA-17898) * Remove usage of deprecated javax certificate classes (CASSANDRA-17867) diff --git a/src/java/org/apache/cassandra/config/Config.java b/src/java/org/apache/cassandra/config/Config.java index c3a406b455d2..168b62a32a8f 100644 --- a/src/java/org/apache/cassandra/config/Config.java +++ b/src/java/org/apache/cassandra/config/Config.java @@ -1052,6 +1052,8 @@ public enum PaxosOnLinearizabilityViolation */ public volatile int paxos_repair_parallelism = -1; + public volatile boolean sstable_read_rate_persistence_enabled = false; + public volatile int max_top_size_partition_count = 10; public volatile int max_top_tombstone_partition_count = 10; public volatile DataStorageSpec.LongBytesBound min_tracked_partition_size = new DataStorageSpec.LongBytesBound("1MiB"); diff --git a/src/java/org/apache/cassandra/config/DatabaseDescriptor.java b/src/java/org/apache/cassandra/config/DatabaseDescriptor.java index 482e95fa7560..d38bc46b2dc8 100644 --- a/src/java/org/apache/cassandra/config/DatabaseDescriptor.java +++ b/src/java/org/apache/cassandra/config/DatabaseDescriptor.java @@ -4496,4 +4496,18 @@ public static void setDumpHeapOnUncaughtException(boolean enabled) else 
logger.info("Setting dump_heap_on_uncaught_exception to {}", enabled); } + + public static boolean getSStableReadRatePersistenceEnabled() + { + return conf.sstable_read_rate_persistence_enabled; + } + + public static void setSStableReadRatePersistenceEnabled(boolean enabled) + { + if (enabled != conf.sstable_read_rate_persistence_enabled) + { + logger.info("Setting sstable_read_rate_persistence_enabled to {}", enabled); + conf.sstable_read_rate_persistence_enabled = enabled; + } + } } diff --git a/src/java/org/apache/cassandra/db/SystemKeyspace.java b/src/java/org/apache/cassandra/db/SystemKeyspace.java index 4e11c93d9336..533d35ee2c29 100644 --- a/src/java/org/apache/cassandra/db/SystemKeyspace.java +++ b/src/java/org/apache/cassandra/db/SystemKeyspace.java @@ -81,6 +81,7 @@ import org.apache.cassandra.dht.Range; import org.apache.cassandra.dht.Token; import org.apache.cassandra.exceptions.ConfigurationException; +import org.apache.cassandra.io.sstable.SSTable; import org.apache.cassandra.io.sstable.SSTableId; import org.apache.cassandra.io.sstable.SequenceBasedSSTableId; import org.apache.cassandra.io.util.DataInputBuffer; @@ -1476,8 +1477,7 @@ public static PaxosRepairHistory loadPaxosRepairHistory(String keyspace, String */ public static RestorableMeter getSSTableReadMeter(String keyspace, String table, SSTableId id) { - String cql = "SELECT * FROM system.%s WHERE keyspace_name=? and table_name=? and id=?"; - UntypedResultSet results = executeInternal(format(cql, SSTABLE_ACTIVITY_V2), keyspace, table, id.toString()); + UntypedResultSet results = readSSTableActivity(keyspace, table, id); if (results.isEmpty()) return new RestorableMeter(); @@ -1488,6 +1488,13 @@ public static RestorableMeter getSSTableReadMeter(String keyspace, String table, return new RestorableMeter(m15rate, m120rate); } + @VisibleForTesting + public static UntypedResultSet readSSTableActivity(String keyspace, String table, SSTableId id) + { + String cql = "SELECT * FROM system.%s WHERE keyspace_name=? and table_name=? 
and id=?"; + return executeInternal(format(cql, SSTABLE_ACTIVITY_V2), keyspace, table, id.toString()); + } + /** * Writes the current read rates for a given SSTable to system.sstable_activity */ diff --git a/src/java/org/apache/cassandra/io/sstable/format/SSTableReader.java b/src/java/org/apache/cassandra/io/sstable/format/SSTableReader.java index d7dad42b1674..fe8c537b61e3 100644 --- a/src/java/org/apache/cassandra/io/sstable/format/SSTableReader.java +++ b/src/java/org/apache/cassandra/io/sstable/format/SSTableReader.java @@ -2160,17 +2160,16 @@ void ensureReadMeter() readMeter = SystemKeyspace.getSSTableReadMeter(desc.ksname, desc.cfname, desc.id); // sync the average read rate to system.sstable_activity every five minutes, starting one minute from now - readMeterSyncFuture = new WeakReference<>(syncExecutor.scheduleAtFixedRate(new Runnable() + readMeterSyncFuture = new WeakReference<>(syncExecutor.scheduleAtFixedRate(this::maybePersistSSTableReadMeter, 1, 5, TimeUnit.MINUTES)); + } + + void maybePersistSSTableReadMeter() + { + if (obsoletion == null && DatabaseDescriptor.getSStableReadRatePersistenceEnabled()) { - public void run() - { - if (obsoletion == null) - { - meterSyncThrottle.acquire(); - SystemKeyspace.persistSSTableReadMeter(desc.ksname, desc.cfname, desc.id, readMeter); - } - } - }, 1, 5, TimeUnit.MINUTES)); + meterSyncThrottle.acquire(); + SystemKeyspace.persistSSTableReadMeter(desc.ksname, desc.cfname, desc.id, readMeter); + } } private void stopReadMeterPersistence() @@ -2389,4 +2388,10 @@ private long bytesOnDisk(boolean logical) } return bytes; } + + @VisibleForTesting + public void maybePersistSSTableReadMeter() + { + tidy.global.maybePersistSSTableReadMeter(); + } } diff --git a/src/java/org/apache/cassandra/service/StorageProxy.java b/src/java/org/apache/cassandra/service/StorageProxy.java index 2bf140fc2949..4a66b511be7a 100644 --- a/src/java/org/apache/cassandra/service/StorageProxy.java +++ b/src/java/org/apache/cassandra/service/StorageProxy.java @@ -3179,4 +3179,16 @@ public void setDumpHeapOnUncaughtException(boolean enabled) { DatabaseDescriptor.setDumpHeapOnUncaughtException(enabled); } + + @Override + public boolean getSStableReadRatePersistenceEnabled() + { + return DatabaseDescriptor.getSStableReadRatePersistenceEnabled(); + } + + @Override + public void setSStableReadRatePersistenceEnabled(boolean enabled) + { + DatabaseDescriptor.setSStableReadRatePersistenceEnabled(enabled); + } } diff --git a/src/java/org/apache/cassandra/service/StorageProxyMBean.java b/src/java/org/apache/cassandra/service/StorageProxyMBean.java index 5d7bc6956909..4a3adfd5bd4b 100644 --- a/src/java/org/apache/cassandra/service/StorageProxyMBean.java +++ b/src/java/org/apache/cassandra/service/StorageProxyMBean.java @@ -138,4 +138,7 @@ public interface StorageProxyMBean public boolean getDumpHeapOnUncaughtException(); public void setDumpHeapOnUncaughtException(boolean enabled); + + boolean getSStableReadRatePersistenceEnabled(); + void setSStableReadRatePersistenceEnabled(boolean enabled); } diff --git a/test/unit/org/apache/cassandra/io/sstable/SSTableReaderTest.java b/test/unit/org/apache/cassandra/io/sstable/SSTableReaderTest.java index f064f19fd92f..95675410484b 100644 --- a/test/unit/org/apache/cassandra/io/sstable/SSTableReaderTest.java +++ b/test/unit/org/apache/cassandra/io/sstable/SSTableReaderTest.java @@ -26,7 +26,7 @@ import java.util.stream.Stream; import com.google.common.collect.Sets; -import org.apache.cassandra.io.util.File; + import org.junit.BeforeClass; import 
diff --git a/test/unit/org/apache/cassandra/io/sstable/SSTableReaderTest.java b/test/unit/org/apache/cassandra/io/sstable/SSTableReaderTest.java
index f064f19fd92f..95675410484b 100644
--- a/test/unit/org/apache/cassandra/io/sstable/SSTableReaderTest.java
+++ b/test/unit/org/apache/cassandra/io/sstable/SSTableReaderTest.java
@@ -26,7 +26,7 @@
 import java.util.stream.Stream;
 
 import com.google.common.collect.Sets;
-import org.apache.cassandra.io.util.File;
+
 import org.junit.BeforeClass;
 import org.junit.Rule;
 import org.junit.Test;
@@ -34,7 +34,9 @@
 
 import org.apache.cassandra.SchemaLoader;
 import org.apache.cassandra.Util;
+import org.apache.cassandra.config.DatabaseDescriptor;
 import org.apache.cassandra.cql3.Operator;
+import org.apache.cassandra.cql3.UntypedResultSet;
 import org.apache.cassandra.db.*;
 import org.apache.cassandra.db.compaction.CompactionManager;
 import org.apache.cassandra.db.compaction.OperationType;
@@ -50,6 +52,7 @@
 import org.apache.cassandra.index.Index;
 import org.apache.cassandra.io.FSReadError;
 import org.apache.cassandra.io.sstable.format.SSTableReader;
+import org.apache.cassandra.io.util.File;
 import org.apache.cassandra.io.util.FileDataInput;
 import org.apache.cassandra.io.util.MmappedRegions;
 import org.apache.cassandra.schema.CachingParams;
@@ -59,6 +62,7 @@
 import org.apache.cassandra.utils.ByteBufferUtil;
 import org.apache.cassandra.utils.FilterFactory;
 
+import static java.lang.String.format;
 import static org.apache.cassandra.cql3.QueryProcessor.executeInternal;
 import static org.junit.Assert.assertEquals;
 import static org.junit.Assert.assertFalse;
@@ -178,7 +182,7 @@ public void testSpannedIndexPositions() throws IOException
             DecoratedKey dk = Util.dk(String.valueOf(j));
             FileDataInput file = sstable.getFileDataInput(sstable.getPosition(dk, SSTableReader.Operator.EQ).position);
             DecoratedKey keyInDisk = sstable.decorateKey(ByteBufferUtil.readWithShortLength(file));
-            assert keyInDisk.equals(dk) : String.format("%s != %s in %s", keyInDisk, dk, file.getPath());
+            assert keyInDisk.equals(dk) : format("%s != %s in %s", keyInDisk, dk, file.getPath());
         }
 
         // check no false positives
@@ -238,15 +242,41 @@ public void testReadRateTracking()
 
         Util.flush(store);
 
-        SSTableReader sstable = store.getLiveSSTables().iterator().next();
-        assertEquals(0, sstable.getReadMeter().count());
+        boolean startState = DatabaseDescriptor.getSStableReadRatePersistenceEnabled();
+        try
+        {
+            DatabaseDescriptor.setSStableReadRatePersistenceEnabled(true);
+
+            SSTableReader sstable = store.getLiveSSTables().iterator().next();
+            assertEquals(0, sstable.getReadMeter().count());
 
-        DecoratedKey key = sstable.decorateKey(ByteBufferUtil.bytes("4"));
-        Util.getAll(Util.cmd(store, key).build());
-        assertEquals(1, sstable.getReadMeter().count());
+            DecoratedKey key = sstable.decorateKey(ByteBufferUtil.bytes("4"));
+            Util.getAll(Util.cmd(store, key).build());
+            assertEquals(1, sstable.getReadMeter().count());
 
-        Util.getAll(Util.cmd(store, key).includeRow("0").build());
-        assertEquals(2, sstable.getReadMeter().count());
+            Util.getAll(Util.cmd(store, key).includeRow("0").build());
+            assertEquals(2, sstable.getReadMeter().count());
+
+            // With persistence enabled, we should be able to retrieve the state of the meter.
+            sstable.maybePersistSSTableReadMeter();
+
+            UntypedResultSet meter = SystemKeyspace.readSSTableActivity(store.keyspace.getName(), store.name, sstable.descriptor.id);
+            assertFalse(meter.isEmpty());
+
+            Util.getAll(Util.cmd(store, key).includeRow("0").build());
+            assertEquals(3, sstable.getReadMeter().count());
+
+            // After cleaning existing state and disabling persistence, there should be no meter state to read.
+            SystemKeyspace.clearSSTableReadMeter(store.keyspace.getName(), store.name, sstable.descriptor.id);
+            DatabaseDescriptor.setSStableReadRatePersistenceEnabled(false);
+            sstable.maybePersistSSTableReadMeter();
+            meter = SystemKeyspace.readSSTableActivity(store.keyspace.getName(), store.name, sstable.descriptor.id);
+            assertTrue(meter.isEmpty());
+        }
+        finally
+        {
+            DatabaseDescriptor.setSStableReadRatePersistenceEnabled(startState);
+        }
     }
 
     @Test
@@ -432,7 +462,7 @@ public void testOpeningSSTable() throws Exception
         assert target.first.equals(firstKey);
         assert target.last.equals(lastKey);
 
-        executeInternal(String.format("ALTER TABLE \"%s\".\"%s\" WITH bloom_filter_fp_chance = 0.3", ks, cf));
+        executeInternal(format("ALTER TABLE \"%s\".\"%s\" WITH bloom_filter_fp_chance = 0.3", ks, cf));
 
         File summaryFile = new File(desc.filenameFor(Component.SUMMARY));
         Path bloomPath = new File(desc.filenameFor(Component.FILTER)).toPath();
@@ -613,9 +643,9 @@ public void testIndexSummaryReplacement() throws IOException, ExecutionException
         final int NUM_PARTITIONS = 512;
         for (int j = 0; j < NUM_PARTITIONS; j++)
         {
-            new RowUpdateBuilder(store.metadata(), j, String.format("%3d", j))
+            new RowUpdateBuilder(store.metadata(), j, format("%3d", j))
             .clustering("0")
-            .add("val", String.format("%3d", j))
+            .add("val", format("%3d", j))
             .build()
             .applyUnsafe();
 
@@ -631,7 +661,7 @@ public void testIndexSummaryReplacement() throws IOException, ExecutionException
         List<Future<?>> futures = new ArrayList<>(NUM_PARTITIONS * 2);
         for (int i = 0; i < NUM_PARTITIONS; i++)
         {
-            final ByteBuffer key = ByteBufferUtil.bytes(String.format("%3d", i));
+            final ByteBuffer key = ByteBufferUtil.bytes(format("%3d", i));
             final int index = i;
 
             futures.add(executor.submit(new Runnable()
@@ -639,7 +669,7 @@ public void testIndexSummaryReplacement() throws IOException, ExecutionException
                 public void run()
                 {
                     Row row = Util.getOnlyRowUnfiltered(Util.cmd(store, key).build());
-                    assertEquals(0, ByteBufferUtil.compare(String.format("%3d", index).getBytes(), row.cells().iterator().next().buffer()));
+                    assertEquals(0, ByteBufferUtil.compare(format("%3d", index).getBytes(), row.cells().iterator().next().buffer()));
                 }
             }));
 
@@ -690,9 +720,9 @@ private void testIndexSummaryUpsampleAndReload0() throws Exception
         final int NUM_PARTITIONS = 512;
         for (int j = 0; j < NUM_PARTITIONS; j++)
         {
-            new RowUpdateBuilder(store.metadata(), j, String.format("%3d", j))
+            new RowUpdateBuilder(store.metadata(), j, format("%3d", j))
             .clustering("0")
-            .add("val", String.format("%3d", j))
+            .add("val", format("%3d", j))
             .build()
             .applyUnsafe();
 
@@ -791,7 +821,7 @@ public void testMoveAndOpenSSTable() throws IOException
         {
             File f = new File(notLiveDesc.filenameFor(c));
             assertTrue(f.exists());
-            assertTrue(f.toString().contains(String.format("-%s-", id)));
+            assertTrue(f.toString().contains(format("-%s-", id)));
             f.deleteOnExit();
             assertFalse(new File(sstable.descriptor.filenameFor(c)).exists());
         }
From 85f113bce38278653ffca5139cd7871aee7fe786 Mon Sep 17 00:00:00 2001
From: "Claude Warren, Jr"
Date: Fri, 19 Aug 2022 07:42:25 +0100
Subject: [PATCH 068/159] Add pull request template and modify README to
 include Jira and mailing list link

patch by claudenw; reviewed by dritfx, dcapwell, josh-mckenzie,
 michaelsembwever and smiklosovic for CASSANDRA-17837
---
 .build/build-rat.xml             | 11 ++++++-----
 .github/pull_request_template.md | 25 +++++++++++++++++++++++++
 README.asc                       |  8 ++++++--
 3 files changed, 37 insertions(+), 7 deletions(-)
 create mode 100644 .github/pull_request_template.md
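The template added below fixes the commit-trailer format this whole series already follows. Purely for illustration, a hypothetical commit message in that format (all names and the ticket number are invented):

```
Fix flaky read meter test on slow CI hosts

Waits for the sync task to fire instead of sleeping a fixed interval.

patch by Jane Doe; reviewed by John Roe for CASSANDRA-00000

Co-authored-by: Sam Poe <sam@example.org>
```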
diff --git a/.build/build-rat.xml b/.build/build-rat.xml
index 6a3d72e1ca45..522dbd24be7d 100644
--- a/.build/build-rat.xml
+++ b/.build/build-rat.xml
@@ -49,13 +49,14 @@
 [The XML elements of this hunk were lost in text extraction; only the diff line markers survived.]
@@ -80,12 +81,12 @@
 [XML element content of this hunk likewise not recoverable.]
diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md
new file mode 100644
index 000000000000..abcd2168a0b3
--- /dev/null
+++ b/.github/pull_request_template.md
@@ -0,0 +1,25 @@
+Thanks for sending a pull request! Here are some tips if you're new here:
+
+ * Ensure you have added or run the [appropriate tests](https://cassandra.apache.org/_/development/testing.html) for your PR.
+ * Be sure to keep the PR description updated to reflect all changes.
+ * Write your PR title to summarize what this PR proposes.
+ * If possible, provide a concise example to reproduce the issue for a faster review.
+ * Read our [contributor guidelines](https://cassandra.apache.org/_/development/index.html)
+ * If you're making a documentation change, see our [guide to documentation contribution](https://cassandra.apache.org/_/development/documentation.html)
+
+Commit messages should follow the following format:
+
+```
+<one-line description of your change>
+
+<optional: longer description of the change>
+
+patch by <authors>; reviewed by <reviewers> for CASSANDRA-#####
+
+Co-authored-by: Name1 <email1>
+Co-authored-by: Name2 <email2>
+
+```
+
+The [Cassandra Jira](https://issues.apache.org/jira/projects/CASSANDRA/issues/)
+
diff --git a/README.asc b/README.asc
index cba3a2b42450..decb6ba0e3dd 100644
--- a/README.asc
+++ b/README.asc
@@ -9,6 +9,8 @@ https://cwiki.apache.org/confluence/display/CASSANDRA2/DataModel[Row store] mean
 
 For more information, see http://cassandra.apache.org/[the Apache Cassandra web site].
 
+Issues should be reported on https://issues.apache.org/jira/projects/CASSANDRA/issues/[The Cassandra Jira].
+
 Requirements
 ------------
 . Java >= 1.8 (OpenJDK and Oracle JVMS have been tested)
@@ -76,8 +78,10 @@ reasonable way to think of it is as, "SQL minus joins and subqueries, plus colle
 
 Wondering where to go from here?
 
-  * Join us in #cassandra on the https://s.apache.org/slack-invite[ASF Slack] and ask questions
+  * Join us in #cassandra on the https://s.apache.org/slack-invite[ASF Slack] and ask questions.
   * Subscribe to the Users mailing list by sending a mail to
-    user-subscribe@cassandra.apache.org
+    user-subscribe@cassandra.apache.org.
+  * Subscribe to the Developer mailing list by sending a mail to
+    dev-subscribe@cassandra.apache.org.
   * Visit the http://cassandra.apache.org/community/[community section] of the Cassandra website for more information on getting involved.
   * Visit the http://cassandra.apache.org/doc/latest/development/index.html[development section] of the Cassandra website for more information on how to contribute.
From ecceb446e00ea9e567ba45f1b422cb04862ef044 Mon Sep 17 00:00:00 2001
From: Brad Schoening <5796692+bschoening@users.noreply.github.com>
Date: Wed, 14 Sep 2022 14:28:10 +0300
Subject: [PATCH 069/159] Refactor internals of cqlsh.py to cqlshlib

patch by Brad Schoening; reviewed by Stefan Miklosovic and Brandon Williams
 for CASSANDRA-17531
---
 bin/cqlsh.py                | 2321 +---------------------------------
 pylib/cqlshlib/cqlshmain.py | 2353 +++++++++++++++++++++++++++++++++++
 2 files changed, 2355 insertions(+), 2319 deletions(-)
 create mode 100755 pylib/cqlshlib/cqlshmain.py

diff --git a/bin/cqlsh.py b/bin/cqlsh.py
index 9a561eb20045..7e6db9bdb8f0 100755
--- a/bin/cqlsh.py
+++ b/bin/cqlsh.py
@@ -16,26 +16,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import cmd -import codecs -import configparser -import csv -import errno -import getpass -import optparse import os import platform -import re -import stat -import subprocess import sys -import traceback -import warnings -import webbrowser -from contextlib import contextmanager from glob import glob -from io import StringIO -from uuid import UUID if sys.version_info < (3, 6): sys.exit("\ncqlsh requires Python 3.6+\n") @@ -44,50 +28,9 @@ if platform.python_implementation().startswith('Jython'): sys.exit("\nCQL Shell does not run on Jython\n") -UTF8 = 'utf-8' - -description = "CQL Shell for Apache Cassandra" -version = "6.2.0" - -readline = None -try: - # check if tty first, cause readline doesn't check, and only cares - # about $TERM. we don't want the funky escape code stuff to be - # output if not a tty. - if sys.stdin.isatty(): - import readline -except ImportError: - pass - CQL_LIB_PREFIX = 'cassandra-driver-internal-only-' CASSANDRA_PATH = os.path.join(os.path.dirname(os.path.realpath(__file__)), '..') -CASSANDRA_CQL_HTML_FALLBACK = 'https://cassandra.apache.org/doc/latest/cql/index.html' - -# default location of local CQL.html -if os.path.exists(CASSANDRA_PATH + '/doc/cql3/CQL.html'): - # default location of local CQL.html - CASSANDRA_CQL_HTML = 'file://' + CASSANDRA_PATH + '/doc/cql3/CQL.html' -elif os.path.exists('/usr/share/doc/cassandra/CQL.html'): - # fallback to package file - CASSANDRA_CQL_HTML = 'file:///usr/share/doc/cassandra/CQL.html' -else: - # fallback to online version - CASSANDRA_CQL_HTML = CASSANDRA_CQL_HTML_FALLBACK - -# On Linux, the Python webbrowser module uses the 'xdg-open' executable -# to open a file/URL. But that only works, if the current session has been -# opened from _within_ a desktop environment. I.e. 'xdg-open' will fail, -# if the session's been opened via ssh to a remote box. -# -try: - webbrowser.register_standard_browsers() # registration is otherwise lazy in Python3 -except AttributeError: - pass -if webbrowser._tryorder and webbrowser._tryorder[0] == 'xdg-open' and os.environ.get('XDG_DATA_DIRS', '') == '': - # only on Linux (some OS with xdg-open) - webbrowser._tryorder.remove('xdg-open') - webbrowser._tryorder.append('xdg-open') # use bundled lib for python-cql if available. if there # is a ../lib dir, use bundled libs there preferentially. @@ -120,7 +63,6 @@ def find_zip(libprefix): if lib_zip: sys.path.insert(0, lib_zip) -warnings.filterwarnings("ignore", r".*blist.*") try: import cassandra except ImportError as e: @@ -130,14 +72,6 @@ def find_zip(libprefix): 'Module load path: %r\n\n' 'Error: %s\n' % (sys.executable, sys.path, e)) -from cassandra.auth import PlainTextAuthProvider -from cassandra.cluster import Cluster -from cassandra.cqltypes import cql_typename -from cassandra.marshal import int64_unpack -from cassandra.metadata import (ColumnMetadata, KeyspaceMetadata, TableMetadata) -from cassandra.policies import WhiteListRoundRobinPolicy -from cassandra.query import SimpleStatement, ordered_dict_factory, TraceUnavailable -from cassandra.util import datetime_from_timestamp # cqlsh should run correctly when run out of a Cassandra source tree, # out of an unpacked Cassandra tarball, and after a proper package install. 
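Taken together, the hunks of this patch hollow out bin/cqlsh.py: per the diff stats, roughly 2,300 lines of shell implementation move to the new pylib/cqlshlib/cqlshmain.py (not shown in this excerpt), leaving the script as a thin launcher. A minimal sketch of the kind of wrapper that remains; the entry-point name and signature in cqlshmain are assumptions inferred from the file list, not taken from this diff:

```
#!/usr/bin/env python3
# Sketch of a post-refactor bin/cqlsh.py: put the bundled cqlshlib
# package on sys.path, then delegate everything to its main().
import os
import sys

# same source-tree/tarball/package layouts the original script handles
cassandra_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), '..')
cqlshlibdir = os.path.join(cassandra_path, 'pylib')
if os.path.isdir(cqlshlibdir):
    sys.path.insert(0, cqlshlibdir)

from cqlshlib import cqlshmain  # module name taken from the diff stats

if __name__ == '__main__':
    cqlshmain.main(sys.argv[1:])  # assumed signature
```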
@@ -145,2261 +79,10 @@ def find_zip(libprefix): if os.path.isdir(cqlshlibdir): sys.path.insert(0, cqlshlibdir) -from cqlshlib import cql3handling, pylexotron, sslhandling, cqlshhandling, authproviderhandling -from cqlshlib.copyutil import ExportTask, ImportTask -from cqlshlib.displaying import (ANSI_RESET, BLUE, COLUMN_NAME_COLORS, CYAN, - RED, WHITE, FormattedValue, colorme) -from cqlshlib.formatting import (DEFAULT_DATE_FORMAT, DEFAULT_NANOTIME_FORMAT, - DEFAULT_TIMESTAMP_FORMAT, CqlType, DateTimeFormat, - format_by_type) -from cqlshlib.tracing import print_trace, print_trace_session -from cqlshlib.util import get_file_encoding_bomsize -from cqlshlib.util import is_file_secure - - -DEFAULT_HOST = '127.0.0.1' -DEFAULT_PORT = 9042 -DEFAULT_SSL = False -DEFAULT_CONNECT_TIMEOUT_SECONDS = 5 -DEFAULT_REQUEST_TIMEOUT_SECONDS = 10 - -DEFAULT_FLOAT_PRECISION = 5 -DEFAULT_DOUBLE_PRECISION = 5 -DEFAULT_MAX_TRACE_WAIT = 10 - -if readline is not None and readline.__doc__ is not None and 'libedit' in readline.__doc__: - DEFAULT_COMPLETEKEY = '\t' -else: - DEFAULT_COMPLETEKEY = 'tab' - -cqldocs = None -cqlruleset = None - -epilog = """Connects to %(DEFAULT_HOST)s:%(DEFAULT_PORT)d by default. These -defaults can be changed by setting $CQLSH_HOST and/or $CQLSH_PORT. When a -host (and optional port number) are given on the command line, they take -precedence over any defaults.""" % globals() - -parser = optparse.OptionParser(description=description, epilog=epilog, - usage="Usage: %prog [options] [host [port]]", - version='cqlsh ' + version) -parser.add_option("-C", "--color", action='store_true', dest='color', - help='Always use color output') -parser.add_option("--no-color", action='store_false', dest='color', - help='Never use color output') -parser.add_option("--browser", dest='browser', help="""The browser to use to display CQL help, where BROWSER can be: - - one of the supported browsers in https://docs.python.org/3/library/webbrowser.html. - - browser path followed by %s, example: /usr/bin/google-chrome-stable %s""") -parser.add_option('--ssl', action='store_true', help='Use SSL', default=False) -parser.add_option("-u", "--username", help="Authenticate as user.") -parser.add_option("-p", "--password", help="Authenticate using password.") -parser.add_option('-k', '--keyspace', help='Authenticate to the given keyspace.') -parser.add_option("-f", "--file", help="Execute commands from FILE, then exit") -parser.add_option('--debug', action='store_true', - help='Show additional debugging information') -parser.add_option('--coverage', action='store_true', - help='Collect coverage data') -parser.add_option("--encoding", help="Specify a non-default encoding for output." - + " (Default: %s)" % (UTF8,)) -parser.add_option("--cqlshrc", help="Specify an alternative cqlshrc file location.") -parser.add_option("--credentials", help="Specify an alternative credentials file location.") -parser.add_option('--cqlversion', default=None, - help='Specify a particular CQL version, ' - 'by default the highest version supported by the server will be used.' 
- ' Examples: "3.0.3", "3.1.0"') -parser.add_option("--protocol-version", type="int", default=None, - help='Specify a specific protcol version otherwise the client will default and downgrade as necessary') - -parser.add_option("-e", "--execute", help='Execute the statement and quit.') -parser.add_option("--connect-timeout", default=DEFAULT_CONNECT_TIMEOUT_SECONDS, dest='connect_timeout', - help='Specify the connection timeout in seconds (default: %default seconds).') -parser.add_option("--request-timeout", default=DEFAULT_REQUEST_TIMEOUT_SECONDS, dest='request_timeout', - help='Specify the default request timeout in seconds (default: %default seconds).') -parser.add_option("-t", "--tty", action='store_true', dest='tty', - help='Force tty mode (command prompt).') -parser.add_option('-v', action="version", help='Print the current version of cqlsh.') - -# This is a hidden option to suppress the warning when the -p/--password command line option is used. -# Power users may use this option if they know no other people has access to the system where cqlsh is run or don't care about security. -# Use of this option in scripting is discouraged. Please use a (temporary) credentials file where possible. -# The Cassandra distributed tests (dtests) also use this option in some tests when a well-known password is supplied via the command line. -parser.add_option("--insecure-password-without-warning", action='store_true', dest='insecure_password_without_warning', - help=optparse.SUPPRESS_HELP) - -opt_values = optparse.Values() -(options, arguments) = parser.parse_args(sys.argv[1:], values=opt_values) - -# BEGIN history/config definition - - -def mkdirp(path): - """Creates all parent directories up to path parameter or fails when path exists, but it is not a directory.""" - - try: - os.makedirs(path) - except OSError: - if not os.path.isdir(path): - raise - - -def resolve_cql_history_file(): - default_cql_history = os.path.expanduser(os.path.join('~', '.cassandra', 'cqlsh_history')) - if 'CQL_HISTORY' in os.environ: - return os.environ['CQL_HISTORY'] - else: - return default_cql_history - - -HISTORY = resolve_cql_history_file() -HISTORY_DIR = os.path.dirname(HISTORY) - -try: - mkdirp(HISTORY_DIR) -except OSError: - print('\nWarning: Cannot create directory at `%s`. Command history will not be saved. Please check what was the environment property CQL_HISTORY set to.\n' % HISTORY_DIR) - -DEFAULT_CQLSHRC = os.path.expanduser(os.path.join('~', '.cassandra', 'cqlshrc')) - -if hasattr(options, 'cqlshrc'): - CONFIG_FILE = os.path.expanduser(options.cqlshrc) - if not os.path.exists(CONFIG_FILE): - print('\nWarning: Specified cqlshrc location `%s` does not exist. 
Using `%s` instead.\n' % (CONFIG_FILE, DEFAULT_CQLSHRC)) - CONFIG_FILE = DEFAULT_CQLSHRC -else: - CONFIG_FILE = DEFAULT_CQLSHRC - -CQL_DIR = os.path.dirname(CONFIG_FILE) - -CQL_ERRORS = ( - cassandra.AlreadyExists, cassandra.AuthenticationFailed, cassandra.CoordinationFailure, - cassandra.InvalidRequest, cassandra.Timeout, cassandra.Unauthorized, cassandra.OperationTimedOut, - cassandra.cluster.NoHostAvailable, - cassandra.connection.ConnectionBusy, cassandra.connection.ProtocolError, cassandra.connection.ConnectionException, - cassandra.protocol.ErrorMessage, cassandra.protocol.InternalError, cassandra.query.TraceUnavailable -) - -debug_completion = bool(os.environ.get('CQLSH_DEBUG_COMPLETION', '') == 'YES') - - -class NoKeyspaceError(Exception): - pass - - -class KeyspaceNotFound(Exception): - pass - - -class ColumnFamilyNotFound(Exception): - pass - - -class IndexNotFound(Exception): - pass - - -class MaterializedViewNotFound(Exception): - pass - - -class ObjectNotFound(Exception): - pass - - -class VersionNotSupported(Exception): - pass - - -class UserTypeNotFound(Exception): - pass - - -class FunctionNotFound(Exception): - pass - - -class AggregateNotFound(Exception): - pass - - -class DecodeError(Exception): - verb = 'decode' - - def __init__(self, thebytes, err, colname=None): - self.thebytes = thebytes - self.err = err - self.colname = colname - - def __str__(self): - return str(self.thebytes) - - def message(self): - what = 'value %r' % (self.thebytes,) - if self.colname is not None: - what = 'value %r (for column %r)' % (self.thebytes, self.colname) - return 'Failed to %s %s : %s' \ - % (self.verb, what, self.err) - - def __repr__(self): - return '<%s %s>' % (self.__class__.__name__, self.message()) - - -def maybe_ensure_text(val): - return str(val) if val else val - - -class FormatError(DecodeError): - verb = 'format' - - -def full_cql_version(ver): - while ver.count('.') < 2: - ver += '.0' - ver_parts = ver.split('-', 1) + [''] - vertuple = tuple(list(map(int, ver_parts[0].split('.'))) + [ver_parts[1]]) - return ver, vertuple - - -def format_value(val, cqltype, encoding, addcolor=False, date_time_format=None, - float_precision=None, colormap=None, nullval=None): - if isinstance(val, DecodeError): - if addcolor: - return colorme(repr(val.thebytes), colormap, 'error') - else: - return FormattedValue(repr(val.thebytes)) - return format_by_type(val, cqltype=cqltype, encoding=encoding, colormap=colormap, - addcolor=addcolor, nullval=nullval, date_time_format=date_time_format, - float_precision=float_precision) - - -def show_warning_without_quoting_line(message, category, filename, lineno, file=None, line=None): - if file is None: - file = sys.stderr - try: - file.write(warnings.formatwarning(message, category, filename, lineno, line='')) - except IOError: - pass - - -warnings.showwarning = show_warning_without_quoting_line -warnings.filterwarnings('always', category=cql3handling.UnexpectedTableStructure) - - -def insert_driver_hooks(): - - class DateOverFlowWarning(RuntimeWarning): - pass - - # Native datetime types blow up outside of datetime.[MIN|MAX]_YEAR. We will fall back to an int timestamp - def deserialize_date_fallback_int(byts, protocol_version): - timestamp_ms = int64_unpack(byts) - try: - return datetime_from_timestamp(timestamp_ms / 1000.0) - except OverflowError: - warnings.warn(DateOverFlowWarning("Some timestamps are larger than Python datetime can represent. 
" - "Timestamps are displayed in milliseconds from epoch.")) - return timestamp_ms - - cassandra.cqltypes.DateType.deserialize = staticmethod(deserialize_date_fallback_int) - - if hasattr(cassandra, 'deserializers'): - del cassandra.deserializers.DesDateType - - # Return cassandra.cqltypes.EMPTY instead of None for empty values - cassandra.cqltypes.CassandraType.support_empty_values = True - - -class Shell(cmd.Cmd): - custom_prompt = os.getenv('CQLSH_PROMPT', '') - if custom_prompt != '': - custom_prompt += "\n" - default_prompt = custom_prompt + "cqlsh> " - continue_prompt = " ... " - keyspace_prompt = custom_prompt + "cqlsh:{}> " - keyspace_continue_prompt = "{} ... " - show_line_nums = False - debug = False - coverage = False - coveragerc_path = None - stop = False - last_hist = None - shunted_query_out = None - use_paging = True - - default_page_size = 100 - - def __init__(self, hostname, port, color=False, - username=None, encoding=None, stdin=None, tty=True, - completekey=DEFAULT_COMPLETEKEY, browser=None, use_conn=None, - cqlver=None, keyspace=None, - tracing_enabled=False, expand_enabled=False, - display_nanotime_format=DEFAULT_NANOTIME_FORMAT, - display_timestamp_format=DEFAULT_TIMESTAMP_FORMAT, - display_date_format=DEFAULT_DATE_FORMAT, - display_float_precision=DEFAULT_FLOAT_PRECISION, - display_double_precision=DEFAULT_DOUBLE_PRECISION, - display_timezone=None, - max_trace_wait=DEFAULT_MAX_TRACE_WAIT, - ssl=False, - single_statement=None, - request_timeout=DEFAULT_REQUEST_TIMEOUT_SECONDS, - protocol_version=None, - connect_timeout=DEFAULT_CONNECT_TIMEOUT_SECONDS, - is_subshell=False, - auth_provider=None): - cmd.Cmd.__init__(self, completekey=completekey) - self.hostname = hostname - self.port = port - self.auth_provider = auth_provider - self.username = username - - if isinstance(auth_provider, PlainTextAuthProvider): - self.username = auth_provider.username - if not auth_provider.password: - # if no password is provided, we need to query the user to get one. 
- password = getpass.getpass() - self.auth_provider = PlainTextAuthProvider(username=auth_provider.username, password=password) - - self.keyspace = keyspace - self.ssl = ssl - self.tracing_enabled = tracing_enabled - self.page_size = self.default_page_size - self.expand_enabled = expand_enabled - if use_conn: - self.conn = use_conn - else: - kwargs = {} - if protocol_version is not None: - kwargs['protocol_version'] = protocol_version - self.conn = Cluster(contact_points=(self.hostname,), port=self.port, cql_version=cqlver, - auth_provider=self.auth_provider, - ssl_options=sslhandling.ssl_settings(hostname, CONFIG_FILE) if ssl else None, - load_balancing_policy=WhiteListRoundRobinPolicy([self.hostname]), - control_connection_timeout=connect_timeout, - connect_timeout=connect_timeout, - **kwargs) - self.owns_connection = not use_conn - - if keyspace: - self.session = self.conn.connect(keyspace) - else: - self.session = self.conn.connect() - - if browser == "": - browser = None - self.browser = browser - self.color = color - - self.display_nanotime_format = display_nanotime_format - self.display_timestamp_format = display_timestamp_format - self.display_date_format = display_date_format - - self.display_float_precision = display_float_precision - self.display_double_precision = display_double_precision - - self.display_timezone = display_timezone - - self.session.default_timeout = request_timeout - self.session.row_factory = ordered_dict_factory - self.session.default_consistency_level = cassandra.ConsistencyLevel.ONE - self.get_connection_versions() - self.set_expanded_cql_version(self.connection_versions['cql']) - - self.current_keyspace = keyspace - - self.max_trace_wait = max_trace_wait - self.session.max_trace_wait = max_trace_wait - - self.tty = tty - self.encoding = encoding - - self.output_codec = codecs.lookup(encoding) - - self.statement = StringIO() - self.lineno = 1 - self.in_comment = False - - self.prompt = '' - if stdin is None: - stdin = sys.stdin - - if tty: - self.reset_prompt() - self.report_connection() - print('Use HELP for help.') - else: - self.show_line_nums = True - self.stdin = stdin - self.query_out = sys.stdout - self.consistency_level = cassandra.ConsistencyLevel.ONE - self.serial_consistency_level = cassandra.ConsistencyLevel.SERIAL - - self.empty_lines = 0 - self.statement_error = False - self.single_statement = single_statement - self.is_subshell = is_subshell - - @property - def batch_mode(self): - return not self.tty - - def set_expanded_cql_version(self, ver): - ver, vertuple = full_cql_version(ver) - self.cql_version = ver - self.cql_ver_tuple = vertuple - - def cqlver_atleast(self, major, minor=0, patch=0): - return self.cql_ver_tuple[:3] >= (major, minor, patch) - - def myformat_value(self, val, cqltype=None, **kwargs): - if isinstance(val, DecodeError): - self.decoding_errors.append(val) - try: - dtformats = DateTimeFormat(timestamp_format=self.display_timestamp_format, - date_format=self.display_date_format, nanotime_format=self.display_nanotime_format, - timezone=self.display_timezone) - precision = self.display_double_precision if cqltype is not None and cqltype.type_name == 'double' \ - else self.display_float_precision - return format_value(val, cqltype=cqltype, encoding=self.output_codec.name, - addcolor=self.color, date_time_format=dtformats, - float_precision=precision, **kwargs) - except Exception as e: - err = FormatError(val, e) - self.decoding_errors.append(err) - return format_value(err, cqltype=cqltype, encoding=self.output_codec.name, 
addcolor=self.color) - - def myformat_colname(self, name, table_meta=None): - column_colors = COLUMN_NAME_COLORS.copy() - # check column role and color appropriately - if table_meta: - if name in [col.name for col in table_meta.partition_key]: - column_colors.default_factory = lambda: RED - elif name in [col.name for col in table_meta.clustering_key]: - column_colors.default_factory = lambda: CYAN - elif name in table_meta.columns and table_meta.columns[name].is_static: - column_colors.default_factory = lambda: WHITE - return self.myformat_value(name, colormap=column_colors) - - def report_connection(self): - self.show_host() - self.show_version() - - def show_host(self): - print("Connected to {0} at {1}:{2}" - .format(self.applycolor(self.get_cluster_name(), BLUE), - self.hostname, - self.port)) - - def show_version(self): - vers = self.connection_versions.copy() - vers['shver'] = version - # system.Versions['cql'] apparently does not reflect changes with - # set_cql_version. - vers['cql'] = self.cql_version - print("[cqlsh %(shver)s | Cassandra %(build)s | CQL spec %(cql)s | Native protocol v%(protocol)s]" % vers) - - def show_session(self, sessionid, partial_session=False): - print_trace_session(self, self.session, sessionid, partial_session) - - def show_replicas(self, token_value, keyspace=None): - ks = self.current_keyspace if keyspace is None else keyspace - token_map = self.conn.metadata.token_map - nodes = token_map.get_replicas(ks, token_map.token_class(token_value)) - addresses = [x.address for x in nodes] - print(f"{addresses}") - - def get_connection_versions(self): - result, = self.session.execute("select * from system.local where key = 'local'") - vers = { - 'build': result['release_version'], - 'protocol': self.conn.protocol_version, - 'cql': result['cql_version'], - } - self.connection_versions = vers - - def get_keyspace_names(self): - return list(self.conn.metadata.keyspaces) - - def get_columnfamily_names(self, ksname=None): - if ksname is None: - ksname = self.current_keyspace - - return list(self.get_keyspace_meta(ksname).tables) - - def get_materialized_view_names(self, ksname=None): - if ksname is None: - ksname = self.current_keyspace - - return list(self.get_keyspace_meta(ksname).views) - - def get_index_names(self, ksname=None): - if ksname is None: - ksname = self.current_keyspace - - return list(self.get_keyspace_meta(ksname).indexes) - - def get_column_names(self, ksname, cfname): - if ksname is None: - ksname = self.current_keyspace - layout = self.get_table_meta(ksname, cfname) - return list(layout.columns) - - def get_usertype_names(self, ksname=None): - if ksname is None: - ksname = self.current_keyspace - - return list(self.get_keyspace_meta(ksname).user_types) - - def get_usertype_layout(self, ksname, typename): - if ksname is None: - ksname = self.current_keyspace - - ks_meta = self.get_keyspace_meta(ksname) - - try: - user_type = ks_meta.user_types[typename] - except KeyError: - raise UserTypeNotFound("User type {!r} not found".format(typename)) - - return list(zip(user_type.field_names, user_type.field_types)) - - def get_userfunction_names(self, ksname=None): - if ksname is None: - ksname = self.current_keyspace - - return [f.name for f in list(self.get_keyspace_meta(ksname).functions.values())] - - def get_useraggregate_names(self, ksname=None): - if ksname is None: - ksname = self.current_keyspace - - return [f.name for f in list(self.get_keyspace_meta(ksname).aggregates.values())] - - def get_cluster_name(self): - return 
self.conn.metadata.cluster_name - - def get_partitioner(self): - return self.conn.metadata.partitioner - - def get_keyspace_meta(self, ksname): - if ksname in self.conn.metadata.keyspaces: - return self.conn.metadata.keyspaces[ksname] - - raise KeyspaceNotFound('Keyspace %r not found.' % ksname) - - def get_keyspaces(self): - return list(self.conn.metadata.keyspaces.values()) - - def get_ring(self, ks): - self.conn.metadata.token_map.rebuild_keyspace(ks, build_if_absent=True) - return self.conn.metadata.token_map.tokens_to_hosts_by_ks[ks] - - def get_table_meta(self, ksname, tablename): - if ksname is None: - ksname = self.current_keyspace - ksmeta = self.get_keyspace_meta(ksname) - if tablename not in ksmeta.tables: - if ksname == 'system_auth' and tablename in ['roles', 'role_permissions']: - self.get_fake_auth_table_meta(ksname, tablename) - else: - raise ColumnFamilyNotFound("Column family {} not found".format(tablename)) - else: - return ksmeta.tables[tablename] - - def get_fake_auth_table_meta(self, ksname, tablename): - # may be using external auth implementation so internal tables - # aren't actually defined in schema. In this case, we'll fake - # them up - if tablename == 'roles': - ks_meta = KeyspaceMetadata(ksname, True, None, None) - table_meta = TableMetadata(ks_meta, 'roles') - table_meta.columns['role'] = ColumnMetadata(table_meta, 'role', cassandra.cqltypes.UTF8Type) - table_meta.columns['is_superuser'] = ColumnMetadata(table_meta, 'is_superuser', cassandra.cqltypes.BooleanType) - table_meta.columns['can_login'] = ColumnMetadata(table_meta, 'can_login', cassandra.cqltypes.BooleanType) - elif tablename == 'role_permissions': - ks_meta = KeyspaceMetadata(ksname, True, None, None) - table_meta = TableMetadata(ks_meta, 'role_permissions') - table_meta.columns['role'] = ColumnMetadata(table_meta, 'role', cassandra.cqltypes.UTF8Type) - table_meta.columns['resource'] = ColumnMetadata(table_meta, 'resource', cassandra.cqltypes.UTF8Type) - table_meta.columns['permission'] = ColumnMetadata(table_meta, 'permission', cassandra.cqltypes.UTF8Type) - else: - raise ColumnFamilyNotFound("Column family {} not found".format(tablename)) - - def get_index_meta(self, ksname, idxname): - if ksname is None: - ksname = self.current_keyspace - ksmeta = self.get_keyspace_meta(ksname) - - if idxname not in ksmeta.indexes: - raise IndexNotFound("Index {} not found".format(idxname)) - - return ksmeta.indexes[idxname] - - def get_view_meta(self, ksname, viewname): - if ksname is None: - ksname = self.current_keyspace - ksmeta = self.get_keyspace_meta(ksname) - - if viewname not in ksmeta.views: - raise MaterializedViewNotFound("Materialized view '{}' not found".format(viewname)) - return ksmeta.views[viewname] - - def get_object_meta(self, ks, name): - if name is None: - if ks and ks in self.conn.metadata.keyspaces: - return self.conn.metadata.keyspaces[ks] - elif self.current_keyspace is None: - raise ObjectNotFound("'{}' not found in keyspaces".format(ks)) - else: - name = ks - ks = self.current_keyspace - - if ks is None: - ks = self.current_keyspace - - ksmeta = self.get_keyspace_meta(ks) - - if name in ksmeta.tables: - return ksmeta.tables[name] - elif name in ksmeta.indexes: - return ksmeta.indexes[name] - elif name in ksmeta.views: - return ksmeta.views[name] - - raise ObjectNotFound("'{}' not found in keyspace '{}'".format(name, ks)) - - def get_trigger_names(self, ksname=None): - if ksname is None: - ksname = self.current_keyspace - - return [trigger.name - for table in 
list(self.get_keyspace_meta(ksname).tables.values()) - for trigger in list(table.triggers.values())] - - def reset_statement(self): - self.reset_prompt() - self.statement.truncate(0) - self.statement.seek(0) - self.empty_lines = 0 - - def reset_prompt(self): - if self.current_keyspace is None: - self.set_prompt(self.default_prompt, True) - else: - self.set_prompt(self.keyspace_prompt.format(self.current_keyspace), True) - - def set_continue_prompt(self): - if self.empty_lines >= 3: - self.set_prompt("Statements are terminated with a ';'. You can press CTRL-C to cancel an incomplete statement.") - self.empty_lines = 0 - return - if self.current_keyspace is None: - self.set_prompt(self.continue_prompt) - else: - spaces = ' ' * len(str(self.current_keyspace)) - self.set_prompt(self.keyspace_continue_prompt.format(spaces)) - self.empty_lines = self.empty_lines + 1 if not self.lastcmd else 0 - - @contextmanager - def prepare_loop(self): - readline = None - if self.tty and self.completekey: - try: - import readline - except ImportError: - pass - else: - old_completer = readline.get_completer() - readline.set_completer(self.complete) - if readline.__doc__ is not None and 'libedit' in readline.__doc__: - readline.parse_and_bind("bind -e") - readline.parse_and_bind("bind '" + self.completekey + "' rl_complete") - readline.parse_and_bind("bind ^R em-inc-search-prev") - else: - readline.parse_and_bind(self.completekey + ": complete") - # start coverage collection if requested, unless in subshell - if self.coverage and not self.is_subshell: - # check for coveragerc file, write it if missing - if os.path.exists(CQL_DIR): - self.coveragerc_path = os.path.join(CQL_DIR, '.coveragerc') - covdata_path = os.path.join(CQL_DIR, '.coverage') - if not os.path.isfile(self.coveragerc_path): - with open(self.coveragerc_path, 'w') as f: - f.writelines(["[run]\n", - "concurrency = multiprocessing\n", - "data_file = {}\n".format(covdata_path), - "parallel = true\n"] - ) - # start coverage - import coverage - self.cov = coverage.Coverage(config_file=self.coveragerc_path) - self.cov.start() - try: - yield - finally: - if readline is not None: - readline.set_completer(old_completer) - if self.coverage and not self.is_subshell: - self.stop_coverage() - - def get_input_line(self, prompt=''): - if self.tty: - self.lastcmd = input(str(prompt)) - line = self.lastcmd + '\n' - else: - self.lastcmd = self.stdin.readline() - line = self.lastcmd - if not len(line): - raise EOFError - self.lineno += 1 - return line - - def use_stdin_reader(self, until='', prompt=''): - until += '\n' - while True: - try: - newline = self.get_input_line(prompt=prompt) - except EOFError: - return - if newline == until: - return - yield newline - - def cmdloop(self, intro=None): - """ - Adapted from cmd.Cmd's version, because there is literally no way with - cmd.Cmd.cmdloop() to tell the difference between "EOF" showing up in - input and an actual EOF. 
- """ - with self.prepare_loop(): - while not self.stop: - try: - if self.single_statement: - line = self.single_statement - self.stop = True - else: - line = self.get_input_line(self.prompt) - self.statement.write(line) - if self.onecmd(self.statement.getvalue()): - self.reset_statement() - except EOFError: - self.handle_eof() - except CQL_ERRORS as cqlerr: - self.printerr(cqlerr.message) - except KeyboardInterrupt: - self.reset_statement() - print('') - - def strip_comment_blocks(self, statementtext): - comment_block_in_literal_string = re.search('["].*[/][*].*[*][/].*["]', statementtext) - if not comment_block_in_literal_string: - result = re.sub('[/][*].*[*][/]', "", statementtext) - if '*/' in result and '/*' not in result and not self.in_comment: - raise SyntaxError("Encountered comment block terminator without being in comment block") - if '/*' in result: - result = re.sub('[/][*].*', "", result) - self.in_comment = True - if '*/' in result: - result = re.sub('.*[*][/]', "", result) - self.in_comment = False - if self.in_comment and not re.findall('[/][*]|[*][/]', statementtext): - result = '' - return result - return statementtext - - def onecmd(self, statementtext): - """ - Returns true if the statement is complete and was handled (meaning it - can be reset). - """ - statementtext = self.strip_comment_blocks(statementtext) - try: - statements, endtoken_escaped = cqlruleset.cql_split_statements(statementtext) - except pylexotron.LexingError as e: - if self.show_line_nums: - self.printerr('Invalid syntax at line {0}, char {1}' - .format(e.linenum, e.charnum)) - else: - self.printerr('Invalid syntax at char {0}'.format(e.charnum)) - statementline = statementtext.split('\n')[e.linenum - 1] - self.printerr(' {0}'.format(statementline)) - self.printerr(' {0}^'.format(' ' * e.charnum)) - return True - - while statements and not statements[-1]: - statements = statements[:-1] - if not statements: - return True - if endtoken_escaped or statements[-1][-1][0] != 'endtoken': - self.set_continue_prompt() - return - for st in statements: - try: - self.handle_statement(st, statementtext) - except Exception as e: - if self.debug: - traceback.print_exc() - else: - self.printerr(e) - return True - - def handle_eof(self): - if self.tty: - print('') - statement = self.statement.getvalue() - if statement.strip(): - if not self.onecmd(statement): - self.printerr('Incomplete statement at end of file') - self.do_exit() - - def handle_statement(self, tokens, srcstr): - # Concat multi-line statements and insert into history - if readline is not None: - nl_count = srcstr.count("\n") - - new_hist = srcstr.replace("\n", " ").rstrip() - - if nl_count > 1 and self.last_hist != new_hist: - readline.add_history(new_hist) - - self.last_hist = new_hist - cmdword = tokens[0][1] - if cmdword == '?': - cmdword = 'help' - custom_handler = getattr(self, 'do_' + cmdword.lower(), None) - if custom_handler: - parsed = cqlruleset.cql_whole_parse_tokens(tokens, srcstr=srcstr, - startsymbol='cqlshCommand') - if parsed and not parsed.remainder: - # successful complete parse - return custom_handler(parsed) - else: - return self.handle_parse_error(cmdword, tokens, parsed, srcstr) - return self.perform_statement(cqlruleset.cql_extract_orig(tokens, srcstr)) - - def handle_parse_error(self, cmdword, tokens, parsed, srcstr): - if cmdword.lower() in ('select', 'insert', 'update', 'delete', 'truncate', - 'create', 'drop', 'alter', 'grant', 'revoke', - 'batch', 'list'): - # hey, maybe they know about some new syntax we don't. 
type - # assumptions won't work, but maybe the query will. - return self.perform_statement(cqlruleset.cql_extract_orig(tokens, srcstr)) - if parsed: - self.printerr('Improper %s command (problem at %r).' % (cmdword, parsed.remainder[0])) - else: - self.printerr(f'Improper {cmdword} command.') - - def do_use(self, parsed): - ksname = parsed.get_binding('ksname') - success, _ = self.perform_simple_statement(SimpleStatement(parsed.extract_orig())) - if success: - if ksname[0] == '"' and ksname[-1] == '"': - self.current_keyspace = self.cql_unprotect_name(ksname) - else: - self.current_keyspace = ksname.lower() - - def do_select(self, parsed): - tracing_was_enabled = self.tracing_enabled - ksname = parsed.get_binding('ksname') - stop_tracing = ksname == 'system_traces' or (ksname is None and self.current_keyspace == 'system_traces') - self.tracing_enabled = self.tracing_enabled and not stop_tracing - statement = parsed.extract_orig() - self.perform_statement(statement) - self.tracing_enabled = tracing_was_enabled - - def perform_statement(self, statement): - - stmt = SimpleStatement(statement, consistency_level=self.consistency_level, serial_consistency_level=self.serial_consistency_level, fetch_size=self.page_size if self.use_paging else None) - success, future = self.perform_simple_statement(stmt) - - if future: - if future.warnings: - self.print_warnings(future.warnings) - - if self.tracing_enabled: - try: - for trace in future.get_all_query_traces(max_wait_per=self.max_trace_wait, query_cl=self.consistency_level): - print_trace(self, trace) - except TraceUnavailable: - msg = "Statement trace did not complete within %d seconds; trace data may be incomplete." % (self.session.max_trace_wait,) - self.writeresult(msg, color=RED) - for trace_id in future.get_query_trace_ids(): - self.show_session(trace_id, partial_session=True) - except Exception as err: - self.printerr("Unable to fetch query trace: %s" % (str(err),)) - - return success - - def parse_for_select_meta(self, query_string): - try: - parsed = cqlruleset.cql_parse(query_string)[1] - except IndexError: - return None - ks = self.cql_unprotect_name(parsed.get_binding('ksname', None)) - name = self.cql_unprotect_name(parsed.get_binding('cfname', None)) - try: - return self.get_table_meta(ks, name) - except ColumnFamilyNotFound: - try: - return self.get_view_meta(ks, name) - except MaterializedViewNotFound: - raise ObjectNotFound("'{}' not found in keyspace '{}'".format(name, ks)) - - def parse_for_update_meta(self, query_string): - try: - parsed = cqlruleset.cql_parse(query_string)[1] - except IndexError: - return None - ks = self.cql_unprotect_name(parsed.get_binding('ksname', None)) - cf = self.cql_unprotect_name(parsed.get_binding('cfname')) - return self.get_table_meta(ks, cf) - - def perform_simple_statement(self, statement): - if not statement: - return False, None - - future = self.session.execute_async(statement, trace=self.tracing_enabled) - result = None - try: - result = future.result() - except CQL_ERRORS as err: - err_msg = err.message if hasattr(err, 'message') else str(err) - self.printerr(str(err.__class__.__name__) + ": " + err_msg) - except Exception: - import traceback - self.printerr(traceback.format_exc()) - - # Even if statement failed we try to refresh schema if not agreed (see CASSANDRA-9689) - if not future.is_schema_agreed: - try: - self.conn.refresh_schema_metadata(5) # will throw exception if there is a schema mismatch - except Exception: - self.printerr("Warning: schema version mismatch detected; check the 
schema versions of your " - "nodes in system.local and system.peers.") - self.conn.refresh_schema_metadata(-1) - - if result is None: - return False, None - - if statement.query_string[:6].lower() == 'select': - self.print_result(result, self.parse_for_select_meta(statement.query_string)) - elif statement.query_string.lower().startswith("list users") or statement.query_string.lower().startswith("list roles"): - self.print_result(result, self.get_table_meta('system_auth', 'roles')) - elif statement.query_string.lower().startswith("list"): - self.print_result(result, self.get_table_meta('system_auth', 'role_permissions')) - elif result: - # CAS INSERT/UPDATE - self.writeresult("") - self.print_static_result(result, self.parse_for_update_meta(statement.query_string), with_header=True, tty=self.tty) - self.flush_output() - return True, future - - def print_result(self, result, table_meta): - self.decoding_errors = [] - - self.writeresult("") - - def print_all(result, table_meta, tty): - # Return the number of rows in total - num_rows = 0 - is_first = True - while True: - # Always print for the first page even it is empty - if result.current_rows or is_first: - with_header = is_first or tty - self.print_static_result(result, table_meta, with_header, tty, num_rows) - num_rows += len(result.current_rows) - if result.has_more_pages: - if self.shunted_query_out is None and tty: - # Only pause when not capturing. - input("---MORE---") - result.fetch_next_page() - else: - if not tty: - self.writeresult("") - break - is_first = False - return num_rows - - num_rows = print_all(result, table_meta, self.tty) - self.writeresult("(%d rows)" % num_rows) - - if self.decoding_errors: - for err in self.decoding_errors[:2]: - self.writeresult(err.message(), color=RED) - if len(self.decoding_errors) > 2: - self.writeresult('%d more decoding errors suppressed.' 
- % (len(self.decoding_errors) - 2), color=RED) - - def print_static_result(self, result, table_meta, with_header, tty, row_count_offset=0): - if not result.column_names and not table_meta: - return - - column_names = result.column_names or list(table_meta.columns.keys()) - formatted_names = [self.myformat_colname(name, table_meta) for name in column_names] - if not result.current_rows: - # print header only - self.print_formatted_result(formatted_names, None, with_header=True, tty=tty) - return - - cql_types = [] - if result.column_types: - ks_name = table_meta.keyspace_name if table_meta else self.current_keyspace - ks_meta = self.conn.metadata.keyspaces.get(ks_name, None) - cql_types = [CqlType(cql_typename(t), ks_meta) for t in result.column_types] - - formatted_values = [list(map(self.myformat_value, [row[c] for c in column_names], cql_types)) for row in result.current_rows] - - if self.expand_enabled: - self.print_formatted_result_vertically(formatted_names, formatted_values, row_count_offset) - else: - self.print_formatted_result(formatted_names, formatted_values, with_header, tty) - - def print_formatted_result(self, formatted_names, formatted_values, with_header, tty): - # determine column widths - widths = [n.displaywidth for n in formatted_names] - if formatted_values is not None: - for fmtrow in formatted_values: - for num, col in enumerate(fmtrow): - widths[num] = max(widths[num], col.displaywidth) - - # print header - if with_header: - header = ' | '.join(hdr.ljust(w, color=self.color) for (hdr, w) in zip(formatted_names, widths)) - self.writeresult(' ' + header.rstrip()) - self.writeresult('-%s-' % '-+-'.join('-' * w for w in widths)) - - # stop if there are no rows - if formatted_values is None: - self.writeresult("") - return - - # print row data - for row in formatted_values: - line = ' | '.join(col.rjust(w, color=self.color) for (col, w) in zip(row, widths)) - self.writeresult(' ' + line) - - if tty: - self.writeresult("") - - def print_formatted_result_vertically(self, formatted_names, formatted_values, row_count_offset): - max_col_width = max([n.displaywidth for n in formatted_names]) - max_val_width = max([n.displaywidth for row in formatted_values for n in row]) - - # for each row returned, list all the column-value pairs - for i, row in enumerate(formatted_values): - self.writeresult("@ Row %d" % (row_count_offset + i + 1)) - self.writeresult('-%s-' % '-+-'.join(['-' * max_col_width, '-' * max_val_width])) - for field_id, field in enumerate(row): - column = formatted_names[field_id].ljust(max_col_width, color=self.color) - value = field.ljust(field.displaywidth, color=self.color) - self.writeresult(' ' + " | ".join([column, value])) - self.writeresult('') - - def print_warnings(self, warnings): - if warnings is None or len(warnings) == 0: - return - - self.writeresult('') - self.writeresult('Warnings :') - for warning in warnings: - self.writeresult(warning) - self.writeresult('') - - def emptyline(self): - pass - - def parseline(self, line): - # this shouldn't be needed - raise NotImplementedError - - def complete(self, text, state): - if readline is None: - return - if state == 0: - try: - self.completion_matches = self.find_completions(text) - except Exception: - if debug_completion: - import traceback - traceback.print_exc() - else: - raise - try: - return self.completion_matches[state] - except IndexError: - return None - - def find_completions(self, text): - curline = readline.get_line_buffer() - prevlines = self.statement.getvalue() - wholestmt = prevlines + 
curline - begidx = readline.get_begidx() + len(prevlines) - stuff_to_complete = wholestmt[:begidx] - return cqlruleset.cql_complete(stuff_to_complete, text, cassandra_conn=self, - debug=debug_completion, startsymbol='cqlshCommand') - - def set_prompt(self, prompt, prepend_user=False): - if prepend_user and self.username: - self.prompt = "{0}@{1}".format(self.username, prompt) - return - self.prompt = prompt - - def cql_unprotect_name(self, namestr): - if namestr is None: - return - return cqlruleset.dequote_name(namestr) - - def cql_unprotect_value(self, valstr): - if valstr is not None: - return cqlruleset.dequote_value(valstr) - - def _columnize_unicode(self, name_list): - """ - Used when columnizing identifiers that may contain unicode - """ - names = [n for n in name_list] - cmd.Cmd.columnize(self, names) - print('') - - def do_describe(self, parsed): - - """ - DESCRIBE [cqlsh only] - - (DESC may be used as a shorthand.) - - Outputs information about the connected Cassandra cluster, or about - the data objects stored in the cluster. Use in one of the following ways: - - DESCRIBE KEYSPACES - - Output the names of all keyspaces. - - DESCRIBE KEYSPACE [] - - Output CQL commands that could be used to recreate the given keyspace, - and the objects in it (such as tables, types, functions, etc.). - In some cases, as the CQL interface matures, there will be some metadata - about a keyspace that is not representable with CQL. That metadata will not be shown. - The '' argument may be omitted, in which case the current - keyspace will be described. - - DESCRIBE TABLES - - Output the names of all tables in the current keyspace, or in all - keyspaces if there is no current keyspace. - - DESCRIBE TABLE [.] - - Output CQL commands that could be used to recreate the given table. - In some cases, as above, there may be table metadata which is not - representable and which will not be shown. - - DESCRIBE INDEX - - Output the CQL command that could be used to recreate the given index. - In some cases, there may be index metadata which is not representable - and which will not be shown. - - DESCRIBE MATERIALIZED VIEW - - Output the CQL command that could be used to recreate the given materialized view. - In some cases, there may be materialized view metadata which is not representable - and which will not be shown. - - DESCRIBE CLUSTER - - Output information about the connected Cassandra cluster, such as the - cluster name, and the partitioner and snitch in use. When you are - connected to a non-system keyspace, also shows endpoint-range - ownership information for the Cassandra ring. - - DESCRIBE [FULL] SCHEMA - - Output CQL commands that could be used to recreate the entire (non-system) schema. - Works as though "DESCRIBE KEYSPACE k" was invoked for each non-system keyspace - k. Use DESCRIBE FULL SCHEMA to include the system keyspaces. - - DESCRIBE TYPES - - Output the names of all user-defined-types in the current keyspace, or in all - keyspaces if there is no current keyspace. - - DESCRIBE TYPE [.] - - Output the CQL command that could be used to recreate the given user-defined-type. - - DESCRIBE FUNCTIONS - - Output the names of all user-defined-functions in the current keyspace, or in all - keyspaces if there is no current keyspace. - - DESCRIBE FUNCTION [.] - - Output the CQL command that could be used to recreate the given user-defined-function. 
- - DESCRIBE AGGREGATES - - Output the names of all user-defined-aggregates in the current keyspace, or in all - keyspaces if there is no current keyspace. - - DESCRIBE AGGREGATE [.] - - Output the CQL command that could be used to recreate the given user-defined-aggregate. - - DESCRIBE - - Output CQL commands that could be used to recreate the entire object schema, - where object can be either a keyspace or a table or an index or a materialized - view (in this order). - """ - stmt = SimpleStatement(parsed.extract_orig(), consistency_level=cassandra.ConsistencyLevel.LOCAL_ONE, fetch_size=self.page_size if self.use_paging else None) - future = self.session.execute_async(stmt) - - if self.connection_versions['build'][0] < '4': - print('\nWARN: DESCRIBE|DESC was moved to server side in Cassandra 4.0. As a consequence DESRIBE|DESC ' - 'will not work in cqlsh %r connected to Cassandra %r, the version that you are connected to. ' - 'DESCRIBE does not exist server side prior Cassandra 4.0.' - % (version, self.connection_versions['build'])) - else: - try: - result = future.result() - - what = parsed.matched[1][1].lower() - - if what in ('columnfamilies', 'tables', 'types', 'functions', 'aggregates'): - self.describe_list(result) - elif what == 'keyspaces': - self.describe_keyspaces(result) - elif what == 'cluster': - self.describe_cluster(result) - elif what: - self.describe_element(result) - - except CQL_ERRORS as err: - err_msg = err.message if hasattr(err, 'message') else str(err) - self.printerr(err_msg.partition("message=")[2].strip('"')) - except Exception: - import traceback - self.printerr(traceback.format_exc()) - - if future: - if future.warnings: - self.print_warnings(future.warnings) - - do_desc = do_describe - - def describe_keyspaces(self, rows): - """ - Print the output for a DESCRIBE KEYSPACES query - """ - names = [r['name'] for r in rows] - - print('') - cmd.Cmd.columnize(self, names) - print('') - - def describe_list(self, rows): - """ - Print the output for all the DESCRIBE queries for element names (e.g DESCRIBE TABLES, DESCRIBE FUNCTIONS ...) - """ - keyspace = None - names = list() - for row in rows: - if row['keyspace_name'] != keyspace: - if keyspace is not None: - self.print_keyspace_element_names(keyspace, names) - - keyspace = row['keyspace_name'] - names = list() - - names.append(str(row['name'])) - - if keyspace is not None: - self.print_keyspace_element_names(keyspace, names) - print('') - - def print_keyspace_element_names(self, keyspace, names): - print('') - if self.current_keyspace is None: - print('Keyspace %s' % (keyspace)) - print('---------%s' % ('-' * len(keyspace))) - cmd.Cmd.columnize(self, names) - - def describe_element(self, rows): - """ - Print the output for all the DESCRIBE queries where an element name as been specified (e.g DESCRIBE TABLE, DESCRIBE INDEX ...) - """ - for row in rows: - print('') - self.query_out.write(row['create_statement']) - print('') - - def describe_cluster(self, rows): - """ - Print the output for a DESCRIBE CLUSTER query. - - If a specified keyspace was in use the returned ResultSet will contains a 'range_ownership' column, - otherwise not. 
- """ - for row in rows: - print('\nCluster: %s' % row['cluster']) - print('Partitioner: %s' % row['partitioner']) - print('Snitch: %s\n' % row['snitch']) - if 'range_ownership' in row: - print("Range ownership:") - for entry in list(row['range_ownership'].items()): - print(' %39s [%s]' % (entry[0], ', '.join([host for host in entry[1]]))) - print('') - - def do_copy(self, parsed): - r""" - COPY [cqlsh only] - - COPY x FROM: Imports CSV data into a Cassandra table - COPY x TO: Exports data from a Cassandra table in CSV format. - - COPY [ ( column [, ...] ) ] - FROM ( '' | STDIN ) - [ WITH