From 11644d5bbd6fe53c2b2b46214688fe05d78de433 Mon Sep 17 00:00:00 2001 From: Marc Handalian Date: Tue, 30 Jan 2024 13:52:42 -0800 Subject: [PATCH 1/7] Remove compounding retries within PrimaryShardReplicationSource (#12043) This change removes retries within PrimaryShardReplicationSource and relies on retries in one place at the start of replication. This is done within SegmentReplicationTargetService's processLatestReceivedCheckpoint after a failure/success occurs. The timeout on these retries is the cause of flaky failures from SegmentReplication's bwc test within IndexingIT, that can occur on node disconnect. The retries will persist for over ~1m to the same primary node that has been relocated/shut down and cause the test to timeout. This change also includes simplifications to the cancellation flow on the target service before the shard is closed. Previously we "request" a cancel that does not remove the target from the ongoing replications collection until a cancellation failure is thrown. The transport calls from PrimaryShardReplicationSource are no longer wrapped in CancellableThreads by the client so a call to "cancel" will not throw. Instead we now immediately remove the target and decref/close it. Signed-off-by: Marc Handalian --- .../org/opensearch/upgrades/IndexingIT.java | 1 - .../replication/SegmentReplicationIT.java | 61 +++++++++++++++++++ ...plicationUsingRemoteStoreDisruptionIT.java | 53 +++++++--------- .../PrimaryShardReplicationSource.java | 46 ++++++-------- .../replication/SegmentReplicationTarget.java | 22 +++---- .../SegmentReplicationTargetService.java | 28 ++++++--- .../PrimaryShardReplicationSourceTests.java | 38 ------------ .../SegmentReplicationTargetServiceTests.java | 7 ++- 8 files changed, 133 insertions(+), 123 deletions(-) diff --git a/qa/rolling-upgrade/src/test/java/org/opensearch/upgrades/IndexingIT.java b/qa/rolling-upgrade/src/test/java/org/opensearch/upgrades/IndexingIT.java index f963f8d221bb5..1577260e145d4 100644 --- a/qa/rolling-upgrade/src/test/java/org/opensearch/upgrades/IndexingIT.java +++ b/qa/rolling-upgrade/src/test/java/org/opensearch/upgrades/IndexingIT.java @@ -262,7 +262,6 @@ public void testIndexing() throws Exception { * @throws Exception if index creation fail * @throws UnsupportedOperationException if cluster type is unknown */ - @AwaitsFix(bugUrl = "https://github.com/opensearch-project/OpenSearch/issues/7679") public void testIndexingWithSegRep() throws Exception { if (UPGRADE_FROM_VERSION.before(Version.V_2_4_0)) { logger.info("--> Skip test for version {} where segment replication feature is not available", UPGRADE_FROM_VERSION); diff --git a/server/src/internalClusterTest/java/org/opensearch/indices/replication/SegmentReplicationIT.java b/server/src/internalClusterTest/java/org/opensearch/indices/replication/SegmentReplicationIT.java index 5511bc7945d65..4a848e92800cb 100644 --- a/server/src/internalClusterTest/java/org/opensearch/indices/replication/SegmentReplicationIT.java +++ b/server/src/internalClusterTest/java/org/opensearch/indices/replication/SegmentReplicationIT.java @@ -594,6 +594,67 @@ public void testCancellation() throws Exception { assertDocCounts(docCount, primaryNode); } + public void testCancellationDuringGetCheckpointInfo() throws Exception { + cancelDuringReplicaAction(SegmentReplicationSourceService.Actions.GET_CHECKPOINT_INFO); + } + + public void testCancellationDuringGetSegments() throws Exception { + cancelDuringReplicaAction(SegmentReplicationSourceService.Actions.GET_SEGMENT_FILES); + } + + private void cancelDuringReplicaAction(String actionToblock) throws Exception { + // this test stubs transport calls specific to node-node replication. + assumeFalse( + "Skipping the test as its not compatible with segment replication with remote store.", + segmentReplicationWithRemoteEnabled() + ); + final String primaryNode = internalCluster().startDataOnlyNode(); + createIndex(INDEX_NAME, Settings.builder().put(indexSettings()).put(IndexMetadata.SETTING_NUMBER_OF_REPLICAS, 1).build()); + ensureYellow(INDEX_NAME); + + final String replicaNode = internalCluster().startDataOnlyNode(); + ensureGreen(INDEX_NAME); + final SegmentReplicationTargetService targetService = internalCluster().getInstance( + SegmentReplicationTargetService.class, + replicaNode + ); + final IndexShard replicaShard = getIndexShard(replicaNode, INDEX_NAME); + CountDownLatch startCancellationLatch = new CountDownLatch(1); + CountDownLatch latch = new CountDownLatch(1); + + MockTransportService primaryTransportService = (MockTransportService) internalCluster().getInstance( + TransportService.class, + primaryNode + ); + primaryTransportService.addRequestHandlingBehavior(actionToblock, (handler, request, channel, task) -> { + logger.info("action {}", actionToblock); + try { + startCancellationLatch.countDown(); + latch.await(); + } catch (InterruptedException e) { + throw new RuntimeException(e); + } + }); + + // index a doc and trigger replication + client().prepareIndex(INDEX_NAME).setId("1").setSource("foo", "bar").setRefreshPolicy(WriteRequest.RefreshPolicy.IMMEDIATE).get(); + + // remove the replica and ensure it is cleaned up. + startCancellationLatch.await(); + SegmentReplicationTarget target = targetService.get(replicaShard.shardId()); + assertAcked( + client().admin() + .indices() + .prepareUpdateSettings(INDEX_NAME) + .setSettings(Settings.builder().put(IndexMetadata.SETTING_NUMBER_OF_REPLICAS, 0)) + ); + assertEquals("Replication not closed: " + target.getId(), 0, target.refCount()); + assertEquals("Store has a positive refCount", 0, replicaShard.store().refCount()); + // stop the replica, this will do additional checks on shutDown to ensure the replica and its store are closed properly + internalCluster().stopRandomNode(InternalTestCluster.nameFilter(replicaNode)); + latch.countDown(); + } + public void testStartReplicaAfterPrimaryIndexesDocs() throws Exception { final String primaryNode = internalCluster().startDataOnlyNode(); createIndex(INDEX_NAME, Settings.builder().put(indexSettings()).put(IndexMetadata.SETTING_NUMBER_OF_REPLICAS, 0).build()); diff --git a/server/src/internalClusterTest/java/org/opensearch/remotestore/SegmentReplicationUsingRemoteStoreDisruptionIT.java b/server/src/internalClusterTest/java/org/opensearch/remotestore/SegmentReplicationUsingRemoteStoreDisruptionIT.java index 8372135fc55c4..3d8d001b17ddf 100644 --- a/server/src/internalClusterTest/java/org/opensearch/remotestore/SegmentReplicationUsingRemoteStoreDisruptionIT.java +++ b/server/src/internalClusterTest/java/org/opensearch/remotestore/SegmentReplicationUsingRemoteStoreDisruptionIT.java @@ -23,8 +23,6 @@ import org.opensearch.indices.replication.SegmentReplicationState; import org.opensearch.indices.replication.SegmentReplicationTarget; import org.opensearch.indices.replication.SegmentReplicationTargetService; -import org.opensearch.indices.replication.common.ReplicationCollection; -import org.opensearch.test.InternalTestCluster; import org.opensearch.test.OpenSearchIntegTestCase; import org.opensearch.test.disruption.SlowClusterStateProcessing; @@ -33,6 +31,8 @@ import java.util.Set; import java.util.concurrent.TimeUnit; +import static org.opensearch.test.hamcrest.OpenSearchAssertions.assertAcked; + /** * This class runs tests with remote store + segRep while blocking file downloads */ @@ -59,22 +59,18 @@ public void testCancelReplicationWhileSyncingSegments() throws Exception { indexSingleDoc(); refresh(INDEX_NAME); waitForBlock(replicaNode, REPOSITORY_NAME, TimeValue.timeValueSeconds(10)); - final SegmentReplicationState state = targetService.getOngoingEventSegmentReplicationState(indexShard.shardId()); - assertEquals(SegmentReplicationState.Stage.GET_FILES, state.getStage()); - ReplicationCollection.ReplicationRef segmentReplicationTargetReplicationRef = targetService.get( - state.getReplicationId() - ); - final SegmentReplicationTarget segmentReplicationTarget = segmentReplicationTargetReplicationRef.get(); - // close the target ref here otherwise it will hold a refcount - segmentReplicationTargetReplicationRef.close(); + SegmentReplicationTarget segmentReplicationTarget = targetService.get(indexShard.shardId()); assertNotNull(segmentReplicationTarget); + assertEquals(SegmentReplicationState.Stage.GET_FILES, segmentReplicationTarget.state().getStage()); assertTrue(segmentReplicationTarget.refCount() > 0); - internalCluster().stopRandomNode(InternalTestCluster.nameFilter(primaryNode)); - assertBusy(() -> { - assertTrue(indexShard.routingEntry().primary()); - assertNull(targetService.getOngoingEventSegmentReplicationState(indexShard.shardId())); - assertEquals("Target should be closed", 0, segmentReplicationTarget.refCount()); - }); + assertAcked( + client().admin() + .indices() + .prepareUpdateSettings(INDEX_NAME) + .setSettings(Settings.builder().put(IndexMetadata.SETTING_NUMBER_OF_REPLICAS, 0)) + ); + assertNull(targetService.getOngoingEventSegmentReplicationState(indexShard.shardId())); + assertEquals("Target should be closed", 0, segmentReplicationTarget.refCount()); unblockNode(REPOSITORY_NAME, replicaNode); cleanupRepo(); } @@ -85,7 +81,6 @@ public void testCancelReplicationWhileFetchingMetadata() throws Exception { final Set dataNodeNames = internalCluster().getDataNodeNames(); final String replicaNode = getNode(dataNodeNames, false); - final String primaryNode = getNode(dataNodeNames, true); SegmentReplicationTargetService targetService = internalCluster().getInstance(SegmentReplicationTargetService.class, replicaNode); ensureGreen(INDEX_NAME); @@ -94,22 +89,18 @@ public void testCancelReplicationWhileFetchingMetadata() throws Exception { indexSingleDoc(); refresh(INDEX_NAME); waitForBlock(replicaNode, REPOSITORY_NAME, TimeValue.timeValueSeconds(10)); - final SegmentReplicationState state = targetService.getOngoingEventSegmentReplicationState(indexShard.shardId()); - assertEquals(SegmentReplicationState.Stage.GET_CHECKPOINT_INFO, state.getStage()); - ReplicationCollection.ReplicationRef segmentReplicationTargetReplicationRef = targetService.get( - state.getReplicationId() - ); - final SegmentReplicationTarget segmentReplicationTarget = segmentReplicationTargetReplicationRef.get(); - // close the target ref here otherwise it will hold a refcount - segmentReplicationTargetReplicationRef.close(); + SegmentReplicationTarget segmentReplicationTarget = targetService.get(indexShard.shardId()); assertNotNull(segmentReplicationTarget); + assertEquals(SegmentReplicationState.Stage.GET_CHECKPOINT_INFO, segmentReplicationTarget.state().getStage()); assertTrue(segmentReplicationTarget.refCount() > 0); - internalCluster().stopRandomNode(InternalTestCluster.nameFilter(primaryNode)); - assertBusy(() -> { - assertTrue(indexShard.routingEntry().primary()); - assertNull(targetService.getOngoingEventSegmentReplicationState(indexShard.shardId())); - assertEquals("Target should be closed", 0, segmentReplicationTarget.refCount()); - }); + assertAcked( + client().admin() + .indices() + .prepareUpdateSettings(INDEX_NAME) + .setSettings(Settings.builder().put(IndexMetadata.SETTING_NUMBER_OF_REPLICAS, 0)) + ); + assertNull(targetService.get(indexShard.shardId())); + assertEquals("Target should be closed", 0, segmentReplicationTarget.refCount()); unblockNode(REPOSITORY_NAME, replicaNode); cleanupRepo(); } diff --git a/server/src/main/java/org/opensearch/indices/replication/PrimaryShardReplicationSource.java b/server/src/main/java/org/opensearch/indices/replication/PrimaryShardReplicationSource.java index 02fc8feefd698..a17779810239a 100644 --- a/server/src/main/java/org/opensearch/indices/replication/PrimaryShardReplicationSource.java +++ b/server/src/main/java/org/opensearch/indices/replication/PrimaryShardReplicationSource.java @@ -8,16 +8,14 @@ package org.opensearch.indices.replication; -import org.apache.logging.log4j.LogManager; -import org.apache.logging.log4j.Logger; +import org.opensearch.action.ActionListenerResponseHandler; import org.opensearch.cluster.node.DiscoveryNode; import org.opensearch.core.action.ActionListener; -import org.opensearch.core.common.io.stream.Writeable; import org.opensearch.index.shard.IndexShard; import org.opensearch.index.store.StoreFileMetadata; import org.opensearch.indices.recovery.RecoverySettings; -import org.opensearch.indices.recovery.RetryableTransportClient; import org.opensearch.indices.replication.checkpoint.ReplicationCheckpoint; +import org.opensearch.threadpool.ThreadPool; import org.opensearch.transport.TransportRequestOptions; import org.opensearch.transport.TransportService; @@ -35,9 +33,7 @@ */ public class PrimaryShardReplicationSource implements SegmentReplicationSource { - private static final Logger logger = LogManager.getLogger(PrimaryShardReplicationSource.class); - - private final RetryableTransportClient transportClient; + private final TransportService transportService; private final DiscoveryNode sourceNode; private final DiscoveryNode targetNode; @@ -52,12 +48,7 @@ public PrimaryShardReplicationSource( DiscoveryNode sourceNode ) { this.targetAllocationId = targetAllocationId; - this.transportClient = new RetryableTransportClient( - transportService, - sourceNode, - recoverySettings.internalActionRetryTimeout(), - logger - ); + this.transportService = transportService; this.sourceNode = sourceNode; this.targetNode = targetNode; this.recoverySettings = recoverySettings; @@ -69,10 +60,14 @@ public void getCheckpointMetadata( ReplicationCheckpoint checkpoint, ActionListener listener ) { - final Writeable.Reader reader = CheckpointInfoResponse::new; - final ActionListener responseListener = ActionListener.map(listener, r -> r); final CheckpointInfoRequest request = new CheckpointInfoRequest(replicationId, targetAllocationId, targetNode, checkpoint); - transportClient.executeRetryableAction(GET_CHECKPOINT_INFO, request, responseListener, reader); + transportService.sendRequest( + sourceNode, + GET_CHECKPOINT_INFO, + request, + TransportRequestOptions.builder().withTimeout(recoverySettings.internalActionRetryTimeout()).build(), + new ActionListenerResponseHandler<>(listener, CheckpointInfoResponse::new, ThreadPool.Names.GENERIC) + ); } @Override @@ -88,8 +83,6 @@ public void getSegmentFiles( // MultiFileWriter takes care of progress tracking for downloads in this scenario // TODO: Move state management and tracking into replication methods and use chunking and data // copy mechanisms only from MultiFileWriter - final Writeable.Reader reader = GetSegmentFilesResponse::new; - final ActionListener responseListener = ActionListener.map(listener, r -> r); final GetSegmentFilesRequest request = new GetSegmentFilesRequest( replicationId, targetAllocationId, @@ -97,20 +90,17 @@ public void getSegmentFiles( filesToFetch, checkpoint ); - final TransportRequestOptions options = TransportRequestOptions.builder() - .withTimeout(recoverySettings.internalActionLongTimeout()) - .build(); - transportClient.executeRetryableAction(GET_SEGMENT_FILES, request, options, responseListener, reader); + transportService.sendRequest( + sourceNode, + GET_SEGMENT_FILES, + request, + TransportRequestOptions.builder().withTimeout(recoverySettings.internalActionLongTimeout()).build(), + new ActionListenerResponseHandler<>(listener, GetSegmentFilesResponse::new, ThreadPool.Names.GENERIC) + ); } @Override public String getDescription() { return sourceNode.getName(); } - - @Override - public void cancel() { - transportClient.cancel(); - } - } diff --git a/server/src/main/java/org/opensearch/indices/replication/SegmentReplicationTarget.java b/server/src/main/java/org/opensearch/indices/replication/SegmentReplicationTarget.java index cc71ef816e525..af764556b7549 100644 --- a/server/src/main/java/org/opensearch/indices/replication/SegmentReplicationTarget.java +++ b/server/src/main/java/org/opensearch/indices/replication/SegmentReplicationTarget.java @@ -83,6 +83,16 @@ protected void closeInternal() { } } + @Override + protected void onCancel(String reason) { + try { + notifyListener(new ReplicationFailedException(reason), false); + } finally { + source.cancel(); + cancellableThreads.cancel(reason); + } + } + @Override protected String getPrefix() { return REPLICATION_PREFIX + UUIDs.randomBase64UUID() + "."; @@ -320,16 +330,4 @@ private void finalizeReplication(CheckpointInfoResponse checkpointInfoResponse) } } } - - /** - * Trigger a cancellation, this method will not close the target a subsequent call to #fail is required from target service. - */ - @Override - public void cancel(String reason) { - if (finished.get() == false) { - logger.trace(new ParameterizedMessage("Cancelling replication for target {}", description())); - cancellableThreads.cancel(reason); - source.cancel(); - } - } } diff --git a/server/src/main/java/org/opensearch/indices/replication/SegmentReplicationTargetService.java b/server/src/main/java/org/opensearch/indices/replication/SegmentReplicationTargetService.java index d6db154a4e0e3..f28f829545d59 100644 --- a/server/src/main/java/org/opensearch/indices/replication/SegmentReplicationTargetService.java +++ b/server/src/main/java/org/opensearch/indices/replication/SegmentReplicationTargetService.java @@ -84,10 +84,6 @@ public class SegmentReplicationTargetService extends AbstractLifecycleComponent private final ClusterService clusterService; private final TransportService transportService; - public ReplicationRef get(long replicationId) { - return onGoingReplications.get(replicationId); - } - /** * The internal actions * @@ -158,6 +154,7 @@ protected void doStart() { @Override protected void doStop() { if (DiscoveryNode.isDataNode(clusterService.getSettings())) { + assert onGoingReplications.size() == 0 : "Replication collection should be empty on shutdown"; clusterService.removeListener(this); } } @@ -201,7 +198,7 @@ public void clusterChanged(ClusterChangedEvent event) { @Override public void beforeIndexShardClosed(ShardId shardId, @Nullable IndexShard indexShard, Settings indexSettings) { if (indexShard != null && indexShard.indexSettings().isSegRepEnabled()) { - onGoingReplications.requestCancel(indexShard.shardId(), "Shard closing"); + onGoingReplications.cancelForShard(indexShard.shardId(), "Shard closing"); latestReceivedCheckpoint.remove(shardId); } } @@ -223,7 +220,7 @@ public void afterIndexShardStarted(IndexShard indexShard) { @Override public void shardRoutingChanged(IndexShard indexShard, @Nullable ShardRouting oldRouting, ShardRouting newRouting) { if (oldRouting != null && indexShard.indexSettings().isSegRepEnabled() && oldRouting.primary() == false && newRouting.primary()) { - onGoingReplications.requestCancel(indexShard.shardId(), "Shard has been promoted to primary"); + onGoingReplications.cancelForShard(indexShard.shardId(), "Shard has been promoted to primary"); latestReceivedCheckpoint.remove(indexShard.shardId()); } } @@ -255,6 +252,14 @@ public SegmentReplicationState getSegmentReplicationState(ShardId shardId) { .orElseGet(() -> getlatestCompletedEventSegmentReplicationState(shardId)); } + public ReplicationRef get(long replicationId) { + return onGoingReplications.get(replicationId); + } + + public SegmentReplicationTarget get(ShardId shardId) { + return onGoingReplications.getOngoingReplicationTarget(shardId); + } + /** * Invoked when a new checkpoint is received from a primary shard. * It checks if a new checkpoint should be processed or not and starts replication if needed. @@ -454,7 +459,13 @@ protected boolean processLatestReceivedCheckpoint(IndexShard replicaShard, Threa latestPublishedCheckpoint ) ); - Runnable runnable = () -> onNewCheckpoint(latestReceivedCheckpoint.get(replicaShard.shardId()), replicaShard); + Runnable runnable = () -> { + // if we retry ensure the shard is not in the process of being closed. + // it will be removed from indexService's collection before the shard is actually marked as closed. + if (indicesService.getShardOrNull(replicaShard.shardId()) != null) { + onNewCheckpoint(latestReceivedCheckpoint.get(replicaShard.shardId()), replicaShard); + } + }; // Checks if we are using same thread and forks if necessary. if (thread == Thread.currentThread()) { threadPool.generic().execute(runnable); @@ -548,9 +559,6 @@ public ReplicationRunner(long replicationId) { @Override public void onFailure(Exception e) { - try (final ReplicationRef ref = onGoingReplications.get(replicationId)) { - logger.error(() -> new ParameterizedMessage("Error during segment replication, {}", ref.get().description()), e); - } onGoingReplications.fail(replicationId, new ReplicationFailedException("Unexpected Error during replication", e), false); } diff --git a/server/src/test/java/org/opensearch/indices/replication/PrimaryShardReplicationSourceTests.java b/server/src/test/java/org/opensearch/indices/replication/PrimaryShardReplicationSourceTests.java index e4dd32e5c6f70..2531790ede4af 100644 --- a/server/src/test/java/org/opensearch/indices/replication/PrimaryShardReplicationSourceTests.java +++ b/server/src/test/java/org/opensearch/indices/replication/PrimaryShardReplicationSourceTests.java @@ -15,7 +15,6 @@ import org.opensearch.cluster.service.ClusterService; import org.opensearch.common.settings.ClusterSettings; import org.opensearch.common.settings.Settings; -import org.opensearch.common.util.CancellableThreads; import org.opensearch.common.util.io.IOUtils; import org.opensearch.core.action.ActionListener; import org.opensearch.index.shard.IndexShard; @@ -27,12 +26,9 @@ import org.opensearch.test.ClusterServiceUtils; import org.opensearch.test.transport.CapturingTransport; import org.opensearch.transport.TransportService; -import org.junit.Assert; import java.util.Arrays; import java.util.Collections; -import java.util.concurrent.CountDownLatch; -import java.util.concurrent.TimeUnit; import static org.mockito.Mockito.mock; @@ -165,40 +161,6 @@ public void testTransportTimeoutForGetSegmentFilesAction() { assertEquals(recoverySettings.internalActionLongTimeout(), capturedRequest.options.timeout()); } - public void testGetSegmentFiles_CancelWhileRequestOpen() throws InterruptedException { - CountDownLatch latch = new CountDownLatch(1); - final ReplicationCheckpoint checkpoint = new ReplicationCheckpoint( - indexShard.shardId(), - PRIMARY_TERM, - SEGMENTS_GEN, - VERSION, - Codec.getDefault().getName() - ); - StoreFileMetadata testMetadata = new StoreFileMetadata("testFile", 1L, "checksum", Version.LATEST); - replicationSource.getSegmentFiles( - REPLICATION_ID, - checkpoint, - Arrays.asList(testMetadata), - mock(IndexShard.class), - (fileName, bytesRecovered) -> {}, - new ActionListener<>() { - @Override - public void onResponse(GetSegmentFilesResponse getSegmentFilesResponse) { - Assert.fail("onFailure response expected."); - } - - @Override - public void onFailure(Exception e) { - assertEquals(e.getClass(), CancellableThreads.ExecutionCancelledException.class); - latch.countDown(); - } - } - ); - replicationSource.cancel(); - latch.await(2, TimeUnit.SECONDS); - assertEquals("listener should have resolved in a failure", 0, latch.getCount()); - } - private DiscoveryNode newDiscoveryNode(String nodeName) { return new DiscoveryNode( nodeName, diff --git a/server/src/test/java/org/opensearch/indices/replication/SegmentReplicationTargetServiceTests.java b/server/src/test/java/org/opensearch/indices/replication/SegmentReplicationTargetServiceTests.java index f284a425a417b..3c72dda2d8b5d 100644 --- a/server/src/test/java/org/opensearch/indices/replication/SegmentReplicationTargetServiceTests.java +++ b/server/src/test/java/org/opensearch/indices/replication/SegmentReplicationTargetServiceTests.java @@ -620,6 +620,7 @@ public void testForceSegmentSyncHandlerWithFailure_AlreadyClosedException_swallo } public void testTargetCancelledBeforeStartInvoked() { + final String cancelReason = "test"; final SegmentReplicationTarget target = new SegmentReplicationTarget( replicaShard, primaryShard.getLatestReplicationCheckpoint(), @@ -633,12 +634,12 @@ public void onReplicationDone(SegmentReplicationState state) { @Override public void onReplicationFailure(SegmentReplicationState state, ReplicationFailedException e, boolean sendShardFailure) { // failures leave state object in last entered stage. - assertEquals(SegmentReplicationState.Stage.GET_CHECKPOINT_INFO, state.getStage()); - assertTrue(e.getCause() instanceof CancellableThreads.ExecutionCancelledException); + assertEquals(SegmentReplicationState.Stage.INIT, state.getStage()); + assertEquals(cancelReason, e.getMessage()); } } ); - target.cancel("test"); + target.cancel(cancelReason); sut.startReplication(target); } From 7526f8608adf5aa468b75db7ea99d3ab72a468fa Mon Sep 17 00:00:00 2001 From: Thomas Seidl Date: Tue, 30 Jan 2024 23:37:09 +0100 Subject: [PATCH 2/7] Fixed typo in DEVELOPER_GUIDE.md. (#12051) Signed-off-by: Thomas Seidl --- DEVELOPER_GUIDE.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/DEVELOPER_GUIDE.md b/DEVELOPER_GUIDE.md index 21adbb0305ab1..f0851fc58d444 100644 --- a/DEVELOPER_GUIDE.md +++ b/DEVELOPER_GUIDE.md @@ -348,7 +348,7 @@ Please follow these formatting guidelines: * Wildcard imports (`import foo.bar.baz.*`) are forbidden and will cause the build to fail. * If *absolutely* necessary, you can disable formatting for regions of code with the `// tag::NAME` and `// end::NAME` directives, but note that these are intended for use in documentation, so please make it clear what you have done, and only do this where the benefit clearly outweighs the decrease in consistency. * Note that JavaDoc and block comments i.e. `/* ... */` are not formatted, but line comments i.e `// ...` are. -* There is an implicit rule that negative boolean expressions should use the form `foo == false` instead of `!foo` for better readability of the code. While this isn't strictly enforced, if might get called out in PR reviews as something to change. +* There is an implicit rule that negative boolean expressions should use the form `foo == false` instead of `!foo` for better readability of the code. While this isn't strictly enforced, it might get called out in PR reviews as something to change. ## Adding Dependencies From 2f8d267adbdfef175004912e877bdbd0814f27f1 Mon Sep 17 00:00:00 2001 From: Poojita Raj Date: Tue, 30 Jan 2024 16:15:41 -0800 Subject: [PATCH 3/7] Fix flaky test SegmentReplicationStatsIT.testMultipleIndices (#12070) Signed-off-by: Poojita Raj --- .../SegmentReplicationStatsIT.java | 61 ++++++++++--------- 1 file changed, 32 insertions(+), 29 deletions(-) diff --git a/server/src/internalClusterTest/java/org/opensearch/indices/replication/SegmentReplicationStatsIT.java b/server/src/internalClusterTest/java/org/opensearch/indices/replication/SegmentReplicationStatsIT.java index 766471fdc0756..89aef6f0be1a6 100644 --- a/server/src/internalClusterTest/java/org/opensearch/indices/replication/SegmentReplicationStatsIT.java +++ b/server/src/internalClusterTest/java/org/opensearch/indices/replication/SegmentReplicationStatsIT.java @@ -268,12 +268,12 @@ public void testMultipleIndices() throws Exception { internalCluster().startClusterManagerOnlyNode(); final String index_2 = "tst-index-2"; List nodes = new ArrayList<>(); - final String primaryNode = internalCluster().startNode(); + final String primaryNode = internalCluster().startDataOnlyNode(); nodes.add(primaryNode); createIndex(INDEX_NAME, index_2); ensureYellowAndNoInitializingShards(INDEX_NAME, index_2); - nodes.add(internalCluster().startNode()); + nodes.add(internalCluster().startDataOnlyNode()); ensureGreen(INDEX_NAME, index_2); final long numDocs = scaledRandomIntBetween(50, 100); @@ -284,6 +284,7 @@ public void testMultipleIndices() throws Exception { refresh(INDEX_NAME, index_2); waitForSearchableDocs(INDEX_NAME, numDocs, nodes); waitForSearchableDocs(index_2, numDocs, nodes); + ensureSearchable(INDEX_NAME, index_2); final IndexShard index_1_primary = getIndexShard(primaryNode, INDEX_NAME); final IndexShard index_2_primary = getIndexShard(primaryNode, index_2); @@ -291,37 +292,39 @@ public void testMultipleIndices() throws Exception { assertTrue(index_1_primary.routingEntry().primary()); assertTrue(index_2_primary.routingEntry().primary()); - // test both indices are returned in the response. - SegmentReplicationStatsResponse segmentReplicationStatsResponse = client().admin() - .indices() - .prepareSegmentReplicationStats() - .execute() - .actionGet(); + assertBusy(() -> { + // test both indices are returned in the response. + SegmentReplicationStatsResponse segmentReplicationStatsResponse = dataNodeClient().admin() + .indices() + .prepareSegmentReplicationStats() + .execute() + .actionGet(); - Map> replicationStats = segmentReplicationStatsResponse.getReplicationStats(); - assertEquals(2, replicationStats.size()); - List replicationPerGroupStats = replicationStats.get(INDEX_NAME); - assertEquals(1, replicationPerGroupStats.size()); - SegmentReplicationPerGroupStats perGroupStats = replicationPerGroupStats.get(0); - assertEquals(perGroupStats.getShardId(), index_1_primary.shardId()); - Set replicaStats = perGroupStats.getReplicaStats(); - assertEquals(1, replicaStats.size()); - for (SegmentReplicationShardStats replica : replicaStats) { - assertNotNull(replica.getCurrentReplicationState()); - } + Map> replicationStats = segmentReplicationStatsResponse.getReplicationStats(); + assertEquals(2, replicationStats.size()); + List replicationPerGroupStats = replicationStats.get(INDEX_NAME); + assertEquals(1, replicationPerGroupStats.size()); + SegmentReplicationPerGroupStats perGroupStats = replicationPerGroupStats.get(0); + assertEquals(perGroupStats.getShardId(), index_1_primary.shardId()); + Set replicaStats = perGroupStats.getReplicaStats(); + assertEquals(1, replicaStats.size()); + for (SegmentReplicationShardStats replica : replicaStats) { + assertNotNull(replica.getCurrentReplicationState()); + } - replicationPerGroupStats = replicationStats.get(index_2); - assertEquals(1, replicationPerGroupStats.size()); - perGroupStats = replicationPerGroupStats.get(0); - assertEquals(perGroupStats.getShardId(), index_2_primary.shardId()); - replicaStats = perGroupStats.getReplicaStats(); - assertEquals(1, replicaStats.size()); - for (SegmentReplicationShardStats replica : replicaStats) { - assertNotNull(replica.getCurrentReplicationState()); - } + replicationPerGroupStats = replicationStats.get(index_2); + assertEquals(1, replicationPerGroupStats.size()); + perGroupStats = replicationPerGroupStats.get(0); + assertEquals(perGroupStats.getShardId(), index_2_primary.shardId()); + replicaStats = perGroupStats.getReplicaStats(); + assertEquals(1, replicaStats.size()); + for (SegmentReplicationShardStats replica : replicaStats) { + assertNotNull(replica.getCurrentReplicationState()); + } + }, 30, TimeUnit.SECONDS); // test only single index queried. - segmentReplicationStatsResponse = client().admin() + SegmentReplicationStatsResponse segmentReplicationStatsResponse = dataNodeClient().admin() .indices() .prepareSegmentReplicationStats() .setIndices(index_2) From c6cebc7a5ab0de954ab7c70f2c5415f888e23f84 Mon Sep 17 00:00:00 2001 From: Ashish Date: Wed, 31 Jan 2024 16:44:19 +0530 Subject: [PATCH 4/7] Fix flaky RemoteIndexRecoveryIT testRerouteRecovery test #9580 (#11918) Signed-off-by: Ashish Singh --- .../indices/recovery/IndexRecoveryIT.java | 4 +-- .../remotestore/RemoteIndexRecoveryIT.java | 6 ---- .../opensearch/test/OpenSearchTestCase.java | 33 +++++++++++++++++++ 3 files changed, 35 insertions(+), 8 deletions(-) diff --git a/server/src/internalClusterTest/java/org/opensearch/indices/recovery/IndexRecoveryIT.java b/server/src/internalClusterTest/java/org/opensearch/indices/recovery/IndexRecoveryIT.java index e4f1f8717f899..72e680e22ed75 100644 --- a/server/src/internalClusterTest/java/org/opensearch/indices/recovery/IndexRecoveryIT.java +++ b/server/src/internalClusterTest/java/org/opensearch/indices/recovery/IndexRecoveryIT.java @@ -523,12 +523,12 @@ public void testRerouteRecovery() throws Exception { logger.info("--> waiting for recovery to start both on source and target"); final Index index = resolveIndex(INDEX_NAME); - assertBusy(() -> { + assertBusyWithFixedSleepTime(() -> { IndicesService indicesService = internalCluster().getInstance(IndicesService.class, nodeA); assertThat(indicesService.indexServiceSafe(index).getShard(0).recoveryStats().currentAsSource(), equalTo(1)); indicesService = internalCluster().getInstance(IndicesService.class, nodeB); assertThat(indicesService.indexServiceSafe(index).getShard(0).recoveryStats().currentAsTarget(), equalTo(1)); - }); + }, TimeValue.timeValueSeconds(10), TimeValue.timeValueMillis(500)); logger.info("--> request recoveries"); RecoveryResponse response = client().admin().indices().prepareRecoveries(INDEX_NAME).execute().actionGet(); diff --git a/server/src/internalClusterTest/java/org/opensearch/remotestore/RemoteIndexRecoveryIT.java b/server/src/internalClusterTest/java/org/opensearch/remotestore/RemoteIndexRecoveryIT.java index c957f1b338bfe..6de61cf203c60 100644 --- a/server/src/internalClusterTest/java/org/opensearch/remotestore/RemoteIndexRecoveryIT.java +++ b/server/src/internalClusterTest/java/org/opensearch/remotestore/RemoteIndexRecoveryIT.java @@ -157,10 +157,4 @@ public void testDisconnectsDuringRecovery() { public void testReplicaRecovery() { } - - @AwaitsFix(bugUrl = "https://github.com/opensearch-project/OpenSearch/issues/9580") - public void testRerouteRecovery() { - - } - } diff --git a/test/framework/src/main/java/org/opensearch/test/OpenSearchTestCase.java b/test/framework/src/main/java/org/opensearch/test/OpenSearchTestCase.java index b5ff30deecf5c..96bffcf2d3692 100644 --- a/test/framework/src/main/java/org/opensearch/test/OpenSearchTestCase.java +++ b/test/framework/src/main/java/org/opensearch/test/OpenSearchTestCase.java @@ -83,6 +83,7 @@ import org.opensearch.common.settings.Settings; import org.opensearch.common.time.DateUtils; import org.opensearch.common.time.FormatNames; +import org.opensearch.common.unit.TimeValue; import org.opensearch.common.util.MockBigArrays; import org.opensearch.common.util.MockPageCacheRecycler; import org.opensearch.common.util.concurrent.ThreadContext; @@ -1095,6 +1096,38 @@ public static void assertBusy(CheckedRunnable codeBlock, long maxWait } } + /** + * Runs the code block for the provided max wait time and sleeping for fixed sleep time, waiting for no assertions to trip. + */ + public static void assertBusyWithFixedSleepTime(CheckedRunnable codeBlock, TimeValue maxWaitTime, TimeValue sleepTime) + throws Exception { + long maxTimeInMillis = maxWaitTime.millis(); + long sleepTimeInMillis = sleepTime.millis(); + if (sleepTimeInMillis > maxTimeInMillis) { + throw new IllegalArgumentException("sleepTime is more than the maxWaitTime"); + } + long sum = 0; + List failures = new ArrayList<>(); + while (sum <= maxTimeInMillis) { + try { + codeBlock.run(); + return; + } catch (AssertionError e) { + failures.add(e); + } + sum += sleepTimeInMillis; + Thread.sleep(sleepTimeInMillis); + } + try { + codeBlock.run(); + } catch (AssertionError e) { + for (AssertionError failure : failures) { + e.addSuppressed(failure); + } + throw e; + } + } + /** * Periodically execute the supplied function until it returns true, or a timeout * is reached. This version uses a timeout of 10 seconds. If at all possible, From bf5e628d82adf038f618d894a309d8dd57655fcd Mon Sep 17 00:00:00 2001 From: Varun Bansal Date: Wed, 31 Jan 2024 21:30:13 +0530 Subject: [PATCH 5/7] fix flaky test - handle IllegalStateException and throw AssertionError to use assertbusy (#11981) * fix flaky test - handle IllegalStateException and throw AssertionError to use assertbusy Signed-off-by: bansvaru * Empty Commit to retry the build Signed-off-by: bansvaru --------- Signed-off-by: bansvaru --- .../RemoteStoreClusterStateRestoreIT.java | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/server/src/internalClusterTest/java/org/opensearch/remotestore/RemoteStoreClusterStateRestoreIT.java b/server/src/internalClusterTest/java/org/opensearch/remotestore/RemoteStoreClusterStateRestoreIT.java index c61e2ec6e4f6c..3f90732f1f13d 100644 --- a/server/src/internalClusterTest/java/org/opensearch/remotestore/RemoteStoreClusterStateRestoreIT.java +++ b/server/src/internalClusterTest/java/org/opensearch/remotestore/RemoteStoreClusterStateRestoreIT.java @@ -310,10 +310,16 @@ private void validateCurrentMetadata() throws Exception { internalCluster().getClusterManagerName() ); assertBusy(() -> { - ClusterMetadataManifest manifest = remoteClusterStateService.getLatestClusterMetadataManifest( - getClusterState().getClusterName().value(), - getClusterState().metadata().clusterUUID() - ).get(); + ClusterMetadataManifest manifest; + try { + manifest = remoteClusterStateService.getLatestClusterMetadataManifest( + getClusterState().getClusterName().value(), + getClusterState().metadata().clusterUUID() + ).get(); + } catch (IllegalStateException e) { + // AssertionError helps us use assertBusy and retry validation if failed due to a race condition. + throw new AssertionError("Error while validating latest cluster metadata", e); + } ClusterState clusterState = getClusterState(); Metadata currentMetadata = clusterState.metadata(); assertEquals(currentMetadata.indices().size(), manifest.getIndices().size()); From cfcb128a641bb17e3a3d3f649ff087f7bd138595 Mon Sep 17 00:00:00 2001 From: Sarthak Aggarwal Date: Wed, 31 Jan 2024 22:46:39 +0530 Subject: [PATCH 6/7] fix flaky 11610 (#12049) Signed-off-by: Sarthak Aggarwal --- .../java/org/opensearch/indices/state/CloseIndexIT.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/server/src/internalClusterTest/java/org/opensearch/indices/state/CloseIndexIT.java b/server/src/internalClusterTest/java/org/opensearch/indices/state/CloseIndexIT.java index 547f9e7a8d380..87e5df8c48981 100644 --- a/server/src/internalClusterTest/java/org/opensearch/indices/state/CloseIndexIT.java +++ b/server/src/internalClusterTest/java/org/opensearch/indices/state/CloseIndexIT.java @@ -287,7 +287,7 @@ public void testCloseWhileDeletingIndices() throws Exception { throw new AssertionError(e); } try { - assertAcked(client().admin().indices().prepareDelete(indexToDelete)); + assertAcked(client().admin().indices().prepareDelete(indexToDelete).setTimeout("60s")); } catch (final Exception e) { assertException(e, indexToDelete); } @@ -301,7 +301,7 @@ public void testCloseWhileDeletingIndices() throws Exception { throw new AssertionError(e); } try { - client().admin().indices().prepareClose(indexToClose).get(); + client().admin().indices().prepareClose(indexToClose).setTimeout("60s").get(); } catch (final Exception e) { assertException(e, indexToClose); } From 16c52577d4f9a2544614d119e0cb6c6fc0d2afa5 Mon Sep 17 00:00:00 2001 From: Sarat Vemulapalli Date: Wed, 31 Jan 2024 12:57:33 -0800 Subject: [PATCH 7/7] Add additional logging for Azure repository stats (#12079) * Adding logging for AzureStats Signed-off-by: Sarat Vemulapalli * Debugging Signed-off-by: Sarat Vemulapalli * Removing debugging lines Signed-off-by: Sarat Vemulapalli * Addressing comments Signed-off-by: Sarat Vemulapalli * Addressing feedback 2 Signed-off-by: Sarat Vemulapalli --------- Signed-off-by: Sarat Vemulapalli Signed-off-by: Sarat Vemulapalli --- .../repositories/azure/AzureBlobStoreRepositoryTests.java | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/plugins/repository-azure/src/internalClusterTest/java/org/opensearch/repositories/azure/AzureBlobStoreRepositoryTests.java b/plugins/repository-azure/src/internalClusterTest/java/org/opensearch/repositories/azure/AzureBlobStoreRepositoryTests.java index 986720ec431fe..1ba16422c9214 100644 --- a/plugins/repository-azure/src/internalClusterTest/java/org/opensearch/repositories/azure/AzureBlobStoreRepositoryTests.java +++ b/plugins/repository-azure/src/internalClusterTest/java/org/opensearch/repositories/azure/AzureBlobStoreRepositoryTests.java @@ -39,6 +39,8 @@ import com.azure.storage.common.implementation.Constants; import com.azure.storage.common.policy.RequestRetryOptions; import com.azure.storage.common.policy.RetryPolicyType; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; import org.opensearch.common.SuppressForbidden; import org.opensearch.common.regex.Regex; import org.opensearch.common.settings.MockSecureSettings; @@ -188,6 +190,7 @@ protected String requestUniqueId(final HttpExchange exchange) { @SuppressForbidden(reason = "this test uses a HttpServer to emulate an Azure endpoint") private static class AzureHTTPStatsCollectorHandler extends HttpStatsCollectorHandler { + private static final Logger testLogger = LogManager.getLogger(AzureHTTPStatsCollectorHandler.class); private static final Pattern listPattern = Pattern.compile("GET /[a-zA-Z0-9]+\\??.+"); private static final Pattern getPattern = Pattern.compile("GET /[^?/]+/[^?/]+\\??.*"); @@ -197,6 +200,7 @@ private AzureHTTPStatsCollectorHandler(HttpHandler delegate) { @Override protected void maybeTrack(String request, Headers headers) { + testLogger.info(request, headers); if (getPattern.matcher(request).matches()) { trackRequest("GetBlob"); } else if (Regex.simpleMatch("HEAD /*/*", request)) {