use of org.opensearch.cluster.coordination.FailedToCommitClusterStateException in project OpenSearch by opensearch-project.
the class MasterServiceTests method testAcking.
public void testAcking() throws InterruptedException {
final DiscoveryNode node1 = new DiscoveryNode("node1", buildNewFakeTransportAddress(), emptyMap(), emptySet(), Version.CURRENT);
final DiscoveryNode node2 = new DiscoveryNode("node2", buildNewFakeTransportAddress(), emptyMap(), emptySet(), Version.CURRENT);
final DiscoveryNode node3 = new DiscoveryNode("node3", buildNewFakeTransportAddress(), emptyMap(), emptySet(), Version.CURRENT);
try (MasterService masterService = new MasterService(Settings.builder().put(ClusterName.CLUSTER_NAME_SETTING.getKey(), MasterServiceTests.class.getSimpleName()).put(Node.NODE_NAME_SETTING.getKey(), "test_node").build(), new ClusterSettings(Settings.EMPTY, ClusterSettings.BUILT_IN_CLUSTER_SETTINGS), threadPool)) {
final ClusterState initialClusterState = ClusterState.builder(new ClusterName(MasterServiceTests.class.getSimpleName())).nodes(DiscoveryNodes.builder().add(node1).add(node2).add(node3).localNodeId(node1.getId()).masterNodeId(node1.getId())).blocks(ClusterBlocks.EMPTY_CLUSTER_BLOCK).build();
final AtomicReference<ClusterStatePublisher> publisherRef = new AtomicReference<>();
masterService.setClusterStatePublisher((e, pl, al) -> publisherRef.get().publish(e, pl, al));
masterService.setClusterStateSupplier(() -> initialClusterState);
masterService.start();
// check that we don't time out before even committing the cluster state
{
final CountDownLatch latch = new CountDownLatch(1);
publisherRef.set((clusterChangedEvent, publishListener, ackListener) -> publishListener.onFailure(new FailedToCommitClusterStateException("mock exception")));
masterService.submitStateUpdateTask("test2", new AckedClusterStateUpdateTask<Void>(null, null) {
@Override
public ClusterState execute(ClusterState currentState) {
return ClusterState.builder(currentState).build();
}
@Override
public TimeValue ackTimeout() {
return TimeValue.ZERO;
}
@Override
public TimeValue timeout() {
return null;
}
@Override
public void clusterStateProcessed(String source, ClusterState oldState, ClusterState newState) {
fail();
}
@Override
protected Void newResponse(boolean acknowledged) {
fail();
return null;
}
@Override
public void onFailure(String source, Exception e) {
latch.countDown();
}
@Override
public void onAckTimeout() {
fail();
}
});
latch.await();
}
// check that we timeout if commit took too long
{
final CountDownLatch latch = new CountDownLatch(2);
final TimeValue ackTimeout = TimeValue.timeValueMillis(randomInt(100));
publisherRef.set((clusterChangedEvent, publishListener, ackListener) -> {
publishListener.onResponse(null);
ackListener.onCommit(TimeValue.timeValueMillis(ackTimeout.millis() + randomInt(100)));
ackListener.onNodeAck(node1, null);
ackListener.onNodeAck(node2, null);
ackListener.onNodeAck(node3, null);
});
masterService.submitStateUpdateTask("test2", new AckedClusterStateUpdateTask<Void>(null, null) {
@Override
public ClusterState execute(ClusterState currentState) {
return ClusterState.builder(currentState).build();
}
@Override
public TimeValue ackTimeout() {
return ackTimeout;
}
@Override
public TimeValue timeout() {
return null;
}
@Override
public void clusterStateProcessed(String source, ClusterState oldState, ClusterState newState) {
latch.countDown();
}
@Override
protected Void newResponse(boolean acknowledged) {
fail();
return null;
}
@Override
public void onFailure(String source, Exception e) {
fail();
}
@Override
public void onAckTimeout() {
latch.countDown();
}
});
latch.await();
}
}
}
use of org.opensearch.cluster.coordination.FailedToCommitClusterStateException in project OpenSearch by opensearch-project.
the class BatchedRerouteServiceTests method testNotifiesOnFailure.
public void testNotifiesOnFailure() throws InterruptedException {
final BatchedRerouteService batchedRerouteService = new BatchedRerouteService(clusterService, (s, r) -> {
if (rarely()) {
throw new OpenSearchException("simulated");
}
return randomBoolean() ? s : ClusterState.builder(s).build();
});
final int iterations = between(1, 100);
final CountDownLatch countDownLatch = new CountDownLatch(iterations);
for (int i = 0; i < iterations; i++) {
batchedRerouteService.reroute("iteration " + i, randomFrom(EnumSet.allOf(Priority.class)), ActionListener.wrap(r -> {
countDownLatch.countDown();
if (rarely()) {
throw new OpenSearchException("failure during notification");
}
}, e -> {
countDownLatch.countDown();
if (randomBoolean()) {
throw new OpenSearchException("failure during failure notification", e);
}
}));
if (rarely()) {
clusterService.getMasterService().setClusterStatePublisher(randomBoolean() ? ClusterServiceUtils.createClusterStatePublisher(clusterService.getClusterApplierService()) : (event, publishListener, ackListener) -> publishListener.onFailure(new FailedToCommitClusterStateException("simulated")));
}
if (rarely()) {
clusterService.getClusterApplierService().onNewClusterState("simulated", () -> {
ClusterState state = clusterService.state();
return ClusterState.builder(state).nodes(DiscoveryNodes.builder(state.nodes()).masterNodeId(randomBoolean() ? null : state.nodes().getLocalNodeId())).build();
}, (source, e) -> {
});
}
}
// i.e. it doesn't leak any listeners
assertTrue(countDownLatch.await(10, TimeUnit.SECONDS));
}
use of org.opensearch.cluster.coordination.FailedToCommitClusterStateException in project OpenSearch by opensearch-project.
the class TransportMasterNodeActionTests method testMasterFailoverAfterStepDown.
public void testMasterFailoverAfterStepDown() throws ExecutionException, InterruptedException {
Request request = new Request().masterNodeTimeout(TimeValue.timeValueHours(1));
PlainActionFuture<Response> listener = new PlainActionFuture<>();
final Response response = new Response();
setState(clusterService, ClusterStateCreationUtils.state(localNode, localNode, allNodes));
new Action("internal:testAction", transportService, clusterService, threadPool) {
@Override
protected void masterOperation(Request request, ClusterState state, ActionListener<Response> listener) throws Exception {
// The other node has become master, simulate failures of this node while publishing cluster state through ZenDiscovery
setState(clusterService, ClusterStateCreationUtils.state(localNode, remoteNode, allNodes));
Exception failure = randomBoolean() ? new FailedToCommitClusterStateException("Fake error") : new NotMasterException("Fake error");
listener.onFailure(failure);
}
}.execute(request, listener);
assertThat(transport.capturedRequests().length, equalTo(1));
CapturingTransport.CapturedRequest capturedRequest = transport.capturedRequests()[0];
assertTrue(capturedRequest.node.isMasterNode());
assertThat(capturedRequest.request, equalTo(request));
assertThat(capturedRequest.action, equalTo("internal:testAction"));
transport.handleResponse(capturedRequest.requestId, response);
assertTrue(listener.isDone());
assertThat(listener.get(), equalTo(response));
}
use of org.opensearch.cluster.coordination.FailedToCommitClusterStateException in project OpenSearch by opensearch-project.
the class MasterService method onPublicationFailed.
void onPublicationFailed(ClusterChangedEvent clusterChangedEvent, TaskOutputs taskOutputs, long startTimeMillis, Exception exception) {
if (exception instanceof FailedToCommitClusterStateException) {
final long version = clusterChangedEvent.state().version();
logger.warn(() -> new ParameterizedMessage("failing [{}]: failed to commit cluster state version [{}]", clusterChangedEvent.source(), version), exception);
taskOutputs.publishingFailed((FailedToCommitClusterStateException) exception);
} else {
handleException(clusterChangedEvent.source(), startTimeMillis, clusterChangedEvent.state(), exception);
}
}
use of org.opensearch.cluster.coordination.FailedToCommitClusterStateException in project OpenSearch by opensearch-project.
the class ShardStateActionTests method testMasterChannelException.
public void testMasterChannelException() throws InterruptedException {
final String index = "test";
setState(clusterService, ClusterStateCreationUtils.stateWithActivePrimary(index, true, randomInt(5)));
CountDownLatch latch = new CountDownLatch(1);
AtomicInteger retries = new AtomicInteger();
AtomicBoolean success = new AtomicBoolean();
AtomicReference<Throwable> throwable = new AtomicReference<>();
LongConsumer retryLoop = requestId -> {
if (randomBoolean()) {
transport.handleRemoteError(requestId, randomFrom(new NotMasterException("simulated"), new FailedToCommitClusterStateException("simulated")));
} else {
if (randomBoolean()) {
transport.handleLocalError(requestId, new NodeNotConnectedException(null, "simulated"));
} else {
transport.handleError(requestId, new NodeDisconnectedException(null, ShardStateAction.SHARD_FAILED_ACTION_NAME));
}
}
};
final int numberOfRetries = randomIntBetween(1, 256);
setUpMasterRetryVerification(numberOfRetries, retries, latch, retryLoop);
ShardRouting failedShard = getRandomShardRouting(index);
shardStateAction.localShardFailed(failedShard, "test", getSimulatedFailure(), new ActionListener<Void>() {
@Override
public void onResponse(Void aVoid) {
success.set(true);
latch.countDown();
}
@Override
public void onFailure(Exception e) {
success.set(false);
throwable.set(e);
latch.countDown();
assert false;
}
});
final CapturingTransport.CapturedRequest[] capturedRequests = transport.getCapturedRequestsAndClear();
assertThat(capturedRequests.length, equalTo(1));
assertFalse(success.get());
assertThat(retries.get(), equalTo(0));
retryLoop.accept(capturedRequests[0].requestId);
latch.await();
assertNull(throwable.get());
assertThat(retries.get(), equalTo(numberOfRetries));
assertTrue(success.get());
}
Aggregations