Use of org.elasticsearch.test.disruption.NetworkDisruption in project crate by crate.
From class MasterDisruptionIT, method testIsolateMasterAndVerifyClusterStateConsensus:
/**
 * This test isolates the master from the rest of the cluster, waits for a new master to be elected,
 * restores the partition and verifies that all nodes agree on the new cluster state.
 */
@TestLogging("_root:DEBUG," +
             "org.elasticsearch.cluster.service:TRACE," +
             "org.elasticsearch.gateway:TRACE," +
             "org.elasticsearch.indices.store:TRACE")
@Test
public void testIsolateMasterAndVerifyClusterStateConsensus() throws Exception {
    final List<String> nodes = startCluster(3);

    int numberOfShards = 1 + randomInt(2);
    int numberOfReplicas = randomInt(2);
    logger.info("creating table t with {} shards and {} replicas", numberOfShards, numberOfReplicas);
    execute("create table t (id int primary key, x string) clustered into " + numberOfShards +
            " shards with (number_of_replicas = " + numberOfReplicas + " )");
    ensureGreen();

    String isolatedNode = internalCluster().getMasterName();
    TwoPartitions partitions = isolateNode(isolatedNode);
    NetworkDisruption networkDisruption = addRandomDisruptionType(partitions);
    networkDisruption.startDisrupting();

    String nonIsolatedNode = partitions.getMajoritySide().iterator().next();

    // make sure cluster reforms
    ensureStableCluster(2, nonIsolatedNode);
    // make sure the isolated node picks up on things.
    assertNoMaster(isolatedNode, TimeValue.timeValueSeconds(40));

    // restore isolation
    networkDisruption.stopDisrupting();

    for (String node : nodes) {
        ensureStableCluster(3,
            new TimeValue(DISRUPTION_HEALING_OVERHEAD.millis() + networkDisruption.expectedTimeToHeal().millis()),
            true,
            node);
    }

    logger.info("issue a reroute");
    // trigger a reroute now, instead of waiting for the background reroute of RerouteService
    execute("ALTER CLUSTER REROUTE RETRY FAILED");
    // and wait for it to finish and for the cluster to stabilize
    ensureGreen();

    // verify all cluster states are the same
    // use assert busy to wait for cluster states to be applied (as publish_timeout has low value)
    assertBusy(() -> {
        ClusterState state = null;
        for (String node : nodes) {
            ClusterState nodeState = getNodeClusterState(node);
            if (state == null) {
                state = nodeState;
                continue;
            }
            // assert nodes are identical
            try {
                assertEquals("unequal versions", state.version(), nodeState.version());
                assertEquals("unequal node count", state.nodes().getSize(), nodeState.nodes().getSize());
                assertEquals("different masters ", state.nodes().getMasterNodeId(), nodeState.nodes().getMasterNodeId());
                assertEquals("different meta data version", state.metadata().version(), nodeState.metadata().version());
                assertEquals("different routing", state.routingTable().toString(), nodeState.routingTable().toString());
            } catch (AssertionError t) {
                fail("failed comparing cluster state: " + t.getMessage() + "\n" +
                     "--- cluster state of node [" + nodes.get(0) + "]: ---\n" + state +
                     "\n--- cluster state [" + node + "]: ---\n" + nodeState);
            }
        }
    });
}
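The helpers isolateNode and addRandomDisruptionType used above come from the test's disruption base class and are not shown in this snippet. A minimal sketch of what they could look like, restricted to the NetworkDisruption API that appears in these examples (the actual helpers in crate randomize over several link-disruption types and may differ):

private TwoPartitions isolateNode(String isolatedNode) {
    // put the isolated node on one side and every other cluster node on the other
    Set<String> side1 = Collections.singleton(isolatedNode);
    Set<String> side2 = new HashSet<>(Arrays.asList(internalCluster().getNodeNames()));
    side2.remove(isolatedNode);
    return new TwoPartitions(side1, side2);
}

private NetworkDisruption addRandomDisruptionType(TwoPartitions partitions) {
    // only NetworkDelay is shown here; the real helper is assumed to pick a random
    // NetworkLinkDisruptionType for the partition
    NetworkDisruption disruption =
        new NetworkDisruption(partitions, NetworkDisruption.NetworkDelay.random(random()));
    internalCluster().setDisruptionScheme(disruption);
    return disruption;
}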
Use of org.elasticsearch.test.disruption.NetworkDisruption in project crate by crate.
From class SnapshotShardsServiceIT, method testRetryPostingSnapshotStatusMessages:
public void testRetryPostingSnapshotStatusMessages() throws Exception {
    String masterNode = internalCluster().startMasterOnlyNode();
    String dataNode = internalCluster().startDataOnlyNode();

    logger.info("--> creating repository");
    assertAcked(client().admin().cluster().preparePutRepository("repo")
        .setType("mock")
        .setSettings(Settings.builder()
            .put("location", randomRepoPath())
            .put("compress", randomBoolean())
            .put("chunk_size", randomIntBetween(100, 1000), ByteSizeUnit.BYTES)));

    final int shards = between(1, 10);
    execute("create table doc.test(x integer) clustered into ? shards with (number_of_replicas=0)",
            new Object[] { shards });
    ensureGreen();

    final int numDocs = scaledRandomIntBetween(50, 100);
    for (int i = 0; i < numDocs; i++) {
        execute("insert into doc.test values(?)", new Object[] { i });
    }

    logger.info("--> blocking repository");
    String blockedNode = blockNodeWithIndex("repo", "test");
    dataNodeClient().admin().cluster().prepareCreateSnapshot("repo", "snapshot")
        .setWaitForCompletion(false)
        .setIndices("test")
        .get();
    waitForBlock(blockedNode, "repo", TimeValue.timeValueSeconds(60));
    final SnapshotId snapshotId = client().admin().cluster().prepareGetSnapshots("repo")
        .setSnapshots("snapshot")
        .get()
        .getSnapshots()
        .get(0)
        .snapshotId();

    logger.info("--> start disrupting cluster");
    final NetworkDisruption networkDisruption = new NetworkDisruption(
        new NetworkDisruption.TwoPartitions(masterNode, dataNode),
        NetworkDisruption.NetworkDelay.random(random()));
    internalCluster().setDisruptionScheme(networkDisruption);
    networkDisruption.startDisrupting();

    logger.info("--> unblocking repository");
    unblockNode("repo", blockedNode);

    // Retrieve snapshot status from the data node.
    SnapshotShardsService snapshotShardsService =
        internalCluster().getInstance(SnapshotShardsService.class, blockedNode);
    assertBusy(() -> {
        final Snapshot snapshot = new Snapshot("repo", snapshotId);
        List<IndexShardSnapshotStatus.Stage> stages = snapshotShardsService.currentSnapshotShards(snapshot)
            .values()
            .stream()
            .map(status -> status.asCopy().getStage())
            .collect(Collectors.toList());
        assertThat(stages, hasSize(shards));
        assertThat(stages, everyItem(equalTo(IndexShardSnapshotStatus.Stage.DONE)));
    }, 30L, TimeUnit.SECONDS);

    logger.info("--> stop disrupting cluster");
    networkDisruption.stopDisrupting();
    internalCluster().clearDisruptionScheme(true);

    assertBusy(() -> {
        execute("select state, array_length(failures,0) from sys.snapshots where name='snapshot'");
        assertThat(response.rowCount(), is(1L));
        assertThat(response.rows()[0][0], is("SUCCESS"));
        assertThat(response.rows()[0][1], is(nullValue()));
    }, 30L, TimeUnit.SECONDS);
}
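Both tests follow the same NetworkDisruption lifecycle. Condensed, the pattern looks roughly like this (the node names are placeholders, and only API calls that appear in the examples above are used):

// 1. describe the partition: which nodes are cut off from which
NetworkDisruption disruption = new NetworkDisruption(
    new NetworkDisruption.TwoPartitions(masterNode, dataNode),
    NetworkDisruption.NetworkDelay.random(random()));

// 2. register the scheme with the internal test cluster and activate it
internalCluster().setDisruptionScheme(disruption);
disruption.startDisrupting();

// ... exercise and assert against the partitioned cluster ...

// 3. heal the partition; passing true additionally checks that the cluster
//    is healthy again once the scheme has been removed
disruption.stopDisrupting();
internalCluster().clearDisruptionScheme(true);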