Use of org.elasticsearch.action.admin.cluster.health.ClusterHealthResponse in project elasticsearch by elastic.
From class CorruptedFileIT, method testCorruptFileAndRecover.
/**
 * Tests that we can actually recover from a corruption on the primary given that we have replica shards around.
 */
public void testCorruptFileAndRecover() throws ExecutionException, InterruptedException, IOException {
    int numDocs = scaledRandomIntBetween(100, 1000);
    // have enough space for 3 copies
    internalCluster().ensureAtLeastNumDataNodes(3);
    if (cluster().numDataNodes() == 3) {
        logger.info("--> cluster has [3] data nodes, corrupted primary will be overwritten");
    }
    assertThat(cluster().numDataNodes(), greaterThanOrEqualTo(3));
    assertAcked(prepareCreate("test").setSettings(Settings.builder()
            .put(IndexMetaData.SETTING_NUMBER_OF_SHARDS, "1")
            .put(IndexMetaData.SETTING_NUMBER_OF_REPLICAS, "1")
            .put(MergePolicyConfig.INDEX_MERGE_ENABLED, false)
            // no checkindex - we corrupt shards on purpose
            .put(MockFSIndexStore.INDEX_CHECK_INDEX_ON_CLOSE_SETTING.getKey(), false)
            // no translog-based flush - it might change the .liv / segments.N files
            .put(IndexSettings.INDEX_TRANSLOG_FLUSH_THRESHOLD_SIZE_SETTING.getKey(), new ByteSizeValue(1, ByteSizeUnit.PB))));
    ensureGreen();
    disableAllocation("test");
    IndexRequestBuilder[] builders = new IndexRequestBuilder[numDocs];
    for (int i = 0; i < builders.length; i++) {
        builders[i] = client().prepareIndex("test", "type").setSource("field", "value");
    }
    indexRandom(true, builders);
    ensureGreen();
    // we have to flush at least once here since we don't corrupt the translog
    assertAllSuccessful(client().admin().indices().prepareFlush().setForce(true).execute().actionGet());
    SearchResponse countResponse = client().prepareSearch().setSize(0).get();
    assertHitCount(countResponse, numDocs);
    final int numShards = numShards("test");
    ShardRouting corruptedShardRouting = corruptRandomPrimaryFile();
    logger.info("--> {} corrupted", corruptedShardRouting);
    enableAllocation("test");
    /*
     * we corrupted the primary shard - now let's make sure we never recover from it successfully
     */
    Settings build = Settings.builder().put(IndexMetaData.SETTING_NUMBER_OF_REPLICAS, "2").build();
    client().admin().indices().prepareUpdateSettings("test").setSettings(build).get();
    ClusterHealthResponse health = client().admin().cluster()
        .health(Requests.clusterHealthRequest("test")
            .waitForGreenStatus()
            // sometimes, due to cluster rebalancing and random settings, the default timeout is just not enough
            .timeout("5m")
            .waitForNoRelocatingShards(true))
        .actionGet();
    if (health.isTimedOut()) {
        logger.info("cluster state:\n{}\n{}",
            client().admin().cluster().prepareState().get().getState(),
            client().admin().cluster().preparePendingClusterTasks().get());
        assertThat("timed out waiting for green state", health.isTimedOut(), equalTo(false));
    }
    assertThat(health.getStatus(), equalTo(ClusterHealthStatus.GREEN));
    final int numIterations = scaledRandomIntBetween(5, 20);
    for (int i = 0; i < numIterations; i++) {
        SearchResponse response = client().prepareSearch().setSize(numDocs).get();
        assertHitCount(response, numDocs);
    }
    /*
     * now hook into the IndicesService and register a close listener to
     * run CheckIndex. If the corruption is still there we will catch it.
     */
    final CountDownLatch latch = new CountDownLatch(numShards * 3); // primary + 2 replicas
    final CopyOnWriteArrayList<Exception> exception = new CopyOnWriteArrayList<>();
    final IndexEventListener listener = new IndexEventListener() {
        @Override
        public void afterIndexShardClosed(ShardId sid, @Nullable IndexShard indexShard, Settings indexSettings) {
            if (indexShard != null) {
                Store store = indexShard.store();
                store.incRef();
                try {
                    if (!Lucene.indexExists(store.directory()) && indexShard.state() == IndexShardState.STARTED) {
                        return;
                    }
                    try (CheckIndex checkIndex = new CheckIndex(store.directory())) {
                        BytesStreamOutput os = new BytesStreamOutput();
                        PrintStream out = new PrintStream(os, false, StandardCharsets.UTF_8.name());
                        checkIndex.setInfoStream(out);
                        out.flush();
                        CheckIndex.Status status = checkIndex.checkIndex();
                        if (!status.clean) {
                            logger.warn("check index [failure]\n{}", os.bytes().utf8ToString());
                            throw new IOException("index check failure");
                        }
                    }
                } catch (Exception e) {
                    exception.add(e);
                } finally {
                    store.decRef();
                    latch.countDown();
                }
            }
        }
    };
    for (MockIndexEventListener.TestEventListener eventListener : internalCluster().getDataNodeInstances(MockIndexEventListener.TestEventListener.class)) {
        eventListener.setNewDelegate(listener);
    }
    try {
        client().admin().indices().prepareDelete("test").get();
        latch.await();
        assertThat(exception, empty());
    } finally {
        for (MockIndexEventListener.TestEventListener eventListener : internalCluster().getDataNodeInstances(MockIndexEventListener.TestEventListener.class)) {
            eventListener.setNewDelegate(null);
        }
    }
}
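The wait-for-green step above is the part of this test most worth reusing. Below is a minimal sketch of that ClusterHealthResponse pattern as a standalone helper, assuming the same ESIntegTestCase context (client(), logger, Hamcrest matchers) these tests run in; the name ensureGreenWithTimeout is hypothetical.

private void ensureGreenWithTimeout(String index, String timeout) {
    // request health for the index, waiting for green with an explicit timeout
    ClusterHealthResponse health = client().admin().cluster()
        .health(Requests.clusterHealthRequest(index)
            .waitForGreenStatus()
            .timeout(timeout)
            .waitForNoRelocatingShards(true))
        .actionGet();
    if (health.isTimedOut()) {
        // dump the cluster state before failing so the timeout can be diagnosed
        logger.info("cluster state:\n{}", client().admin().cluster().prepareState().get().getState());
    }
    assertThat("timed out waiting for green state", health.isTimedOut(), equalTo(false));
    assertThat(health.getStatus(), equalTo(ClusterHealthStatus.GREEN));
}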
Use of org.elasticsearch.action.admin.cluster.health.ClusterHealthResponse in project elasticsearch by elastic.
From class CorruptedFileIT, method testCorruptPrimaryNoReplica.
/**
 * Tests corruption that happens on a single shard when no replicas are present. We make sure that the primary stays
 * unassigned and that allocation for all other (healthy) shards still happens.
 */
public void testCorruptPrimaryNoReplica() throws ExecutionException, InterruptedException, IOException {
    int numDocs = scaledRandomIntBetween(100, 1000);
    internalCluster().ensureAtLeastNumDataNodes(2);
    assertAcked(prepareCreate("test").setSettings(Settings.builder()
            .put(IndexMetaData.SETTING_NUMBER_OF_REPLICAS, "0")
            .put(MergePolicyConfig.INDEX_MERGE_ENABLED, false)
            // no checkindex - we corrupt shards on purpose
            .put(MockFSIndexStore.INDEX_CHECK_INDEX_ON_CLOSE_SETTING.getKey(), false)
            // no translog-based flush - it might change the .liv / segments.N files
            .put(IndexSettings.INDEX_TRANSLOG_FLUSH_THRESHOLD_SIZE_SETTING.getKey(), new ByteSizeValue(1, ByteSizeUnit.PB))));
    ensureGreen();
    IndexRequestBuilder[] builders = new IndexRequestBuilder[numDocs];
    for (int i = 0; i < builders.length; i++) {
        builders[i] = client().prepareIndex("test", "type").setSource("field", "value");
    }
    indexRandom(true, builders);
    ensureGreen();
    // we have to flush at least once here since we don't corrupt the translog
    assertAllSuccessful(client().admin().indices().prepareFlush().setForce(true).execute().actionGet());
    SearchResponse countResponse = client().prepareSearch().setSize(0).get();
    assertHitCount(countResponse, numDocs);
    ShardRouting shardRouting = corruptRandomPrimaryFile();
    /*
     * we corrupted the primary shard - now let's make sure we never recover from it successfully
     */
    Settings build = Settings.builder().put(IndexMetaData.SETTING_NUMBER_OF_REPLICAS, "1").build();
    client().admin().indices().prepareUpdateSettings("test").setSettings(build).get();
    client().admin().cluster().prepareReroute().get();
    // sometimes on slow nodes the replication / recovery is just dead slow
    boolean didClusterTurnRed = awaitBusy(() -> {
        ClusterHealthStatus test = client().admin().cluster().health(Requests.clusterHealthRequest("test")).actionGet().getStatus();
        return test == ClusterHealthStatus.RED;
    }, 5, TimeUnit.MINUTES);
    final ClusterHealthResponse response = client().admin().cluster().health(Requests.clusterHealthRequest("test")).get();
    if (response.getStatus() != ClusterHealthStatus.RED) {
        logger.info("Cluster turned red in busy loop: {}", didClusterTurnRed);
        logger.info("cluster state:\n{}\n{}",
            client().admin().cluster().prepareState().get().getState(),
            client().admin().cluster().preparePendingClusterTasks().get());
    }
    assertThat(response.getStatus(), is(ClusterHealthStatus.RED));
    ClusterState state = client().admin().cluster().prepareState().get().getState();
    GroupShardsIterator shardIterators = state.getRoutingTable().activePrimaryShardsGrouped(new String[] { "test" }, false);
    for (ShardIterator iterator : shardIterators) {
        ShardRouting routing;
        while ((routing = iterator.nextOrNull()) != null) {
            if (routing.getId() == shardRouting.getId()) {
                assertThat(routing.state(), equalTo(ShardRoutingState.UNASSIGNED));
            } else {
                assertThat(routing.state(), anyOf(equalTo(ShardRoutingState.RELOCATING), equalTo(ShardRoutingState.STARTED)));
            }
        }
    }
    final List<Path> files = listShardFiles(shardRouting);
    Path corruptedFile = null;
    for (Path file : files) {
        if (file.getFileName().toString().startsWith("corrupted_")) {
            corruptedFile = file;
            break;
        }
    }
    assertThat(corruptedFile, notNullValue());
}
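The busy-wait for a RED cluster is the piece of this test that is easiest to get wrong: checking getStatus() once is racy, so the status is polled. A minimal sketch of that polling step, using the same awaitBusy helper from the ES test framework seen above; the method name waitForRedIndex is hypothetical.

private boolean waitForRedIndex(String index) throws InterruptedException {
    // poll cluster health until the index reports RED, or give up after 5 minutes
    return awaitBusy(() -> {
        ClusterHealthStatus status = client().admin().cluster()
            .health(Requests.clusterHealthRequest(index)).actionGet().getStatus();
        return status == ClusterHealthStatus.RED;
    }, 5, TimeUnit.MINUTES);
}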
Use of org.elasticsearch.action.admin.cluster.health.ClusterHealthResponse in project elasticsearch by elastic.
From class IndicesStoreIntegrationIT, method testIndexCleanup.
public void testIndexCleanup() throws Exception {
    final String masterNode = internalCluster().startNode(Settings.builder().put(Node.NODE_DATA_SETTING.getKey(), false));
    final String node_1 = internalCluster().startNode(Settings.builder().put(Node.NODE_MASTER_SETTING.getKey(), false));
    final String node_2 = internalCluster().startNode(Settings.builder().put(Node.NODE_MASTER_SETTING.getKey(), false));
    logger.info("--> creating index [test] with one shard and one replica");
    assertAcked(prepareCreate("test").setSettings(Settings.builder()
            .put(indexSettings())
            .put(IndexMetaData.SETTING_NUMBER_OF_SHARDS, 1)
            .put(IndexMetaData.SETTING_NUMBER_OF_REPLICAS, 1)));
    ensureGreen("test");
    ClusterState state = client().admin().cluster().prepareState().get().getState();
    Index index = state.metaData().index("test").getIndex();
    logger.info("--> making sure that shard and its replica are allocated on node_1 and node_2");
    assertThat(Files.exists(shardDirectory(node_1, index, 0)), equalTo(true));
    assertThat(Files.exists(indexDirectory(node_1, index)), equalTo(true));
    assertThat(Files.exists(shardDirectory(node_2, index, 0)), equalTo(true));
    assertThat(Files.exists(indexDirectory(node_2, index)), equalTo(true));
    logger.info("--> starting node server3");
    final String node_3 = internalCluster().startNode(Settings.builder().put(Node.NODE_MASTER_SETTING.getKey(), false));
    logger.info("--> running cluster_health");
    ClusterHealthResponse clusterHealth = client().admin().cluster().prepareHealth()
        .setWaitForNodes("4")
        .setWaitForNoRelocatingShards(true)
        .get();
    assertThat(clusterHealth.isTimedOut(), equalTo(false));
    assertThat(Files.exists(shardDirectory(node_1, index, 0)), equalTo(true));
    assertThat(Files.exists(indexDirectory(node_1, index)), equalTo(true));
    assertThat(Files.exists(shardDirectory(node_2, index, 0)), equalTo(true));
    assertThat(Files.exists(indexDirectory(node_2, index)), equalTo(true));
    assertThat(Files.exists(shardDirectory(node_3, index, 0)), equalTo(false));
    assertThat(Files.exists(indexDirectory(node_3, index)), equalTo(false));
    logger.info("--> move shard from node_1 to node_3, and wait for relocation to finish");
    if (randomBoolean()) {
        // sometimes add cluster-state delay to trigger observers in IndicesStore.ShardActiveRequestHandler
        BlockClusterStateProcessing disruption = relocateAndBlockCompletion(logger, "test", 0, node_1, node_3);
        // wait a little so that the cluster state observer is registered
        sleep(50);
        logger.info("--> stopping disruption");
        disruption.stopDisrupting();
    } else {
        internalCluster().client().admin().cluster().prepareReroute().add(new MoveAllocationCommand("test", 0, node_1, node_3)).get();
    }
    clusterHealth = client().admin().cluster().prepareHealth().setWaitForNoRelocatingShards(true).get();
    assertThat(clusterHealth.isTimedOut(), equalTo(false));
    assertThat(waitForShardDeletion(node_1, index, 0), equalTo(false));
    assertThat(waitForIndexDeletion(node_1, index), equalTo(false));
    assertThat(Files.exists(shardDirectory(node_2, index, 0)), equalTo(true));
    assertThat(Files.exists(indexDirectory(node_2, index)), equalTo(true));
    assertThat(Files.exists(shardDirectory(node_3, index, 0)), equalTo(true));
    assertThat(Files.exists(indexDirectory(node_3, index)), equalTo(true));
}
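Every file-system assertion in this test is gated on a cluster-health call that waits for node count and relocation to settle. Isolated, that pattern looks like the following minimal sketch (same ESIntegTestCase context as above; assertStableCluster is a hypothetical name):

private void assertStableCluster(String expectedNodeCount) {
    // wait until the expected number of nodes has joined and no shard is relocating
    ClusterHealthResponse clusterHealth = client().admin().cluster().prepareHealth()
        .setWaitForNodes(expectedNodeCount)
        .setWaitForNoRelocatingShards(true)
        .get();
    assertThat(clusterHealth.isTimedOut(), equalTo(false));
}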
Use of org.elasticsearch.action.admin.cluster.health.ClusterHealthResponse in project elasticsearch by elastic.
From class SimpleNodesInfoIT, method testAllocatedProcessors.
public void testAllocatedProcessors() throws Exception {
    List<String> nodesIds = internalCluster().startNodes(
        Settings.builder().put(EsExecutors.PROCESSORS_SETTING.getKey(), 3).build(),
        Settings.builder().put(EsExecutors.PROCESSORS_SETTING.getKey(), 6).build());
    final String node_1 = nodesIds.get(0);
    final String node_2 = nodesIds.get(1);
    ClusterHealthResponse clusterHealth = client().admin().cluster().prepareHealth().setWaitForGreenStatus().setWaitForNodes("2").get();
    logger.info("--> done cluster_health, status {}", clusterHealth.getStatus());
    String server1NodeId = internalCluster().getInstance(ClusterService.class, node_1).state().nodes().getLocalNodeId();
    String server2NodeId = internalCluster().getInstance(ClusterService.class, node_2).state().nodes().getLocalNodeId();
    logger.info("--> started nodes: {} and {}", server1NodeId, server2NodeId);
    NodesInfoResponse response = client().admin().cluster().prepareNodesInfo().execute().actionGet();
    assertThat(response.getNodes().size(), is(2));
    assertThat(response.getNodesMap().get(server1NodeId), notNullValue());
    assertThat(response.getNodesMap().get(server2NodeId), notNullValue());
    assertThat(response.getNodesMap().get(server1NodeId).getOs().getAvailableProcessors(), equalTo(Runtime.getRuntime().availableProcessors()));
    assertThat(response.getNodesMap().get(server2NodeId).getOs().getAvailableProcessors(), equalTo(Runtime.getRuntime().availableProcessors()));
    assertThat(response.getNodesMap().get(server1NodeId).getOs().getAllocatedProcessors(), equalTo(3));
    assertThat(response.getNodesMap().get(server2NodeId).getOs().getAllocatedProcessors(), equalTo(6));
}
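The assertions above distinguish two OS-level values: getAvailableProcessors() matches what the JVM reports via Runtime, while getAllocatedProcessors() reflects the processors setting each node was started with (3 and 6 here). A minimal sketch of that lookup, assuming the same test context; logProcessors is a hypothetical name:

private void logProcessors(NodesInfoResponse response, String nodeId) {
    // the node must be present in the response before its OS info can be read
    assertThat(response.getNodesMap().get(nodeId), notNullValue());
    logger.info("--> node [{}]: available [{}], allocated [{}]", nodeId,
        response.getNodesMap().get(nodeId).getOs().getAvailableProcessors(),
        response.getNodesMap().get(nodeId).getOs().getAllocatedProcessors());
}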
Use of org.elasticsearch.action.admin.cluster.health.ClusterHealthResponse in project elasticsearch by elastic.
From class SimpleNodesInfoIT, method testNodesInfos.
public void testNodesInfos() throws Exception {
    List<String> nodesIds = internalCluster().startNodes(2);
    final String node_1 = nodesIds.get(0);
    final String node_2 = nodesIds.get(1);
    ClusterHealthResponse clusterHealth = client().admin().cluster().prepareHealth().setWaitForGreenStatus().setWaitForNodes("2").get();
    logger.info("--> done cluster_health, status {}", clusterHealth.getStatus());
    String server1NodeId = internalCluster().getInstance(ClusterService.class, node_1).state().nodes().getLocalNodeId();
    String server2NodeId = internalCluster().getInstance(ClusterService.class, node_2).state().nodes().getLocalNodeId();
    logger.info("--> started nodes: {} and {}", server1NodeId, server2NodeId);
    NodesInfoResponse response = client().admin().cluster().prepareNodesInfo().execute().actionGet();
    assertThat(response.getNodes().size(), is(2));
    assertThat(response.getNodesMap().get(server1NodeId), notNullValue());
    assertThat(response.getNodesMap().get(server2NodeId), notNullValue());
    response = client().admin().cluster().nodesInfo(nodesInfoRequest()).actionGet();
    assertThat(response.getNodes().size(), is(2));
    assertThat(response.getNodesMap().get(server1NodeId), notNullValue());
    assertThat(response.getNodesMap().get(server2NodeId), notNullValue());
    response = client().admin().cluster().nodesInfo(nodesInfoRequest(server1NodeId)).actionGet();
    assertThat(response.getNodes().size(), is(1));
    assertThat(response.getNodesMap().get(server1NodeId), notNullValue());
    response = client().admin().cluster().nodesInfo(nodesInfoRequest(server1NodeId)).actionGet();
    assertThat(response.getNodes().size(), is(1));
    assertThat(response.getNodesMap().get(server1NodeId), notNullValue());
    response = client().admin().cluster().nodesInfo(nodesInfoRequest(server2NodeId)).actionGet();
    assertThat(response.getNodes().size(), is(1));
    assertThat(response.getNodesMap().get(server2NodeId), notNullValue());
    response = client().admin().cluster().nodesInfo(nodesInfoRequest(server2NodeId)).actionGet();
    assertThat(response.getNodes().size(), is(1));
    assertThat(response.getNodesMap().get(server2NodeId), notNullValue());
}
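The same three-line assertion block recurs for every node-targeted request above; a minimal sketch that folds it into one helper (same test context; assertSingleNodeInfo is a hypothetical name):

private void assertSingleNodeInfo(String nodeId) {
    // a request targeted at one node id should return exactly that node
    NodesInfoResponse response = client().admin().cluster()
        .nodesInfo(nodesInfoRequest(nodeId)).actionGet();
    assertThat(response.getNodes().size(), is(1));
    assertThat(response.getNodesMap().get(nodeId), notNullValue());
}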