Use of org.opensearch.cluster.routing.allocation.FailedShard in the OpenSearch project by opensearch-project.
From the class TestGatewayAllocator, method applyFailedShards.
/**
 * Drops the bookkeeping entry for every failed shard copy.
 *
 * <p>The current node set is refreshed from {@code allocation} first. Then, for each failed
 * shard, its shard id is removed from the per-node map in {@code knownAllocations}; when that
 * leaves the node with no remaining entries, the node's map is removed as well.
 *
 * @param failedShards the shard copies that failed
 * @param allocation   the allocation whose nodes become the new {@code currentNodes}
 */
@Override
public void applyFailedShards(List<FailedShard> failedShards, RoutingAllocation allocation) {
    currentNodes = allocation.nodes();
    for (FailedShard failure : failedShards) {
        final ShardRouting routing = failure.getRoutingEntry();
        final String nodeId = routing.currentNodeId();
        final Map<ShardId, ShardRouting> shardsOnNode = knownAllocations.get(nodeId);
        if (shardsOnNode == null) {
            // nothing recorded for this node, so there is nothing to forget
            continue;
        }
        shardsOnNode.remove(routing.shardId());
        if (shardsOnNode.isEmpty()) {
            knownAllocations.remove(nodeId);
        }
    }
}
Use of org.opensearch.cluster.routing.allocation.FailedShard in the OpenSearch project by opensearch-project.
From the class IndicesClusterStateServiceRandomUpdatesTests, method randomlyUpdateClusterState.
/**
 * Applies a random sequence of mutations to {@code state} and returns the resulting cluster state.
 *
 * <p>The steps, in order: randomly remove / add the no-master global block (returning early if the
 * block is in place afterwards), create indices, delete indices, close indices, open indices,
 * update index settings, reroute, start or fail shards reported by the per-node mock services,
 * and finally add or remove a node. Every decision is driven by the test's random helpers, so the
 * order of the statements below determines which random values each step consumes.
 *
 * @param state                  the cluster state to mutate (the parameter is reassigned step by step)
 * @param clusterStateServiceMap per-node {@code IndicesClusterStateService}; used to reach each
 *                               node's {@code MockIndicesService} and passed on to {@code updateNodes}
 * @param indicesServiceSupplier passed through to {@code updateNodes} after the node set changes
 * @return the cluster state after all randomly chosen changes have been applied
 */
public ClusterState randomlyUpdateClusterState(ClusterState state, Map<DiscoveryNode, IndicesClusterStateService> clusterStateServiceMap, Supplier<MockIndicesService> indicesServiceSupplier) {
// randomly remove no_master blocks
if (randomBoolean() && state.blocks().hasGlobalBlockWithId(NoMasterBlockService.NO_MASTER_BLOCK_ID)) {
state = ClusterState.builder(state).blocks(ClusterBlocks.builder().blocks(state.blocks()).removeGlobalBlock(NoMasterBlockService.NO_MASTER_BLOCK_ID)).build();
}
// randomly add no_master blocks
if (rarely() && state.blocks().hasGlobalBlockWithId(NoMasterBlockService.NO_MASTER_BLOCK_ID) == false) {
// pick one of the two no-master block variants (ALL or WRITES) at random
ClusterBlock block = randomBoolean() ? NoMasterBlockService.NO_MASTER_BLOCK_ALL : NoMasterBlockService.NO_MASTER_BLOCK_WRITES;
state = ClusterState.builder(state).blocks(ClusterBlocks.builder().blocks(state.blocks()).addGlobalBlock(block)).build();
}
// if no_master block is in place, make no other cluster state changes
if (state.blocks().hasGlobalBlockWithId(NoMasterBlockService.NO_MASTER_BLOCK_ID)) {
return state;
}
// randomly create new indices (until we have 200 max)
// NOTE: randomInt(5) sits in the loop condition, so a fresh bound is drawn on every iteration check.
for (int i = 0; i < randomInt(5); i++) {
// NOTE: the check is "> 200", so creation only stops once the index count exceeds 200.
if (state.metadata().indices().size() > 200) {
break;
}
String name = "index_" + randomAlphaOfLength(15).toLowerCase(Locale.ROOT);
Settings.Builder settingsBuilder = Settings.builder().put(SETTING_NUMBER_OF_SHARDS, randomIntBetween(1, 3));
if (randomBoolean()) {
// auto-expand replicas, written either as "min-max" or as "min-all"
int min = randomInt(2);
int max = min + randomInt(3);
settingsBuilder.put(SETTING_AUTO_EXPAND_REPLICAS, randomBoolean() ? min + "-" + max : min + "-all");
} else {
// otherwise use a fixed, randomly chosen replica count
settingsBuilder.put(SETTING_NUMBER_OF_REPLICAS, randomInt(2));
}
// the request is configured with ActiveShardCount.NONE for waitForActiveShards
CreateIndexRequest request = new CreateIndexRequest(name, settingsBuilder.build()).waitForActiveShards(ActiveShardCount.NONE);
state = cluster.createIndex(state, request);
assertTrue(state.metadata().hasIndex(name));
}
// randomly delete indices
Set<String> indicesToDelete = new HashSet<>();
// the requested subset size is capped at 2 and at the number of existing indices
int numberOfIndicesToDelete = randomInt(Math.min(2, state.metadata().indices().size()));
for (String index : randomSubsetOf(numberOfIndicesToDelete, state.metadata().indices().keys().toArray(String.class))) {
indicesToDelete.add(state.metadata().index(index).getIndex().getName());
}
if (indicesToDelete.isEmpty() == false) {
DeleteIndexRequest deleteRequest = new DeleteIndexRequest(indicesToDelete.toArray(new String[indicesToDelete.size()]));
state = cluster.deleteIndices(state, deleteRequest);
// every deleted index must be gone from the metadata
for (String index : indicesToDelete) {
assertFalse(state.metadata().hasIndex(index));
}
}
// randomly close indices
int numberOfIndicesToClose = randomInt(Math.min(1, state.metadata().indices().size()));
for (String index : randomSubsetOf(numberOfIndicesToClose, state.metadata().indices().keys().toArray(String.class))) {
CloseIndexRequest closeIndexRequest = new CloseIndexRequest(state.metadata().index(index).getIndex().getName());
state = cluster.closeIndices(state, closeIndexRequest);
}
// randomly open indices
int numberOfIndicesToOpen = randomInt(Math.min(1, state.metadata().indices().size()));
for (String index : randomSubsetOf(numberOfIndicesToOpen, state.metadata().indices().keys().toArray(String.class))) {
OpenIndexRequest openIndexRequest = new OpenIndexRequest(state.metadata().index(index).getIndex().getName());
state = cluster.openIndices(state, openIndexRequest);
}
// randomly update settings
Set<String> indicesToUpdate = new HashSet<>();
// tracks whether any index selected for the update is currently in the CLOSE state
boolean containsClosedIndex = false;
int numberOfIndicesToUpdate = randomInt(Math.min(2, state.metadata().indices().size()));
for (String index : randomSubsetOf(numberOfIndicesToUpdate, state.metadata().indices().keys().toArray(String.class))) {
indicesToUpdate.add(state.metadata().index(index).getIndex().getName());
if (state.metadata().index(index).getState() == IndexMetadata.State.CLOSE) {
containsClosedIndex = true;
}
}
if (indicesToUpdate.isEmpty() == false) {
UpdateSettingsRequest updateSettingsRequest = new UpdateSettingsRequest(indicesToUpdate.toArray(new String[indicesToUpdate.size()]));
Settings.Builder settings = Settings.builder();
// the replica count is only changed when none of the selected indices is closed
if (containsClosedIndex == false) {
settings.put(SETTING_NUMBER_OF_REPLICAS, randomInt(2));
}
settings.put("index.refresh_interval", randomIntBetween(1, 5) + "s");
updateSettingsRequest.settings(settings.build());
state = cluster.updateSettings(state, updateSettingsRequest);
}
// randomly reroute
if (rarely()) {
state = cluster.reroute(state, new ClusterRerouteRequest());
}
// randomly start and fail allocated shards
// startedShards maps each routing entry to the value of indexShard.term() at the time it was picked
final Map<ShardRouting, Long> startedShards = new HashMap<>();
List<FailedShard> failedShards = new ArrayList<>();
for (DiscoveryNode node : state.nodes()) {
IndicesClusterStateService indicesClusterStateService = clusterStateServiceMap.get(node);
MockIndicesService indicesService = (MockIndicesService) indicesClusterStateService.indicesService;
for (MockIndexService indexService : indicesService) {
for (MockIndexShard indexShard : indexService) {
ShardRouting persistedShardRouting = indexShard.routingEntry();
// an initializing shard is started on a coin flip; any shard that is not started this way
// (non-initializing, or initializing but lost the coin flip) is failed when rarely() fires
if (persistedShardRouting.initializing() && randomBoolean()) {
startedShards.put(persistedShardRouting, indexShard.term());
} else if (rarely()) {
failedShards.add(new FailedShard(persistedShardRouting, "fake shard failure", new Exception(), randomBoolean()));
}
}
}
}
// failures are applied before starts
state = cluster.applyFailedShards(state, failedShards);
state = cluster.applyStartedShards(state, startedShards);
// randomly add and remove nodes (except current master)
if (rarely()) {
if (randomBoolean()) {
// add node
if (state.nodes().getSize() < 10) {
state = cluster.addNodes(state, Collections.singletonList(createNode()));
updateNodes(state, clusterStateServiceMap, indicesServiceSupplier);
}
} else {
// remove node
if (state.nodes().getDataNodes().size() > 3) {
DiscoveryNode discoveryNode = randomFrom(state.nodes().getNodes().values().toArray(DiscoveryNode.class));
// the node is only removed if it is not the node returned by getMasterNode()
if (discoveryNode.equals(state.nodes().getMasterNode()) == false) {
state = cluster.removeNodes(state, Collections.singletonList(discoveryNode));
updateNodes(state, clusterStateServiceMap, indicesServiceSupplier);
}
// NOTE(review): this branch is outside the "not master" check above, so it also runs when the
// chosen node was the master and was therefore never removed - confirm re-adding it is intended.
if (randomBoolean()) {
// and add it back
state = cluster.addNodes(state, Collections.singletonList(discoveryNode));
updateNodes(state, clusterStateServiceMap, indicesServiceSupplier);
}
}
}
}
return state;
}
Use of org.opensearch.cluster.routing.allocation.FailedShard in the OpenSearch project by opensearch-project.
From the class ClusterRerouteTests, method testClusterStateUpdateTask.
/**
 * Exercises {@code ClusterRerouteResponseAckedClusterStateUpdateTask} against an allocation
 * service guarded by a {@code MaxRetryAllocationDecider}.
 *
 * <p>The test checks, in order:
 * <ol>
 *   <li>a dry-run execution returns the very same cluster state instance, while the response
 *       handed to the listener carries a different state;</li>
 *   <li>with dry-run disabled, the shard can be failed {@code retries} times; it is
 *       re-initialized after every failure except the last one, which leaves it unassigned;</li>
 *   <li>a plain reroute without {@code retry_failed} leaves the shard unassigned;</li>
 *   <li>a reroute with {@code retry_failed} brings the shard back to initializing and resets
 *       the failed-allocation counter.</li>
 * </ol>
 */
public void testClusterStateUpdateTask() {
    AllocationService allocationService = new AllocationService(
        new AllocationDeciders(Collections.singleton(new MaxRetryAllocationDecider())),
        new TestGatewayAllocator(),
        new BalancedShardsAllocator(Settings.EMPTY),
        EmptyClusterInfoService.INSTANCE,
        EmptySnapshotsInfoService.INSTANCE
    );
    ClusterState clusterState = createInitialClusterState(allocationService);
    ClusterRerouteRequest req = new ClusterRerouteRequest();
    req.dryRun(true);
    AtomicReference<ClusterRerouteResponse> responseRef = new AtomicReference<>();
    ActionListener<ClusterRerouteResponse> responseActionListener = new ActionListener<ClusterRerouteResponse>() {
        @Override
        public void onResponse(ClusterRerouteResponse clusterRerouteResponse) {
            responseRef.set(clusterRerouteResponse);
        }

        @Override
        public void onFailure(Exception e) {
            // A failure is never expected here; surface it instead of silently dropping it.
            throw new AssertionError("unexpected failure of the reroute listener", e);
        }
    };
    TransportClusterRerouteAction.ClusterRerouteResponseAckedClusterStateUpdateTask task =
        new TransportClusterRerouteAction.ClusterRerouteResponseAckedClusterStateUpdateTask(
            logger,
            allocationService,
            req,
            responseActionListener
        );
    ClusterState execute = task.execute(clusterState);
    // dry-run: the task must hand back the very same state instance
    assertSame(clusterState, execute);
    task.onAllNodesAcked(null);
    assertNotSame(execute, responseRef.get().getState());

    // now we allocate
    req.dryRun(false);
    final int retries = MaxRetryAllocationDecider.SETTING_ALLOCATION_MAX_RETRY.get(Settings.EMPTY);
    // fail the shard `retries` times; only the last failure leaves it unassigned
    for (int i = 0; i < retries; i++) {
        ClusterState newState = task.execute(clusterState);
        // dry-run=false
        assertNotSame(clusterState, newState);
        clusterState = newState;
        RoutingTable routingTable = clusterState.routingTable();
        assertEquals(1, routingTable.index("idx").shards().size());
        assertEquals(INITIALIZING, routingTable.index("idx").shard(0).shards().get(0).state());
        assertEquals(i, routingTable.index("idx").shard(0).shards().get(0).unassignedInfo().getNumFailedAllocations());
        List<FailedShard> failedShards = Collections.singletonList(
            new FailedShard(
                routingTable.index("idx").shard(0).shards().get(0),
                "boom" + i,
                new UnsupportedOperationException(),
                randomBoolean()
            )
        );
        newState = allocationService.applyFailedShards(clusterState, failedShards);
        assertThat(newState, not(equalTo(clusterState)));
        clusterState = newState;
        routingTable = clusterState.routingTable();
        assertEquals(1, routingTable.index("idx").shards().size());
        if (i == retries - 1) {
            assertEquals(UNASSIGNED, routingTable.index("idx").shard(0).shards().get(0).state());
        } else {
            assertEquals(INITIALIZING, routingTable.index("idx").shard(0).shards().get(0).state());
        }
        assertEquals(i + 1, routingTable.index("idx").shard(0).shards().get(0).unassignedInfo().getNumFailedAllocations());
    }

    // without retry_failed we won't allocate that shard
    ClusterState newState = task.execute(clusterState);
    // dry-run=false
    assertNotSame(clusterState, newState);
    task.onAllNodesAcked(null);
    assertSame(newState, responseRef.get().getState());
    // NOTE(review): the checks below read the routing table of the state from *before* this
    // execute call (clusterState), not of newState - confirm that this is intended.
    RoutingTable routingTable = clusterState.routingTable();
    assertEquals(1, routingTable.index("idx").shards().size());
    assertEquals(UNASSIGNED, routingTable.index("idx").shard(0).shards().get(0).state());
    assertEquals(retries, routingTable.index("idx").shard(0).shards().get(0).unassignedInfo().getNumFailedAllocations());

    // now we manually retry and get the shard back into initializing
    req.setRetryFailed(true);
    newState = task.execute(clusterState);
    // dry-run=false
    assertNotSame(clusterState, newState);
    clusterState = newState;
    routingTable = clusterState.routingTable();
    assertEquals(1, routingTable.index("idx").shards().size());
    assertEquals(INITIALIZING, routingTable.index("idx").shard(0).shards().get(0).state());
    assertEquals(0, routingTable.index("idx").shard(0).shards().get(0).unassignedInfo().getNumFailedAllocations());
}
Use of org.opensearch.cluster.routing.allocation.FailedShard in the OpenSearch project by opensearch-project.
From the class PrimaryTermsTests, method failSomePrimaries.
/**
 * Fails the primary of a randomly chosen set of shards of {@code index} and applies the
 * resulting reroute.
 *
 * <p>Shard ids are drawn at random into a set (so repeated draws collapse). For each chosen
 * shard the primary is queued as a {@code FailedShard} and the expected primary term for that
 * shard is bumped via {@code incrementPrimaryTerm}. All failures are then applied in one call.
 *
 * @param index name of the index whose primaries should be failed
 */
private void failSomePrimaries(String index) {
    final IndexRoutingTable routingForIndex = clusterState.routingTable().index(index);

    // number of random draws; drawn once, before any shard id is picked
    final int draws = 1 + randomInt(numberOfShards - 1);
    final Set<Integer> primariesToFail = new HashSet<>();
    for (int draw = 0; draw < draws; draw++) {
        primariesToFail.add(randomInt(numberOfShards - 1));
    }
    logger.info("failing primary shards {} for index [{}]", primariesToFail, index);

    final List<FailedShard> failures = new ArrayList<>();
    for (int shardNumber : primariesToFail) {
        failures.add(
            new FailedShard(routingForIndex.shard(shardNumber).primaryShard(), "test", null, randomBoolean())
        );
        // the primary failure should increment the primary term;
        incrementPrimaryTerm(index, shardNumber);
    }

    applyRerouteResult(
        allocationService.applyFailedShards(this.clusterState, failures, Collections.emptyList())
    );
}
Use of org.opensearch.cluster.routing.allocation.FailedShard in the OpenSearch project by opensearch-project.
From the class UnassignedInfoTests, method testFailedShard.
/**
 * Checks the unassigned info recorded for a shard after it has been failed.
 *
 * <p>A one-shard, one-replica index is fully started on two nodes, then one started shard is
 * failed with the message {@code "test fail"}. Exactly one shard must end up unassigned, with
 * reason {@code ALLOCATION_FAILED}, with both message and details equal to
 * {@code "failed shard on node [<nodeId>]: test fail"}, and with a positive unassigned time.
 */
public void testFailedShard() {
    AllocationService allocation = createAllocationService();
    Metadata metadata = Metadata.builder()
        .put(IndexMetadata.builder("test").settings(settings(Version.CURRENT)).numberOfShards(1).numberOfReplicas(1))
        .build();
    ClusterState clusterState = ClusterState.builder(ClusterName.CLUSTER_NAME_SETTING.getDefault(Settings.EMPTY))
        .metadata(metadata)
        .routingTable(RoutingTable.builder().addAsNew(metadata.index("test")).build())
        .build();
    clusterState = ClusterState.builder(clusterState)
        .nodes(DiscoveryNodes.builder().add(newNode("node1")).add(newNode("node2")))
        .build();
    clusterState = allocation.reroute(clusterState, "reroute");

    // first round starts the primaries, second round starts the replicas
    clusterState = startInitializingShardsAndReroute(allocation, clusterState);
    clusterState = startInitializingShardsAndReroute(allocation, clusterState);
    assertThat(clusterState.getRoutingNodes().unassigned().size() > 0, equalTo(false));

    // fail one of the started shards
    ShardRouting shardToFail = clusterState.getRoutingNodes().shardsWithState(STARTED).get(0);
    FailedShard failure = new FailedShard(shardToFail, "test fail", null, randomBoolean());
    clusterState = allocation.applyFailedShards(clusterState, Collections.singletonList(failure));

    // exactly one shard is unassigned now
    assertThat(clusterState.getRoutingNodes().unassigned().size() > 0, equalTo(true));
    assertThat(clusterState.getRoutingNodes().shardsWithState(UNASSIGNED).size(), equalTo(1));

    // verify the reason and details recorded on that shard
    ShardRouting unassignedShard = clusterState.getRoutingNodes().shardsWithState(UNASSIGNED).get(0);
    UnassignedInfo info = unassignedShard.unassignedInfo();
    String expectedText = "failed shard on node [" + shardToFail.currentNodeId() + "]: test fail";
    assertThat(info, notNullValue());
    assertThat(info.getReason(), equalTo(UnassignedInfo.Reason.ALLOCATION_FAILED));
    assertThat(info.getMessage(), equalTo(expectedText));
    assertThat(info.getDetails(), equalTo(expectedText));
    assertThat(info.getUnassignedTimeInMillis(), greaterThan(0L));
}
Aggregations