Search in sources :

Example 1 with FailedShard

use of org.opensearch.cluster.routing.allocation.FailedShard in project OpenSearch by opensearch-project.

the class TestGatewayAllocator method applyFailedShards.

/**
 * Records the current nodes from {@code allocation} and drops every failed shard
 * from the per-node bookkeeping in {@code knownAllocations}. A node's entry is
 * removed entirely once its last known shard has been dropped.
 *
 * @param failedShards the shards that failed
 * @param allocation   the allocation whose nodes become {@code currentNodes}
 */
@Override
public void applyFailedShards(List<FailedShard> failedShards, RoutingAllocation allocation) {
    currentNodes = allocation.nodes();
    for (FailedShard failure : failedShards) {
        final ShardRouting routing = failure.getRoutingEntry();
        final String nodeId = routing.currentNodeId();
        final Map<ShardId, ShardRouting> shardsOnNode = knownAllocations.get(nodeId);
        if (shardsOnNode == null) {
            // nothing tracked for this node
            continue;
        }
        shardsOnNode.remove(routing.shardId());
        if (shardsOnNode.isEmpty()) {
            knownAllocations.remove(nodeId);
        }
    }
}
Also used : ShardId(org.opensearch.index.shard.ShardId) FailedShard(org.opensearch.cluster.routing.allocation.FailedShard) ShardRouting(org.opensearch.cluster.routing.ShardRouting)

Example 2 with FailedShard

use of org.opensearch.cluster.routing.allocation.FailedShard in project OpenSearch by opensearch-project.

the class IndicesClusterStateServiceRandomUpdatesTests method randomlyUpdateClusterState.

/**
 * Applies a randomized sequence of mutations to the given cluster state and returns the result.
 * In order, the steps are: toggling the no-master block, creating, deleting, closing and opening
 * indices, updating index settings, rerouting, starting/failing shards, and adding/removing nodes.
 * If a no-master block is present after the first two steps, the state is returned without
 * applying any of the later steps.
 *
 * @param state                   the cluster state to start from
 * @param clusterStateServiceMap  per-node services; looked up for every node in the state to reach
 *                                that node's shards, and passed on to {@code updateNodes}
 * @param indicesServiceSupplier  passed on to {@code updateNodes} whenever nodes are added or removed
 * @return the cluster state after all randomly selected changes were applied
 */
public ClusterState randomlyUpdateClusterState(ClusterState state, Map<DiscoveryNode, IndicesClusterStateService> clusterStateServiceMap, Supplier<MockIndicesService> indicesServiceSupplier) {
    // randomly remove no_master blocks
    if (randomBoolean() && state.blocks().hasGlobalBlockWithId(NoMasterBlockService.NO_MASTER_BLOCK_ID)) {
        state = ClusterState.builder(state).blocks(ClusterBlocks.builder().blocks(state.blocks()).removeGlobalBlock(NoMasterBlockService.NO_MASTER_BLOCK_ID)).build();
    }
    // randomly add no_master blocks
    if (rarely() && state.blocks().hasGlobalBlockWithId(NoMasterBlockService.NO_MASTER_BLOCK_ID) == false) {
        ClusterBlock block = randomBoolean() ? NoMasterBlockService.NO_MASTER_BLOCK_ALL : NoMasterBlockService.NO_MASTER_BLOCK_WRITES;
        state = ClusterState.builder(state).blocks(ClusterBlocks.builder().blocks(state.blocks()).addGlobalBlock(block)).build();
    }
    // if no_master block is in place, make no other cluster state changes
    if (state.blocks().hasGlobalBlockWithId(NoMasterBlockService.NO_MASTER_BLOCK_ID)) {
        return state;
    }
    // randomly create new indices (until we have 200 max)
    // NOTE: randomInt(5) sits in the loop condition, so a fresh bound is drawn on every iteration test
    for (int i = 0; i < randomInt(5); i++) {
        // NOTE(review): the check is "> 200", so creation stops only once more than 200 indices exist
        if (state.metadata().indices().size() > 200) {
            break;
        }
        String name = "index_" + randomAlphaOfLength(15).toLowerCase(Locale.ROOT);
        Settings.Builder settingsBuilder = Settings.builder().put(SETTING_NUMBER_OF_SHARDS, randomIntBetween(1, 3));
        // either configure auto-expanding replicas ("min-max" or "min-all") or a fixed replica count
        if (randomBoolean()) {
            int min = randomInt(2);
            int max = min + randomInt(3);
            settingsBuilder.put(SETTING_AUTO_EXPAND_REPLICAS, randomBoolean() ? min + "-" + max : min + "-all");
        } else {
            settingsBuilder.put(SETTING_NUMBER_OF_REPLICAS, randomInt(2));
        }
        CreateIndexRequest request = new CreateIndexRequest(name, settingsBuilder.build()).waitForActiveShards(ActiveShardCount.NONE);
        state = cluster.createIndex(state, request);
        assertTrue(state.metadata().hasIndex(name));
    }
    // randomly delete indices
    Set<String> indicesToDelete = new HashSet<>();
    int numberOfIndicesToDelete = randomInt(Math.min(2, state.metadata().indices().size()));
    for (String index : randomSubsetOf(numberOfIndicesToDelete, state.metadata().indices().keys().toArray(String.class))) {
        indicesToDelete.add(state.metadata().index(index).getIndex().getName());
    }
    if (indicesToDelete.isEmpty() == false) {
        DeleteIndexRequest deleteRequest = new DeleteIndexRequest(indicesToDelete.toArray(new String[indicesToDelete.size()]));
        state = cluster.deleteIndices(state, deleteRequest);
        for (String index : indicesToDelete) {
            assertFalse(state.metadata().hasIndex(index));
        }
    }
    // randomly close indices
    int numberOfIndicesToClose = randomInt(Math.min(1, state.metadata().indices().size()));
    for (String index : randomSubsetOf(numberOfIndicesToClose, state.metadata().indices().keys().toArray(String.class))) {
        CloseIndexRequest closeIndexRequest = new CloseIndexRequest(state.metadata().index(index).getIndex().getName());
        state = cluster.closeIndices(state, closeIndexRequest);
    }
    // randomly open indices
    int numberOfIndicesToOpen = randomInt(Math.min(1, state.metadata().indices().size()));
    for (String index : randomSubsetOf(numberOfIndicesToOpen, state.metadata().indices().keys().toArray(String.class))) {
        OpenIndexRequest openIndexRequest = new OpenIndexRequest(state.metadata().index(index).getIndex().getName());
        state = cluster.openIndices(state, openIndexRequest);
    }
    // randomly update settings
    Set<String> indicesToUpdate = new HashSet<>();
    // tracks whether any selected index is closed; the replica count is only touched when none is
    boolean containsClosedIndex = false;
    int numberOfIndicesToUpdate = randomInt(Math.min(2, state.metadata().indices().size()));
    for (String index : randomSubsetOf(numberOfIndicesToUpdate, state.metadata().indices().keys().toArray(String.class))) {
        indicesToUpdate.add(state.metadata().index(index).getIndex().getName());
        if (state.metadata().index(index).getState() == IndexMetadata.State.CLOSE) {
            containsClosedIndex = true;
        }
    }
    if (indicesToUpdate.isEmpty() == false) {
        UpdateSettingsRequest updateSettingsRequest = new UpdateSettingsRequest(indicesToUpdate.toArray(new String[indicesToUpdate.size()]));
        Settings.Builder settings = Settings.builder();
        if (containsClosedIndex == false) {
            settings.put(SETTING_NUMBER_OF_REPLICAS, randomInt(2));
        }
        // the refresh interval is always updated, regardless of open/closed state
        settings.put("index.refresh_interval", randomIntBetween(1, 5) + "s");
        updateSettingsRequest.settings(settings.build());
        state = cluster.updateSettings(state, updateSettingsRequest);
    }
    // randomly reroute
    if (rarely()) {
        state = cluster.reroute(state, new ClusterRerouteRequest());
    }
    // randomly start and fail allocated shards
    final Map<ShardRouting, Long> startedShards = new HashMap<>();
    List<FailedShard> failedShards = new ArrayList<>();
    for (DiscoveryNode node : state.nodes()) {
        IndicesClusterStateService indicesClusterStateService = clusterStateServiceMap.get(node);
        MockIndicesService indicesService = (MockIndicesService) indicesClusterStateService.indicesService;
        for (MockIndexService indexService : indicesService) {
            for (MockIndexShard indexShard : indexService) {
                ShardRouting persistedShardRouting = indexShard.routingEntry();
                // an initializing shard is started on a coin flip; any shard not picked for starting
                // (including initializing ones that lost the flip) may rarely be failed instead
                if (persistedShardRouting.initializing() && randomBoolean()) {
                    startedShards.put(persistedShardRouting, indexShard.term());
                } else if (rarely()) {
                    failedShards.add(new FailedShard(persistedShardRouting, "fake shard failure", new Exception(), randomBoolean()));
                }
            }
        }
    }
    // failures are applied before starts
    state = cluster.applyFailedShards(state, failedShards);
    state = cluster.applyStartedShards(state, startedShards);
    // randomly add and remove nodes (except current master)
    if (rarely()) {
        if (randomBoolean()) {
            // add node
            if (state.nodes().getSize() < 10) {
                state = cluster.addNodes(state, Collections.singletonList(createNode()));
                updateNodes(state, clusterStateServiceMap, indicesServiceSupplier);
            }
        } else {
            // remove node
            // only attempted with more than 3 data nodes; the candidate is drawn from all nodes, not just data nodes
            if (state.nodes().getDataNodes().size() > 3) {
                DiscoveryNode discoveryNode = randomFrom(state.nodes().getNodes().values().toArray(DiscoveryNode.class));
                if (discoveryNode.equals(state.nodes().getMasterNode()) == false) {
                    state = cluster.removeNodes(state, Collections.singletonList(discoveryNode));
                    updateNodes(state, clusterStateServiceMap, indicesServiceSupplier);
                }
                // NOTE(review): this branch is not nested in the master check above, so the node may be
                // "added back" even when it was the master and was never removed - confirm this is intended
                if (randomBoolean()) {
                    // and add it back
                    state = cluster.addNodes(state, Collections.singletonList(discoveryNode));
                    updateNodes(state, clusterStateServiceMap, indicesServiceSupplier);
                }
            }
        }
    }
    return state;
}
Also used : DiscoveryNode(org.opensearch.cluster.node.DiscoveryNode) UpdateSettingsRequest(org.opensearch.action.admin.indices.settings.put.UpdateSettingsRequest) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) DeleteIndexRequest(org.opensearch.action.admin.indices.delete.DeleteIndexRequest) ClusterBlock(org.opensearch.cluster.block.ClusterBlock) ClusterRerouteRequest(org.opensearch.action.admin.cluster.reroute.ClusterRerouteRequest) Settings(org.opensearch.common.settings.Settings) HashSet(java.util.HashSet) OpenIndexRequest(org.opensearch.action.admin.indices.open.OpenIndexRequest) FailedShard(org.opensearch.cluster.routing.allocation.FailedShard) CloseIndexRequest(org.opensearch.action.admin.indices.close.CloseIndexRequest) CreateIndexRequest(org.opensearch.action.admin.indices.create.CreateIndexRequest) ShardRouting(org.opensearch.cluster.routing.ShardRouting)

Example 3 with FailedShard

use of org.opensearch.cluster.routing.allocation.FailedShard in project OpenSearch by opensearch-project.

the class ClusterRerouteTests method testClusterStateUpdateTask.

/**
 * Drives a {@code ClusterRerouteResponseAckedClusterStateUpdateTask} through three phases:
 * a dry run (state must be returned unchanged), repeated shard failures up to the configured
 * maximum number of allocation retries (shard ends up unassigned), and finally a reroute with
 * {@code retryFailed} set, which must bring the shard back to initializing with a reset
 * failure counter.
 */
public void testClusterStateUpdateTask() {
    AllocationService allocationService = new AllocationService(new AllocationDeciders(Collections.singleton(new MaxRetryAllocationDecider())), new TestGatewayAllocator(), new BalancedShardsAllocator(Settings.EMPTY), EmptyClusterInfoService.INSTANCE, EmptySnapshotsInfoService.INSTANCE);
    ClusterState clusterState = createInitialClusterState(allocationService);
    ClusterRerouteRequest req = new ClusterRerouteRequest();
    req.dryRun(true);
    AtomicReference<ClusterRerouteResponse> responseRef = new AtomicReference<>();
    ActionListener<ClusterRerouteResponse> responseActionListener = new ActionListener<ClusterRerouteResponse>() {

        @Override
        public void onResponse(ClusterRerouteResponse clusterRerouteResponse) {
            responseRef.set(clusterRerouteResponse);
        }

        @Override
        public void onFailure(Exception e) {
            // deliberately empty: this test only inspects the response captured in responseRef
        }
    };
    TransportClusterRerouteAction.ClusterRerouteResponseAckedClusterStateUpdateTask task = new TransportClusterRerouteAction.ClusterRerouteResponseAckedClusterStateUpdateTask(logger, allocationService, req, responseActionListener);
    ClusterState execute = task.execute(clusterState);
    // dry-run
    assertSame(execute, clusterState);
    task.onAllNodesAcked(null);
    assertNotSame(responseRef.get().getState(), execute);
    // now we allocate
    req.dryRun(false);
    final int retries = MaxRetryAllocationDecider.SETTING_ALLOCATION_MAX_RETRY.get(Settings.EMPTY);
    // now fail it N times (N = retries); the shard stays INITIALIZING until the last failure
    for (int i = 0; i < retries; i++) {
        ClusterState newState = task.execute(clusterState);
        // dry-run=false
        assertNotSame(newState, clusterState);
        clusterState = newState;
        RoutingTable routingTable = clusterState.routingTable();
        // assertEquals takes (expected, actual)
        assertEquals(1, routingTable.index("idx").shards().size());
        assertEquals(INITIALIZING, routingTable.index("idx").shard(0).shards().get(0).state());
        assertEquals(i, routingTable.index("idx").shard(0).shards().get(0).unassignedInfo().getNumFailedAllocations());
        List<FailedShard> failedShards = Collections.singletonList(new FailedShard(routingTable.index("idx").shard(0).shards().get(0), "boom" + i, new UnsupportedOperationException(), randomBoolean()));
        newState = allocationService.applyFailedShards(clusterState, failedShards);
        assertThat(newState, not(equalTo(clusterState)));
        clusterState = newState;
        routingTable = clusterState.routingTable();
        assertEquals(1, routingTable.index("idx").shards().size());
        if (i == retries - 1) {
            // the final failure exhausts the retries, so the shard is left unassigned
            assertEquals(UNASSIGNED, routingTable.index("idx").shard(0).shards().get(0).state());
        } else {
            assertEquals(INITIALIZING, routingTable.index("idx").shard(0).shards().get(0).state());
        }
        assertEquals(i + 1, routingTable.index("idx").shard(0).shards().get(0).unassignedInfo().getNumFailedAllocations());
    }
    // without retry_failed we won't allocate that shard
    ClusterState newState = task.execute(clusterState);
    // dry-run=false
    assertNotSame(newState, clusterState);
    task.onAllNodesAcked(null);
    assertSame(responseRef.get().getState(), newState);
    // the routing table is read from clusterState here, not from newState
    RoutingTable routingTable = clusterState.routingTable();
    assertEquals(1, routingTable.index("idx").shards().size());
    assertEquals(UNASSIGNED, routingTable.index("idx").shard(0).shards().get(0).state());
    assertEquals(retries, routingTable.index("idx").shard(0).shards().get(0).unassignedInfo().getNumFailedAllocations());
    // now we manually retry and get the shard back into initializing
    req.setRetryFailed(true);
    newState = task.execute(clusterState);
    // dry-run=false
    assertNotSame(newState, clusterState);
    clusterState = newState;
    routingTable = clusterState.routingTable();
    assertEquals(1, routingTable.index("idx").shards().size());
    assertEquals(INITIALIZING, routingTable.index("idx").shard(0).shards().get(0).state());
    assertEquals(0, routingTable.index("idx").shard(0).shards().get(0).unassignedInfo().getNumFailedAllocations());
}
Also used : TestGatewayAllocator(org.opensearch.test.gateway.TestGatewayAllocator) ClusterState(org.opensearch.cluster.ClusterState) BalancedShardsAllocator(org.opensearch.cluster.routing.allocation.allocator.BalancedShardsAllocator) FailedShard(org.opensearch.cluster.routing.allocation.FailedShard) AtomicReference(java.util.concurrent.atomic.AtomicReference) AllocationDeciders(org.opensearch.cluster.routing.allocation.decider.AllocationDeciders) MaxRetryAllocationDecider(org.opensearch.cluster.routing.allocation.decider.MaxRetryAllocationDecider) IOException(java.io.IOException) ActionListener(org.opensearch.action.ActionListener) RoutingTable(org.opensearch.cluster.routing.RoutingTable) AllocationService(org.opensearch.cluster.routing.allocation.AllocationService)

Example 4 with FailedShard

use of org.opensearch.cluster.routing.allocation.FailedShard in project OpenSearch by opensearch-project.

the class PrimaryTermsTests method failSomePrimaries.

/**
 * Fails the primary of a random, non-empty set of shards of the given index, bumps the
 * expected primary term for each of them, and applies the resulting reroute to the test's
 * cluster state.
 *
 * @param index name of the index whose primaries should be failed
 */
private void failSomePrimaries(String index) {
    final IndexRoutingTable indexRouting = clusterState.routingTable().index(index);
    // draw between 1 and numberOfShards shard ids; duplicates collapse in the set
    final int draws = 1 + randomInt(numberOfShards - 1);
    final Set<Integer> chosenShardIds = new HashSet<>();
    for (int n = 0; n < draws; n++) {
        chosenShardIds.add(randomInt(numberOfShards - 1));
    }
    logger.info("failing primary shards {} for index [{}]", chosenShardIds, index);
    final List<FailedShard> failures = new ArrayList<>(chosenShardIds.size());
    for (int shardId : chosenShardIds) {
        failures.add(new FailedShard(indexRouting.shard(shardId).primaryShard(), "test", null, randomBoolean()));
        // a failed primary is expected to bump that shard's primary term
        incrementPrimaryTerm(index, shardId);
    }
    applyRerouteResult(allocationService.applyFailedShards(this.clusterState, failures, Collections.emptyList()));
}
Also used : FailedShard(org.opensearch.cluster.routing.allocation.FailedShard) ArrayList(java.util.ArrayList) HashSet(java.util.HashSet)

Example 5 with FailedShard

use of org.opensearch.cluster.routing.allocation.FailedShard in project OpenSearch by opensearch-project.

the class UnassignedInfoTests method testFailedShard.

/**
 * Fails one started shard of a fully allocated one-primary/one-replica index and checks the
 * resulting unassigned info: reason, message, details and unassigned timestamp.
 */
public void testFailedShard() {
    final AllocationService allocation = createAllocationService();
    final Metadata metadata = Metadata.builder()
        .put(IndexMetadata.builder("test").settings(settings(Version.CURRENT)).numberOfShards(1).numberOfReplicas(1))
        .build();
    ClusterState clusterState = ClusterState.builder(ClusterName.CLUSTER_NAME_SETTING.getDefault(Settings.EMPTY))
        .metadata(metadata)
        .routingTable(RoutingTable.builder().addAsNew(metadata.index("test")).build())
        .build();
    clusterState = ClusterState.builder(clusterState)
        .nodes(DiscoveryNodes.builder().add(newNode("node1")).add(newNode("node2")))
        .build();
    clusterState = allocation.reroute(clusterState, "reroute");
    // first round starts the primaries, second round starts the replicas
    clusterState = startInitializingShardsAndReroute(allocation, clusterState);
    clusterState = startInitializingShardsAndReroute(allocation, clusterState);
    assertThat(clusterState.getRoutingNodes().unassigned().size() > 0, equalTo(false));

    // fail one of the started shards
    final ShardRouting shardToFail = clusterState.getRoutingNodes().shardsWithState(STARTED).get(0);
    final FailedShard failure = new FailedShard(shardToFail, "test fail", null, randomBoolean());
    clusterState = allocation.applyFailedShards(clusterState, Collections.singletonList(failure));

    // exactly one shard must now be unassigned
    assertThat(clusterState.getRoutingNodes().unassigned().size() > 0, equalTo(true));
    assertThat(clusterState.getRoutingNodes().shardsWithState(UNASSIGNED).size(), equalTo(1));

    // verify the reason and details recorded on that shard
    final UnassignedInfo info = clusterState.getRoutingNodes().shardsWithState(UNASSIGNED).get(0).unassignedInfo();
    final String expectedText = "failed shard on node [" + shardToFail.currentNodeId() + "]: test fail";
    assertThat(info, notNullValue());
    assertThat(info.getReason(), equalTo(UnassignedInfo.Reason.ALLOCATION_FAILED));
    assertThat(info.getMessage(), equalTo(expectedText));
    assertThat(info.getDetails(), equalTo(expectedText));
    assertThat(info.getUnassignedTimeInMillis(), greaterThan(0L));
}
Also used : ClusterState(org.opensearch.cluster.ClusterState) Metadata(org.opensearch.cluster.metadata.Metadata) IndexMetadata(org.opensearch.cluster.metadata.IndexMetadata) FailedShard(org.opensearch.cluster.routing.allocation.FailedShard) AllocationService(org.opensearch.cluster.routing.allocation.AllocationService)

Aggregations

FailedShard (org.opensearch.cluster.routing.allocation.FailedShard)6 ArrayList (java.util.ArrayList)2 HashSet (java.util.HashSet)2 ClusterState (org.opensearch.cluster.ClusterState)2 ShardRouting (org.opensearch.cluster.routing.ShardRouting)2 AllocationService (org.opensearch.cluster.routing.allocation.AllocationService)2 IOException (java.io.IOException)1 HashMap (java.util.HashMap)1 AtomicReference (java.util.concurrent.atomic.AtomicReference)1 ActionListener (org.opensearch.action.ActionListener)1 ClusterRerouteRequest (org.opensearch.action.admin.cluster.reroute.ClusterRerouteRequest)1 CloseIndexRequest (org.opensearch.action.admin.indices.close.CloseIndexRequest)1 CreateIndexRequest (org.opensearch.action.admin.indices.create.CreateIndexRequest)1 DeleteIndexRequest (org.opensearch.action.admin.indices.delete.DeleteIndexRequest)1 OpenIndexRequest (org.opensearch.action.admin.indices.open.OpenIndexRequest)1 UpdateSettingsRequest (org.opensearch.action.admin.indices.settings.put.UpdateSettingsRequest)1 ClusterBlock (org.opensearch.cluster.block.ClusterBlock)1 IndexMetadata (org.opensearch.cluster.metadata.IndexMetadata)1 Metadata (org.opensearch.cluster.metadata.Metadata)1 DiscoveryNode (org.opensearch.cluster.node.DiscoveryNode)1