Search in sources :

Example 1 with RecoverySource

use of org.opensearch.cluster.routing.RecoverySource in project OpenSearch by opensearch-project.

The method updateInSyncAllocations of the class IndexMetadataUpdater.

/**
 * Updates in-sync allocations with routing changes that were made to the routing table.
 * <p>
 * Two paths exist:
 * <ul>
 *   <li>a primary was initialized with an allocation id that is not part of a non-empty in-sync set: the in-sync set is
 *       reset (to the empty set for an empty-store primary, otherwise to a singleton set);</li>
 *   <li>otherwise the added/removed allocation ids are applied to the previous in-sync set, the set is trimmed if it
 *       grew beyond {@code number_of_replicas + 1}, and the result is written only if it is non-empty.</li>
 * </ul>
 *
 * @param newRoutingTable      routing table used to look up the current routing entries of {@code shardId}
 * @param oldIndexMetadata     index metadata the previous in-sync allocation ids and replica count are read from
 * @param indexMetadataBuilder builder to record the change in; may be {@code null}, in which case a builder is created
 *                             from {@code oldIndexMetadata} the first time something has to be written
 * @param shardId              shard whose in-sync allocation ids are updated
 * @param updates              allocation ids added/removed in this round, plus the initialized primary and the first
 *                             failed primary (either may be {@code null})
 * @return the builder that was passed in or created here; {@code null} if none was passed in and nothing was written
 */
private IndexMetadata.Builder updateInSyncAllocations(
    RoutingTable newRoutingTable,
    IndexMetadata oldIndexMetadata,
    IndexMetadata.Builder indexMetadataBuilder,
    ShardId shardId,
    Updates updates
) {
    assert Sets.haveEmptyIntersection(updates.addedAllocationIds, updates.removedAllocationIds) : "allocation ids cannot be both added and removed in the same allocation round, added ids: " + updates.addedAllocationIds + ", removed ids: " + updates.removedAllocationIds;
    Set<String> oldInSyncAllocationIds = oldIndexMetadata.inSyncAllocationIds(shardId.id());
    // check if we have been force-initializing an empty primary or a stale primary
    if (updates.initializedPrimary != null
        && oldInSyncAllocationIds.isEmpty() == false
        && oldInSyncAllocationIds.contains(updates.initializedPrimary.allocationId().getId()) == false) {
        // we're not reusing an existing in-sync allocation id to initialize a primary, which means that we're either force-allocating
        // an empty or a stale primary (see AllocateEmptyPrimaryAllocationCommand or AllocateStalePrimaryAllocationCommand).
        RecoverySource recoverySource = updates.initializedPrimary.recoverySource();
        RecoverySource.Type recoverySourceType = recoverySource.getType();
        boolean emptyPrimary = recoverySourceType == RecoverySource.Type.EMPTY_STORE;
        assert updates.addedAllocationIds.isEmpty() : (emptyPrimary ? "empty" : "stale") + " primary is not force-initialized in same allocation round where shards are started";
        if (indexMetadataBuilder == null) {
            indexMetadataBuilder = IndexMetadata.builder(oldIndexMetadata);
        }
        if (emptyPrimary) {
            // forcing an empty primary resets the in-sync allocations to the empty set (ShardRouting.allocatedPostIndexCreate)
            indexMetadataBuilder.putInSyncAllocationIds(shardId.id(), Collections.emptySet());
        } else {
            final String allocationId;
            // identity comparison is intentional: FORCE_STALE_PRIMARY_INSTANCE is a shared constant instance
            if (recoverySource == RecoverySource.ExistingStoreRecoverySource.FORCE_STALE_PRIMARY_INSTANCE) {
                // a placeholder id is recorded here; the standard path below asserts that it does not survive an update
                allocationId = RecoverySource.ExistingStoreRecoverySource.FORCED_ALLOCATION_ID;
            } else {
                assert recoverySource instanceof RecoverySource.SnapshotRecoverySource : recoverySource;
                allocationId = updates.initializedPrimary.allocationId().getId();
            }
            // forcing a stale primary resets the in-sync allocations to the singleton set with the stale id
            indexMetadataBuilder.putInSyncAllocationIds(shardId.id(), Collections.singleton(allocationId));
        }
    } else {
        // standard path for updating in-sync ids
        Set<String> inSyncAllocationIds = new HashSet<>(oldInSyncAllocationIds);
        inSyncAllocationIds.addAll(updates.addedAllocationIds);
        inSyncAllocationIds.removeAll(updates.removedAllocationIds);
        assert oldInSyncAllocationIds.contains(RecoverySource.ExistingStoreRecoverySource.FORCED_ALLOCATION_ID) == false || inSyncAllocationIds.contains(RecoverySource.ExistingStoreRecoverySource.FORCED_ALLOCATION_ID) == false : "fake allocation id has to be removed, inSyncAllocationIds:" + inSyncAllocationIds;
        // Prevent set of inSyncAllocationIds to grow unboundedly. This can happen for example if we don't write to a primary
        // but repeatedly shut down nodes that have active replicas.
        // We use number_of_replicas + 1 (= possible active shard copies) to bound the inSyncAllocationIds set
        // Only trim the set of allocation ids when it grows, otherwise we might trim too eagerly when the number
        // of replicas was decreased while shards were unassigned.
        // +1 for the primary
        int maxActiveShards = oldIndexMetadata.getNumberOfReplicas() + 1;
        IndexShardRoutingTable newShardRoutingTable = newRoutingTable.shardRoutingTable(shardId);
        assert newShardRoutingTable.assignedShards().stream().filter(ShardRouting::isRelocationTarget).map(s -> s.allocationId().getId()).noneMatch(inSyncAllocationIds::contains) : newShardRoutingTable.assignedShards() + " vs " + inSyncAllocationIds;
        if (inSyncAllocationIds.size() > oldInSyncAllocationIds.size() && inSyncAllocationIds.size() > maxActiveShards) {
            // trim entries that have no corresponding shard routing in the cluster state (i.e. trim unavailable copies)
            List<ShardRouting> assignedShards = newShardRoutingTable.assignedShards().stream().filter(s -> s.isRelocationTarget() == false).collect(Collectors.toList());
            assert assignedShards.size() <= maxActiveShards : "cannot have more assigned shards " + assignedShards + " than maximum possible active shards " + maxActiveShards;
            Set<String> assignedAllocations = assignedShards.stream().map(s -> s.allocationId().getId()).collect(Collectors.toSet());
            // Collect into an explicit HashSet: the set may be added to below, and Collectors.toSet() gives no
            // guarantee that the set it returns is mutable.
            inSyncAllocationIds = inSyncAllocationIds.stream()
                // values with routing entries first (Boolean orders false before true, hence reversed())
                .sorted(Comparator.comparing(assignedAllocations::contains).reversed())
                .limit(maxActiveShards)
                .collect(Collectors.toCollection(HashSet::new));
        }
        // If no active shard copy remains and a primary failed in this round, keep the failed primary's allocation id.
        // If we were to remove the allocation id of the primary from the
        // in-sync set, this could create an empty primary on the next allocation.
        if (newShardRoutingTable.activeShards().isEmpty() && updates.firstFailedPrimary != null) {
            // add back allocation id of failed primary
            inSyncAllocationIds.add(updates.firstFailedPrimary.allocationId().getId());
        }
        assert inSyncAllocationIds.isEmpty() == false || oldInSyncAllocationIds.isEmpty() : "in-sync allocations cannot become empty after they have been non-empty: " + oldInSyncAllocationIds;
        // be extra safe here and only update in-sync set if it is non-empty
        if (inSyncAllocationIds.isEmpty() == false) {
            if (indexMetadataBuilder == null) {
                indexMetadataBuilder = IndexMetadata.builder(oldIndexMetadata);
            }
            indexMetadataBuilder.putInSyncAllocationIds(shardId.id(), inSyncAllocationIds);
        }
    }
    return indexMetadataBuilder;
}
Also used : IndexShardRoutingTable(org.opensearch.cluster.routing.IndexShardRoutingTable) RoutingChangesObserver(org.opensearch.cluster.routing.RoutingChangesObserver) Metadata(org.opensearch.cluster.metadata.Metadata) IndexMetadata(org.opensearch.cluster.metadata.IndexMetadata) Index(org.opensearch.index.Index) Set(java.util.Set) HashMap(java.util.HashMap) Collectors(java.util.stream.Collectors) RecoverySource(org.opensearch.cluster.routing.RecoverySource) ShardRouting(org.opensearch.cluster.routing.ShardRouting) ShardId(org.opensearch.index.shard.ShardId) HashSet(java.util.HashSet) Objects(java.util.Objects) ClusterState(org.opensearch.cluster.ClusterState) Sets(org.opensearch.common.util.set.Sets) List(java.util.List) Logger(org.apache.logging.log4j.Logger) Map(java.util.Map) RoutingTable(org.opensearch.cluster.routing.RoutingTable) Comparator(java.util.Comparator) UnassignedInfo(org.opensearch.cluster.routing.UnassignedInfo) Collections(java.util.Collections) IndexShardRoutingTable(org.opensearch.cluster.routing.IndexShardRoutingTable) RecoverySource(org.opensearch.cluster.routing.RecoverySource) ShardRouting(org.opensearch.cluster.routing.ShardRouting) HashSet(java.util.HashSet)

Example 2 with RecoverySource

use of org.opensearch.cluster.routing.RecoverySource in project OpenSearch by opensearch-project.

The method canAllocate of the class RestoreInProgressAllocationDecider.

/**
 * Decides whether a shard may be allocated with respect to an ongoing snapshot restore.
 * <p>
 * Returns YES when the shard is not recovering from a snapshot, when the restore was not started through the API,
 * or when the matching {@link RestoreInProgress} entry reports a shard state that is not yet completed. In every
 * other case the answer is NO with an explanation built from the shard's unassigned info.
 *
 * @param shardRouting the shard to decide on
 * @param allocation   the current allocation, used to look up the {@link RestoreInProgress} custom and build decisions
 * @return the allocation decision
 */
@Override
public Decision canAllocate(final ShardRouting shardRouting, final RoutingAllocation allocation) {
    final RecoverySource recoverySource = shardRouting.recoverySource();
    final boolean fromSnapshot = recoverySource != null && recoverySource.getType() == RecoverySource.Type.SNAPSHOT;
    if (fromSnapshot == false) {
        return allocation.decision(Decision.YES, NAME, "ignored as shard is not being recovered from a snapshot");
    }

    final RecoverySource.SnapshotRecoverySource source = (RecoverySource.SnapshotRecoverySource) recoverySource;
    if (source.restoreUUID().equals(RecoverySource.SnapshotRecoverySource.NO_API_RESTORE_UUID)) {
        return allocation.decision(Decision.YES, NAME, "not an API-level restore");
    }

    // Walk restores -> entry for this restore UUID -> status of this shard; any missing link leaves the status null.
    final RestoreInProgress restoresInProgress = allocation.custom(RestoreInProgress.TYPE);
    final RestoreInProgress.Entry entry = restoresInProgress == null ? null : restoresInProgress.get(source.restoreUUID());
    final RestoreInProgress.ShardRestoreStatus status = entry == null ? null : entry.shards().get(shardRouting.shardId());

    if (status != null && status.state().completed() == false) {
        assert status.state() != RestoreInProgress.State.SUCCESS : "expected shard [" + shardRouting + "] to be in initializing state but got [" + status.state() + "]";
        return allocation.decision(Decision.YES, NAME, "shard is currently being restored");
    }

    // No in-progress restore covers this shard any more: refuse and explain how to recover.
    return allocation.decision(
        Decision.NO,
        NAME,
        "shard has failed to be restored from the snapshot [%s] because of [%s] - " + "manually close or delete the index [%s] in order to retry to restore the snapshot again or use the reroute API to force the " + "allocation of an empty primary shard",
        source.snapshot(),
        shardRouting.unassignedInfo().getDetails(),
        shardRouting.getIndexName()
    );
}
Also used : RestoreInProgress(org.opensearch.cluster.RestoreInProgress) RecoverySource(org.opensearch.cluster.routing.RecoverySource)

Example 3 with RecoverySource

use of org.opensearch.cluster.routing.RecoverySource in project OpenSearch by opensearch-project.

The method testCanAllocatePrimaryExistingInRestoreInProgress of the class RestoreInProgressAllocationDeciderTests.

/**
 * Checks the decision for a snapshot-restored primary that has an entry in {@link RestoreInProgress}.
 * <p>
 * The shard's restore state is chosen at random: for STARTED or INIT the decision must be YES; for FAILURE (with an
 * I/O failure recorded in the shard's unassigned info) the decision must be NO with the full explanation.
 */
public void testCanAllocatePrimaryExistingInRestoreInProgress() {
    final RecoverySource.SnapshotRecoverySource snapshotSource = createSnapshotRecoverySource("_existing");

    // Register the "test" index as being restored from the snapshot.
    ClusterState state = createInitialClusterState();
    final RoutingTable restoreTable = RoutingTable.builder(state.getRoutingTable())
        .addAsRestore(state.getMetadata().index("test"), snapshotSource)
        .build();
    state = ClusterState.builder(state).routingTable(restoreTable).build();

    ShardRouting primary = state.getRoutingTable().shardRoutingTable("test", 0).primaryShard();
    assertEquals(ShardRoutingState.UNASSIGNED, primary.state());
    assertEquals(RecoverySource.Type.SNAPSHOT, primary.recoverySource().getType());

    RoutingTable routingTable = state.routingTable();
    final RestoreInProgress.State shardState;
    if (randomBoolean() == false) {
        shardState = RestoreInProgress.State.FAILURE;

        // Copy the unassigned info, attaching an I/O failure, and swap the updated primary into the routing table.
        final UnassignedInfo previous = primary.unassignedInfo();
        final UnassignedInfo withFailure = new UnassignedInfo(
            previous.getReason(),
            previous.getMessage(),
            new IOException("i/o failure"),
            previous.getNumFailedAllocations(),
            previous.getUnassignedTimeInNanos(),
            previous.getUnassignedTimeInMillis(),
            previous.isDelayed(),
            previous.getLastAllocationStatus(),
            previous.getFailedNodeIds()
        );
        primary = primary.updateUnassigned(withFailure, primary.recoverySource());

        final IndexRoutingTable currentIndexTable = routingTable.index("test");
        final IndexRoutingTable.Builder rebuiltIndexTable = IndexRoutingTable.builder(currentIndexTable.getIndex());
        for (final ObjectCursor<IndexShardRoutingTable> cursor : currentIndexTable.getShards().values()) {
            for (ShardRouting copy : cursor.value.getShards()) {
                rebuiltIndexTable.addShard(copy.primary() ? primary : copy);
            }
        }
        routingTable = RoutingTable.builder(routingTable).add(rebuiltIndexTable).build();
    } else {
        shardState = randomFrom(RestoreInProgress.State.STARTED, RestoreInProgress.State.INIT);
    }

    // Publish a started restore whose only shard carries the chosen state.
    final ImmutableOpenMap.Builder<ShardId, RestoreInProgress.ShardRestoreStatus> shardStatuses = ImmutableOpenMap.builder();
    shardStatuses.put(primary.shardId(), new RestoreInProgress.ShardRestoreStatus(state.getNodes().getLocalNodeId(), shardState));
    final RestoreInProgress.Entry restoreEntry = new RestoreInProgress.Entry(
        snapshotSource.restoreUUID(),
        snapshotSource.snapshot(),
        RestoreInProgress.State.STARTED,
        singletonList("test"),
        shardStatuses.build()
    );
    state = ClusterState.builder(state)
        .putCustom(RestoreInProgress.TYPE, new RestoreInProgress.Builder().add(restoreEntry).build())
        .routingTable(routingTable)
        .build();

    final Decision decision = executeAllocation(state, primary);
    if (shardState != RestoreInProgress.State.FAILURE) {
        assertEquals(Decision.Type.YES, decision.type());
        assertEquals("shard is currently being restored", decision.getExplanation());
    } else {
        assertEquals(Decision.Type.NO, decision.type());
        assertEquals("shard has failed to be restored from the snapshot [_repository:_existing/_uuid] because of " + "[restore_source[_repository/_existing], failure IOException[i/o failure]] - manually close or delete the index " + "[test] in order to retry to restore the snapshot again or use the reroute API to force the allocation of " + "an empty primary shard", decision.getExplanation());
    }
}
Also used : ClusterState(org.opensearch.cluster.ClusterState) IndexRoutingTable(org.opensearch.cluster.routing.IndexRoutingTable) IndexShardRoutingTable(org.opensearch.cluster.routing.IndexShardRoutingTable) UnassignedInfo(org.opensearch.cluster.routing.UnassignedInfo) IOException(java.io.IOException) ImmutableOpenMap(org.opensearch.common.collect.ImmutableOpenMap) RecoverySource(org.opensearch.cluster.routing.RecoverySource) ShardId(org.opensearch.index.shard.ShardId) Snapshot(org.opensearch.snapshots.Snapshot) RestoreInProgress(org.opensearch.cluster.RestoreInProgress) IndexRoutingTable(org.opensearch.cluster.routing.IndexRoutingTable) IndexShardRoutingTable(org.opensearch.cluster.routing.IndexShardRoutingTable) RoutingTable(org.opensearch.cluster.routing.RoutingTable) ShardRouting(org.opensearch.cluster.routing.ShardRouting)

Example 4 with RecoverySource

use of org.opensearch.cluster.routing.RecoverySource in project OpenSearch by opensearch-project.

The method recoverShardFromSnapshot of the class IndexShardTestCase.

/**
 * Recovers a shard from a snapshot using the given repository.
 * <p>
 * The shard is marked as recovering with a primary, INITIALIZING routing entry backed by a snapshot recovery
 * source, then the repository is asked to restore into the shard's store and the resulting future is awaited.
 *
 * @param shard      the shard to restore into
 * @param snapshot   the snapshot to restore from
 * @param repository the repository that performs the restore
 */
protected void recoverShardFromSnapshot(final IndexShard shard, final Snapshot snapshot, final Repository repository) {
    final ShardId targetShardId = shard.shardId();
    final IndexId snapshotIndexId = new IndexId(targetShardId.getIndex().getName(), targetShardId.getIndex().getUUID());
    final DiscoveryNode targetNode = getFakeDiscoNode(shard.routingEntry().currentNodeId());

    final RecoverySource.SnapshotRecoverySource snapshotSource = new RecoverySource.SnapshotRecoverySource(
        UUIDs.randomBase64UUID(),
        snapshot,
        Version.CURRENT,
        snapshotIndexId
    );
    final ShardRouting initializingPrimary = TestShardRouting.newShardRouting(
        targetShardId,
        targetNode.getId(),
        true,
        ShardRoutingState.INITIALIZING,
        snapshotSource
    );
    shard.markAsRecovering("from snapshot", new RecoveryState(initializingPrimary, targetNode, null));

    final PlainActionFuture<Void> restoreDone = PlainActionFuture.newFuture();
    repository.restoreShard(shard.store(), snapshot.getSnapshotId(), snapshotIndexId, shard.shardId(), shard.recoveryState(), restoreDone);
    restoreDone.actionGet();
}
Also used : IndexId(org.opensearch.repositories.IndexId) DiscoveryNode(org.opensearch.cluster.node.DiscoveryNode) Version(org.opensearch.Version) TestShardRouting.newShardRouting(org.opensearch.cluster.routing.TestShardRouting.newShardRouting) ShardRouting(org.opensearch.cluster.routing.ShardRouting) TestShardRouting(org.opensearch.cluster.routing.TestShardRouting) RecoveryState(org.opensearch.indices.recovery.RecoveryState) RecoverySource(org.opensearch.cluster.routing.RecoverySource)

Example 5 with RecoverySource

use of org.opensearch.cluster.routing.RecoverySource in project OpenSearch by opensearch-project.

The method testReplicaRecovery of the class IndexRecoveryIT.

/**
 * Starts a node, indexes documents, optionally closes the index, then starts a second node and sets the replica
 * count to one so that a replica is recovered onto it. Verifies the recovery states reported for both nodes and
 * that all documents are still searchable after the first node is stopped.
 */
public void testReplicaRecovery() throws Exception {
    final String nodeA = internalCluster().startNode();
    createIndex(
        INDEX_NAME,
        Settings.builder()
            .put(IndexMetadata.SETTING_NUMBER_OF_SHARDS, SHARD_COUNT)
            .put(IndexMetadata.SETTING_NUMBER_OF_REPLICAS, REPLICA_COUNT)
            .build()
    );
    ensureGreen(INDEX_NAME);

    final int docCount = scaledRandomIntBetween(0, 200);
    try (BackgroundIndexer backgroundIndexer = new BackgroundIndexer(INDEX_NAME, "_doc", client(), docCount)) {
        waitForDocs(docCount, backgroundIndexer);
    }
    refresh(INDEX_NAME);
    assertHitCount(client().prepareSearch(INDEX_NAME).setSize(0).get(), docCount);

    final boolean closedIndex = randomBoolean();
    if (closedIndex) {
        assertAcked(client().admin().indices().prepareClose(INDEX_NAME));
        ensureGreen(INDEX_NAME);
    }

    // force a shard recovery from nodeA to nodeB
    final String nodeB = internalCluster().startNode();
    final Settings.Builder oneReplica = Settings.builder().put(IndexMetadata.SETTING_NUMBER_OF_REPLICAS, 1);
    assertAcked(client().admin().indices().prepareUpdateSettings(INDEX_NAME).setSettings(oneReplica));
    ensureGreen(INDEX_NAME);

    final RecoveryResponse recoveryResponse = client().admin().indices().prepareRecoveries(INDEX_NAME).execute().actionGet();

    // we should now have two total shards, one primary and one replica
    final List<RecoveryState> allStates = recoveryResponse.shardRecoveryStates().get(INDEX_NAME);
    assertThat(allStates.size(), equalTo(2));
    final List<RecoveryState> statesOnA = findRecoveriesForTargetNode(nodeA, allStates);
    assertThat(statesOnA.size(), equalTo(1));
    final List<RecoveryState> statesOnB = findRecoveriesForTargetNode(nodeB, allStates);
    assertThat(statesOnB.size(), equalTo(1));

    // validate node A recovery: an open index is expected to report an empty-store source, a closed one an existing-store source
    final RecoveryState stateOnA = statesOnA.get(0);
    final RecoverySource expectedRecoverySource = closedIndex
        ? RecoverySource.ExistingStoreRecoverySource.INSTANCE
        : RecoverySource.EmptyStoreRecoverySource.INSTANCE;
    assertRecoveryState(stateOnA, 0, expectedRecoverySource, true, Stage.DONE, null, nodeA);
    validateIndexRecoveryState(stateOnA.getIndex());

    // validate node B recovery
    final RecoveryState stateOnB = statesOnB.get(0);
    assertRecoveryState(stateOnB, 0, PeerRecoverySource.INSTANCE, false, Stage.DONE, nodeA, nodeB);
    validateIndexRecoveryState(stateOnB.getIndex());

    // stop the original node and make sure the documents are still there
    internalCluster().stopRandomNode(InternalTestCluster.nameFilter(nodeA));
    if (closedIndex) {
        assertAcked(client().admin().indices().prepareOpen(INDEX_NAME));
    }
    assertHitCount(client().prepareSearch(INDEX_NAME).setSize(0).get(), docCount);
}
Also used : BackgroundIndexer(org.opensearch.test.BackgroundIndexer) RecoveryResponse(org.opensearch.action.admin.indices.recovery.RecoveryResponse) SnapshotRecoverySource(org.opensearch.cluster.routing.RecoverySource.SnapshotRecoverySource) RecoverySource(org.opensearch.cluster.routing.RecoverySource) PeerRecoverySource(org.opensearch.cluster.routing.RecoverySource.PeerRecoverySource)

Aggregations

RecoverySource (org.opensearch.cluster.routing.RecoverySource)9 ShardRouting (org.opensearch.cluster.routing.ShardRouting)5 RestoreInProgress (org.opensearch.cluster.RestoreInProgress)4 RoutingTable (org.opensearch.cluster.routing.RoutingTable)4 ShardId (org.opensearch.index.shard.ShardId)4 IndexMetadata (org.opensearch.cluster.metadata.IndexMetadata)3 Metadata (org.opensearch.cluster.metadata.Metadata)3 SnapshotRecoverySource (org.opensearch.cluster.routing.RecoverySource.SnapshotRecoverySource)3 ImmutableOpenMap (org.opensearch.common.collect.ImmutableOpenMap)3 Index (org.opensearch.index.Index)3 IndexId (org.opensearch.repositories.IndexId)3 IntHashSet (com.carrotsearch.hppc.IntHashSet)2 IOException (java.io.IOException)2 Collections (java.util.Collections)2 HashMap (java.util.HashMap)2 HashSet (java.util.HashSet)2 List (java.util.List)2 Map (java.util.Map)2 Set (java.util.Set)2 Collectors (java.util.stream.Collectors)2