Use of org.opensearch.cluster.routing.allocation.command.AllocateStalePrimaryAllocationCommand in project OpenSearch by opensearch-project.
The class AllocationIdIT, method testFailedRecoveryOnAllocateStalePrimaryRequiresAnotherAllocateStalePrimary.
public void testFailedRecoveryOnAllocateStalePrimaryRequiresAnotherAllocateStalePrimary() throws Exception {
    /*
     * The allocation id is set when the shard starts, while the historyUUID is adjusted only after recovery is done.
     *
     * If, during execution of AllocateStalePrimary, a proper allocation id were stored in the in-sync allocation id set
     * and the recovery then failed, a shard restart would skip the stage where the historyUUID is changed.
     *
     * That would lead to a situation where the allocated stale primary and its replica belong to the same historyUUID:
     * the replica would receive operations after the local checkpoint even though the documents before that checkpoint
     * could be significantly different.
     *
     * Therefore, on AllocateStalePrimary we put a fake allocation id (no real one could be generated like that),
     * and any failure during recovery requires an extra AllocateStalePrimary command to be executed.
     */
    // initial set up
    final String indexName = "index42";
    final String master = internalCluster().startMasterOnlyNode();
    String node1 = internalCluster().startNode();
    createIndex(
        indexName,
        Settings.builder()
            .put(IndexMetadata.SETTING_NUMBER_OF_SHARDS, 1)
            .put(IndexMetadata.SETTING_NUMBER_OF_REPLICAS, 1)
            .put(IndexSettings.INDEX_CHECK_ON_STARTUP.getKey(), "checksum")
            .build()
    );
    final int numDocs = indexDocs(indexName, "foo", "bar");
    final IndexSettings indexSettings = getIndexSettings(indexName, node1);
    final Set<String> allocationIds = getAllocationIds(indexName);
    final ShardId shardId = new ShardId(resolveIndex(indexName), 0);
    final Path indexPath = getIndexPath(node1, shardId);
    assertThat(allocationIds, hasSize(1));
    final String historyUUID = historyUUID(node1, indexName);
    String node2 = internalCluster().startNode();
    ensureGreen(indexName);
    internalCluster().assertSameDocIdsOnShards();
    // initial set up is done
    Settings node1DataPathSettings = internalCluster().dataPathSettings(node1);
    Settings node2DataPathSettings = internalCluster().dataPathSettings(node2);
    internalCluster().stopRandomNode(InternalTestCluster.nameFilter(node1));
    // index more docs on node2; this marks node1's copy as stale
    int numExtraDocs = indexDocs(indexName, "foo", "bar2");
    assertHitCount(client(node2).prepareSearch(indexName).setQuery(matchAllQuery()).get(), numDocs + numExtraDocs);
    internalCluster().stopRandomNode(InternalTestCluster.nameFilter(node2));
    // create a fake corruption marker on node1
    putFakeCorruptionMarker(indexSettings, shardId, indexPath);
    // as far as the master is concerned, node1 is out of sync
    node1 = internalCluster().startNode(node1DataPathSettings);
    // there is only a _stale_ primary copy
    checkNoValidShardCopy(indexName, shardId);
    // allocate the stale primary
    client(node1).admin()
        .cluster()
        .prepareReroute()
        .add(new AllocateStalePrimaryAllocationCommand(indexName, 0, node1, true))
        .get();
    // allocation fails due to the corruption marker
    assertBusy(() -> {
        final ClusterState state = client().admin().cluster().prepareState().get().getState();
        final ShardRouting shardRouting = state.routingTable().index(indexName).shard(shardId.id()).primaryShard();
        assertThat(shardRouting.state(), equalTo(ShardRoutingState.UNASSIGNED));
        assertThat(shardRouting.unassignedInfo().getReason(), equalTo(UnassignedInfo.Reason.ALLOCATION_FAILED));
    });
    internalCluster().stopRandomNode(InternalTestCluster.nameFilter(node1));
    try (Store store = new Store(shardId, indexSettings, new NIOFSDirectory(indexPath), new DummyShardLock(shardId))) {
        store.removeCorruptionMarker();
    }
    node1 = internalCluster().startNode(node1DataPathSettings);
    // the index is red: no shard is allocated (the allocation id is a fake id that matches nothing)
    checkHealthStatus(indexName, ClusterHealthStatus.RED);
    checkNoValidShardCopy(indexName, shardId);
    // no valid shard copy exists; we have to invoke AllocateStalePrimary again
    client().admin()
        .cluster()
        .prepareReroute()
        .add(new AllocateStalePrimaryAllocationCommand(indexName, 0, node1, true))
        .get();
    ensureYellow(indexName);
    // bring node2 back
    node2 = internalCluster().startNode(node2DataPathSettings);
    ensureGreen(indexName);
    assertThat(historyUUID(node1, indexName), not(equalTo(historyUUID)));
    assertThat(historyUUID(node1, indexName), equalTo(historyUUID(node2, indexName)));
    internalCluster().assertSameDocIdsOnShards();
}
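For orientation, the prepareReroute().add(...) calls above are the transport-level form of the public cluster reroute API. Below is a minimal sketch of the equivalent REST request and Java client call; the index and node names are taken from the test above and should be treated as illustrative:

// POST /_cluster/reroute
// {
//   "commands": [
//     { "allocate_stale_primary": { "index": "index42", "shard": 0, "node": "node1", "accept_data_loss": true } }
//   ]
// }
//
// The same command through the Java client, as the test issues it:
client().admin()
    .cluster()
    .prepareReroute()
    .add(new AllocateStalePrimaryAllocationCommand("index42", 0, "node1", true))
    .get();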
Use of org.opensearch.cluster.routing.allocation.command.AllocateStalePrimaryAllocationCommand in project OpenSearch by opensearch-project.
The class PrimaryAllocationIT, method testForceStaleReplicaToBePromotedForMissingIndex.
public void testForceStaleReplicaToBePromotedForMissingIndex() {
    internalCluster().startMasterOnlyNode(Settings.EMPTY);
    final String dataNode = internalCluster().startDataOnlyNode();
    final String idxName = "test";
    IndexNotFoundException ex = expectThrows(
        IndexNotFoundException.class,
        () -> client().admin()
            .cluster()
            .prepareReroute()
            .add(new AllocateStalePrimaryAllocationCommand(idxName, 0, dataNode, true))
            .get()
    );
    assertThat(ex.getIndex().getName(), equalTo(idxName));
}
Use of org.opensearch.cluster.routing.allocation.command.AllocateStalePrimaryAllocationCommand in project OpenSearch by opensearch-project.
The class AllocationCommandsTests, method testAllocateStalePrimaryCommand.
public void testAllocateStalePrimaryCommand() {
    AllocationService allocation = createAllocationService(
        Settings.builder()
            .put(EnableAllocationDecider.CLUSTER_ROUTING_ALLOCATION_ENABLE_SETTING.getKey(), "none")
            .put(EnableAllocationDecider.CLUSTER_ROUTING_REBALANCE_ENABLE_SETTING.getKey(), "none")
            .build()
    );
    final String index = "test";
    logger.info("--> building initial routing table");
    Metadata metadata = Metadata.builder()
        .put(
            IndexMetadata.builder(index)
                .settings(settings(Version.CURRENT))
                .numberOfShards(1)
                .numberOfReplicas(1)
                .putInSyncAllocationIds(0, Collections.singleton("asdf"))
                .putInSyncAllocationIds(1, Collections.singleton("qwertz"))
        )
        .build();
    // shard routing is added as "from recovery" instead of "new index creation" so that the primary recovers from an
    // existing store; allocating such a primary with the accept_data_loss flag set to false would fail
    RoutingTable routingTable = RoutingTable.builder().addAsRecovery(metadata.index(index)).build();
    ClusterState clusterState = ClusterState.builder(org.opensearch.cluster.ClusterName.CLUSTER_NAME_SETTING.getDefault(Settings.EMPTY))
        .metadata(metadata)
        .routingTable(routingTable)
        .build();
    final String node1 = "node1";
    final String node2 = "node2";
    clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder().add(newNode(node1)).add(newNode(node2))).build();
    clusterState = allocation.reroute(clusterState, "reroute");
    // all shard copies are stale: the registered in-sync allocation ids match no live copy, so both shards stay unassigned
    final List<ShardRouting> shardRoutings = clusterState.getRoutingNodes().shardsWithState(UNASSIGNED);
    assertThat(shardRoutings, hasSize(2));
logger.info("--> allocating empty primary with acceptDataLoss flag set to true");
clusterState = allocation.reroute(clusterState, new AllocationCommands(new AllocateStalePrimaryAllocationCommand(index, 0, node1, true)), false, false).getClusterState();
RoutingNode routingNode1 = clusterState.getRoutingNodes().node(node1);
assertThat(routingNode1.size(), equalTo(1));
assertThat(routingNode1.shardsWithState(INITIALIZING).size(), equalTo(1));
Set<String> inSyncAllocationIds = clusterState.metadata().index(index).inSyncAllocationIds(0);
assertThat(inSyncAllocationIds, equalTo(Collections.singleton(RecoverySource.ExistingStoreRecoverySource.FORCED_ALLOCATION_ID)));
clusterState = startInitializingShardsAndReroute(allocation, clusterState);
routingNode1 = clusterState.getRoutingNodes().node(node1);
assertThat(routingNode1.size(), equalTo(1));
assertThat(routingNode1.shardsWithState(STARTED).size(), equalTo(1));
inSyncAllocationIds = clusterState.metadata().index(index).inSyncAllocationIds(0);
assertThat(inSyncAllocationIds, hasSize(1));
assertThat(inSyncAllocationIds, not(Collections.singleton(RecoverySource.ExistingStoreRecoverySource.FORCED_ALLOCATION_ID)));
}
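Note how the in-sync allocation id set transitions: while the forced primary is INITIALIZING it holds the sentinel RecoverySource.ExistingStoreRecoverySource.FORCED_ALLOCATION_ID, and only once the shard is STARTED is the sentinel replaced by a real allocation id. The test's opening comment also mentions the accept_data_loss safeguard; here is a minimal sketch of what that rejection check could look like, reusing the fixtures above. The exception type is an assumption about how command failures surface from reroute, not a quote from the source:

// Hedged sketch: without acceptDataLoss, the stale-primary command should be rejected,
// since forcing allocation of a stale copy can discard acknowledged writes.
// Assumption: the rejection surfaces as an IllegalArgumentException.
// This must run against the cluster state *before* the successful allocation above.
final ClusterState unassignedState = clusterState;
expectThrows(
    IllegalArgumentException.class,
    () -> allocation.reroute(
        unassignedState,
        new AllocationCommands(new AllocateStalePrimaryAllocationCommand(index, 0, node1, false)),
        false,
        false
    )
);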
Use of org.opensearch.cluster.routing.allocation.command.AllocateStalePrimaryAllocationCommand in project OpenSearch by opensearch-project.
The class AllocationCommandsTests, method testSerialization.
public void testSerialization() throws Exception {
    AllocationCommands commands = new AllocationCommands(
        new AllocateEmptyPrimaryAllocationCommand("test", 1, "node1", true),
        new AllocateStalePrimaryAllocationCommand("test", 2, "node1", true),
        new AllocateReplicaAllocationCommand("test", 2, "node1"),
        new MoveAllocationCommand("test", 3, "node2", "node3"),
        new CancelAllocationCommand("test", 4, "node5", true)
    );
    BytesStreamOutput bytes = new BytesStreamOutput();
    AllocationCommands.writeTo(commands, bytes);
    StreamInput in = bytes.bytes().streamInput();
    // Since the commands are named writeable we need to register them and wrap the input stream
    NamedWriteableRegistry namedWriteableRegistry = new NamedWriteableRegistry(NetworkModule.getNamedWriteables());
    in = new NamedWriteableAwareStreamInput(in, namedWriteableRegistry);
    // Now we can read them!
    AllocationCommands sCommands = AllocationCommands.readFrom(in);
    assertThat(sCommands.commands().size(), equalTo(5));
    assertThat(((AllocateEmptyPrimaryAllocationCommand) sCommands.commands().get(0)).shardId(), equalTo(1));
    assertThat(((AllocateEmptyPrimaryAllocationCommand) sCommands.commands().get(0)).index(), equalTo("test"));
    assertThat(((AllocateEmptyPrimaryAllocationCommand) sCommands.commands().get(0)).node(), equalTo("node1"));
    assertThat(((AllocateEmptyPrimaryAllocationCommand) sCommands.commands().get(0)).acceptDataLoss(), equalTo(true));
    assertThat(((AllocateStalePrimaryAllocationCommand) sCommands.commands().get(1)).shardId(), equalTo(2));
    assertThat(((AllocateStalePrimaryAllocationCommand) sCommands.commands().get(1)).index(), equalTo("test"));
    assertThat(((AllocateStalePrimaryAllocationCommand) sCommands.commands().get(1)).node(), equalTo("node1"));
    assertThat(((AllocateStalePrimaryAllocationCommand) sCommands.commands().get(1)).acceptDataLoss(), equalTo(true));
    assertThat(((AllocateReplicaAllocationCommand) sCommands.commands().get(2)).shardId(), equalTo(2));
    assertThat(((AllocateReplicaAllocationCommand) sCommands.commands().get(2)).index(), equalTo("test"));
    assertThat(((AllocateReplicaAllocationCommand) sCommands.commands().get(2)).node(), equalTo("node1"));
    assertThat(((MoveAllocationCommand) sCommands.commands().get(3)).shardId(), equalTo(3));
    assertThat(((MoveAllocationCommand) sCommands.commands().get(3)).index(), equalTo("test"));
    assertThat(((MoveAllocationCommand) sCommands.commands().get(3)).fromNode(), equalTo("node2"));
    assertThat(((MoveAllocationCommand) sCommands.commands().get(3)).toNode(), equalTo("node3"));
    assertThat(((CancelAllocationCommand) sCommands.commands().get(4)).shardId(), equalTo(4));
    assertThat(((CancelAllocationCommand) sCommands.commands().get(4)).index(), equalTo("test"));
    assertThat(((CancelAllocationCommand) sCommands.commands().get(4)).node(), equalTo("node5"));
    assertThat(((CancelAllocationCommand) sCommands.commands().get(4)).allowPrimary(), equalTo(true));
}
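The registration step in the middle of this test is the part worth remembering: AllocationCommand implementations are NamedWriteables, so the reading side must wrap the stream with a NamedWriteableRegistry that knows the command names. Here is a minimal sketch of that round trip factored into a reusable helper; the helper name is hypothetical, everything else is the same API the test uses:

// Hypothetical helper: serialize AllocationCommands and read them back through a
// NamedWriteableAwareStreamInput, mirroring the steps of the test above.
static AllocationCommands roundTrip(AllocationCommands commands) throws IOException {
    BytesStreamOutput out = new BytesStreamOutput();
    AllocationCommands.writeTo(commands, out);
    NamedWriteableRegistry registry = new NamedWriteableRegistry(NetworkModule.getNamedWriteables());
    try (StreamInput in = new NamedWriteableAwareStreamInput(out.bytes().streamInput(), registry)) {
        return AllocationCommands.readFrom(in);
    }
}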
Use of org.opensearch.cluster.routing.allocation.command.AllocateStalePrimaryAllocationCommand in project OpenSearch by opensearch-project.
The class RemoveCorruptedShardDataCommandIT, method testCorruptTranslogTruncation.
public void testCorruptTranslogTruncation() throws Exception {
    internalCluster().startNodes(2);
    final String node1 = internalCluster().getNodeNames()[0];
    final String node2 = internalCluster().getNodeNames()[1];
    final String indexName = "test";
    assertAcked(
        prepareCreate(indexName).setSettings(
            Settings.builder()
                .put(IndexMetadata.SETTING_NUMBER_OF_SHARDS, 1)
                .put(IndexMetadata.SETTING_NUMBER_OF_REPLICAS, 1)
                .put(IndexSettings.INDEX_REFRESH_INTERVAL_SETTING.getKey(), "-1")
                // never flush - always recover from translog
                .put(MockEngineSupport.DISABLE_FLUSH_ON_CLOSE.getKey(), true)
                .put("index.routing.allocation.exclude._name", node2)
        )
    );
    ensureYellow();
    assertAcked(
        client().admin()
            .indices()
            .prepareUpdateSettings(indexName)
            .setSettings(Settings.builder().putNull("index.routing.allocation.exclude._name"))
    );
    ensureGreen();
    // Index some documents
    int numDocsToKeep = randomIntBetween(10, 100);
    logger.info("--> indexing [{}] docs to be kept", numDocsToKeep);
    IndexRequestBuilder[] builders = new IndexRequestBuilder[numDocsToKeep];
    for (int i = 0; i < builders.length; i++) {
        builders[i] = client().prepareIndex(indexName).setSource("foo", "bar");
    }
    indexRandom(false, false, false, Arrays.asList(builders));
    flush(indexName);
    disableTranslogFlush(indexName);
    // having no extra docs is an interesting case for seq-no-based recoveries - test it more often
    int numDocsToTruncate = randomBoolean() ? 0 : randomIntBetween(0, 100);
    logger.info("--> indexing [{}] more docs to be truncated", numDocsToTruncate);
    builders = new IndexRequestBuilder[numDocsToTruncate];
    for (int i = 0; i < builders.length; i++) {
        builders[i] = client().prepareIndex(indexName).setSource("foo", "bar");
    }
    indexRandom(false, false, false, Arrays.asList(builders));
    RemoveCorruptedShardDataCommand command = new RemoveCorruptedShardDataCommand();
    MockTerminal terminal = new MockTerminal();
    OptionParser parser = command.getParser();
    if (randomBoolean() && numDocsToTruncate > 0) {
        // flush the replica, so it will have more docs than what the primary will have
        Index index = resolveIndex(indexName);
        IndexShard replica = internalCluster().getInstance(IndicesService.class, node2).getShardOrNull(new ShardId(index, 0));
        replica.flush(new FlushRequest());
        logger.info("--> performed extra flushing on replica");
    }
    final Settings node1PathSettings = internalCluster().dataPathSettings(node1);
    final Settings node2PathSettings = internalCluster().dataPathSettings(node2);
    // shut down the replica node; it will be tested later
    internalCluster().stopRandomNode(InternalTestCluster.nameFilter(node2));
    final Path translogDir = getPathToShardData(indexName, ShardPath.TRANSLOG_FOLDER_NAME);
    final Path indexDir = getPathToShardData(indexName, ShardPath.INDEX_FOLDER_NAME);
    // restart the single remaining data node, corrupting the translog while it is down
    logger.info("--> restarting node");
    internalCluster().restartRandomDataNode(new InternalTestCluster.RestartCallback() {
        @Override
        public Settings onNodeStopped(String nodeName) throws Exception {
            logger.info("--> corrupting translog on node {}", nodeName);
            TestTranslog.corruptRandomTranslogFile(logger, random(), translogDir);
            return super.onNodeStopped(nodeName);
        }
    });
    // all shards should be failed due to a corrupted translog
    assertBusy(() -> {
        final UnassignedInfo unassignedInfo = client().admin()
            .cluster()
            .prepareAllocationExplain()
            .setIndex(indexName)
            .setShard(0)
            .setPrimary(true)
            .get()
            .getExplanation()
            .getUnassignedInfo();
        assertThat(unassignedInfo.getReason(), equalTo(UnassignedInfo.Reason.ALLOCATION_FAILED));
        assertThat(ExceptionsHelper.unwrap(unassignedInfo.getFailure(), TranslogCorruptedException.class), not(nullValue()));
    });
    // have to shut down the primary node - otherwise the node lock is still held
    internalCluster().restartNode(node1, new InternalTestCluster.RestartCallback() {
        @Override
        public Settings onNodeStopped(String nodeName) throws Exception {
            assertBusy(() -> {
                logger.info("--> checking that lock has been released for {}", indexDir);
                // noinspection EmptyTryBlock since we're just trying to obtain the lock
                try (
                    Directory dir = FSDirectory.open(indexDir, NativeFSLockFactory.INSTANCE);
                    Lock ignored = dir.obtainLock(IndexWriter.WRITE_LOCK_NAME)
                ) {} catch (LockObtainFailedException lofe) {
                    logger.info("--> failed acquiring lock for {}", indexDir);
                    throw new AssertionError("still waiting for lock release at [" + indexDir + "]", lofe);
                } catch (IOException ioe) {
                    throw new AssertionError("unexpected IOException [" + indexDir + "]", ioe);
                }
            });
            final Environment environment = TestEnvironment.newEnvironment(
                Settings.builder().put(internalCluster().getDefaultSettings()).put(node1PathSettings).build()
            );
            // confirm the tool's interactive prompt
            terminal.addTextInput("y");
            OptionSet options = parser.parse("-d", translogDir.toAbsolutePath().toString());
            logger.info("--> running command for [{}]", translogDir.toAbsolutePath());
            command.execute(terminal, options, environment);
            logger.info("--> output:\n{}", terminal.getOutput());
            return super.onNodeStopped(nodeName);
        }
    });
    String primaryNodeId = null;
    final ClusterState state = client().admin().cluster().prepareState().get().getState();
    final DiscoveryNodes nodes = state.nodes();
    for (ObjectObjectCursor<String, DiscoveryNode> cursor : nodes.getNodes()) {
        final String name = cursor.value.getName();
        if (name.equals(node1)) {
            primaryNodeId = cursor.key;
            break;
        }
    }
    assertThat(primaryNodeId, notNullValue());
    assertThat(terminal.getOutput(), containsString("allocate_stale_primary"));
    assertThat(terminal.getOutput(), containsString("\"node\" : \"" + primaryNodeId + "\""));
    // there is only a _stale_ primary copy left (the tool forced a new allocation id)
    assertBusy(() -> {
        final ClusterAllocationExplanation explanation = client().admin()
            .cluster()
            .prepareAllocationExplain()
            .setIndex(indexName)
            .setShard(0)
            .setPrimary(true)
            .get()
            .getExplanation();
        final ShardAllocationDecision shardAllocationDecision = explanation.getShardAllocationDecision();
        assertThat(shardAllocationDecision.isDecisionTaken(), equalTo(true));
        assertThat(shardAllocationDecision.getAllocateDecision().getAllocationDecision(), equalTo(AllocationDecision.NO_VALID_SHARD_COPY));
    });
    client().admin()
        .cluster()
        .prepareReroute()
        .add(new AllocateStalePrimaryAllocationCommand(indexName, 0, primaryNodeId, true))
        .get();
    assertBusy(() -> {
        final ClusterAllocationExplanation explanation = client().admin()
            .cluster()
            .prepareAllocationExplain()
            .setIndex(indexName)
            .setShard(0)
            .setPrimary(true)
            .get()
            .getExplanation();
        assertThat(explanation.getCurrentNode(), notNullValue());
        assertThat(explanation.getShardState(), equalTo(ShardRoutingState.STARTED));
    });
    ensureYellow(indexName);
    // Run a search and make sure it succeeds
    assertHitCount(client().prepareSearch(indexName).setQuery(matchAllQuery()).get(), numDocsToKeep);
    logger.info("--> starting the replica node to test recovery");
    internalCluster().startNode(node2PathSettings);
    ensureGreen(indexName);
    for (String node : internalCluster().nodesInclude(indexName)) {
        SearchRequestBuilder q = client().prepareSearch(indexName).setPreference("_only_nodes:" + node).setQuery(matchAllQuery());
        assertHitCount(q.get(), numDocsToKeep);
    }
    final RecoveryResponse recoveryResponse = client().admin().indices().prepareRecoveries(indexName).setActiveOnly(false).get();
    final RecoveryState replicaRecoveryState = recoveryResponse.shardRecoveryStates()
        .get(indexName)
        .stream()
        .filter(recoveryState -> recoveryState.getPrimary() == false)
        .findFirst()
        .get();
    assertThat(replicaRecoveryState.getIndex().toString(), replicaRecoveryState.getIndex().recoveredFileCount(), greaterThan(0));
    // Ensure that the global checkpoint and local checkpoint are restored from the max seqno of the last commit.
    final SeqNoStats seqNoStats = getSeqNoStats(indexName, 0);
    assertThat(seqNoStats.getGlobalCheckpoint(), equalTo(seqNoStats.getMaxSeqNo()));
    assertThat(seqNoStats.getLocalCheckpoint(), equalTo(seqNoStats.getMaxSeqNo()));
}
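This test polls the allocation-explain API twice with the same shape, first waiting for NO_VALID_SHARD_COPY and then for the started primary. A minimal sketch of that polling factored into a helper; the method name is hypothetical, the calls are the ones the test uses:

// Hypothetical helper: wait until the primary of shard 0 of the given index
// reaches the expected allocation decision.
private void assertPrimaryAllocationDecision(String indexName, AllocationDecision expected) throws Exception {
    assertBusy(() -> {
        final ClusterAllocationExplanation explanation = client().admin()
            .cluster()
            .prepareAllocationExplain()
            .setIndex(indexName)
            .setShard(0)
            .setPrimary(true)
            .get()
            .getExplanation();
        final ShardAllocationDecision decision = explanation.getShardAllocationDecision();
        assertThat(decision.isDecisionTaken(), equalTo(true));
        assertThat(decision.getAllocateDecision().getAllocationDecision(), equalTo(expected));
    });
}

With such a helper, the first wait above would read assertPrimaryAllocationDecision(indexName, AllocationDecision.NO_VALID_SHARD_COPY).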