Use of org.opensearch.test.transport.MockTransportService in project OpenSearch by opensearch-project.
The class IndexRecoveryIT, method testTransientErrorsDuringRecoveryAreRetried.
public void testTransientErrorsDuringRecoveryAreRetried() throws Exception {
final String indexName = "test";
final Settings nodeSettings = Settings.builder()
    .put(RecoverySettings.INDICES_RECOVERY_RETRY_DELAY_NETWORK_SETTING.getKey(), "100ms")
    .put(NodeConnectionsService.CLUSTER_NODE_RECONNECT_INTERVAL_SETTING.getKey(), "500ms")
    .put(RecoverySettings.INDICES_RECOVERY_INTERNAL_ACTION_TIMEOUT_SETTING.getKey(), "10s")
    .build();
// start a master node
internalCluster().startNode(nodeSettings);
final String blueNodeName = internalCluster().startNode(Settings.builder().put("node.attr.color", "blue").put(nodeSettings).build());
final String redNodeName = internalCluster().startNode(Settings.builder().put("node.attr.color", "red").put(nodeSettings).build());
ClusterHealthResponse response = client().admin().cluster().prepareHealth().setWaitForNodes(">=3").get();
assertThat(response.isTimedOut(), is(false));
client().admin().indices().prepareCreate(indexName)
    .setSettings(
        Settings.builder()
            .put(IndexMetadata.INDEX_ROUTING_INCLUDE_GROUP_SETTING.getKey() + "color", "blue")
            .put(IndexMetadata.SETTING_NUMBER_OF_SHARDS, 1)
            .put(IndexMetadata.SETTING_NUMBER_OF_REPLICAS, 0)
    )
    .get();
List<IndexRequestBuilder> requests = new ArrayList<>();
int numDocs = scaledRandomIntBetween(100, 8000);
// Index 3/4 of the documents and flush. And then index the rest. This attempts to ensure that there
// is a mix of file chunks and translog ops
int threeFourths = (int) (numDocs * 0.75);
for (int i = 0; i < threeFourths; i++) {
requests.add(client().prepareIndex(indexName).setSource("{}", XContentType.JSON));
}
indexRandom(true, requests);
flush(indexName);
requests.clear();
for (int i = threeFourths; i < numDocs; i++) {
requests.add(client().prepareIndex(indexName).setSource("{}", XContentType.JSON));
}
indexRandom(true, requests);
ensureSearchable(indexName);
ClusterStateResponse stateResponse = client().admin().cluster().prepareState().get();
final String blueNodeId = internalCluster().getInstance(ClusterService.class, blueNodeName).localNode().getId();
assertFalse(stateResponse.getState().getRoutingNodes().node(blueNodeId).isEmpty());
SearchResponse searchResponse = client().prepareSearch(indexName).get();
assertHitCount(searchResponse, numDocs);
String[] recoveryActions = new String[] {
    PeerRecoveryTargetService.Actions.PREPARE_TRANSLOG,
    PeerRecoveryTargetService.Actions.TRANSLOG_OPS,
    PeerRecoveryTargetService.Actions.FILES_INFO,
    PeerRecoveryTargetService.Actions.FILE_CHUNK,
    PeerRecoveryTargetService.Actions.CLEAN_FILES,
    PeerRecoveryTargetService.Actions.FINALIZE };
final String recoveryActionToBlock = randomFrom(recoveryActions);
logger.info("--> will temporarily interrupt recovery action between blue & red on [{}]", recoveryActionToBlock);
MockTransportService blueTransportService = (MockTransportService) internalCluster().getInstance(TransportService.class, blueNodeName);
MockTransportService redTransportService = (MockTransportService) internalCluster().getInstance(TransportService.class, redNodeName);
final AtomicBoolean recoveryStarted = new AtomicBoolean(false);
final AtomicBoolean finalizeReceived = new AtomicBoolean(false);
final SingleStartEnforcer validator = new SingleStartEnforcer(indexName, recoveryStarted, finalizeReceived);
redTransportService.addSendBehavior(blueTransportService, (connection, requestId, action, request, options) -> {
validator.accept(action, request);
connection.sendRequest(requestId, action, request, options);
});
Runnable connectionBreaker = () -> {
// Always break connection from source to remote to ensure that actions are retried
logger.info("--> closing connections from source node to target node");
blueTransportService.disconnectFromNode(redTransportService.getLocalDiscoNode());
if (randomBoolean()) {
// Sometimes break connection from remote to source to ensure that recovery is re-established
logger.info("--> closing connections from target node to source node");
redTransportService.disconnectFromNode(blueTransportService.getLocalDiscoNode());
}
};
TransientReceiveRejected handlingBehavior = new TransientReceiveRejected(recoveryActionToBlock, finalizeReceived, recoveryStarted, connectionBreaker);
redTransportService.addRequestHandlingBehavior(recoveryActionToBlock, handlingBehavior);
try {
logger.info("--> starting recovery from blue to red");
client().admin().indices().prepareUpdateSettings(indexName)
    .setSettings(
        Settings.builder()
            .put(IndexMetadata.INDEX_ROUTING_INCLUDE_GROUP_SETTING.getKey() + "color", "red,blue")
            .put(IndexMetadata.SETTING_NUMBER_OF_REPLICAS, 1)
    )
    .get();
ensureGreen();
searchResponse = client(redNodeName).prepareSearch(indexName).setPreference("_local").get();
assertHitCount(searchResponse, numDocs);
} finally {
blueTransportService.clearAllRules();
redTransportService.clearAllRules();
}
}
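The essence of the interception used above can be reduced to a small sketch. The node names, the AtomicBoolean, and the choice of ConnectTransportException are assumptions made for illustration; only addSendBehavior, clearAllRules, and the recovery action constants come from the test itself.

// Hedged sketch, not part of the test above: fail one FILE_CHUNK send with a transient-looking
// exception so the recovery retry path (configured via the 100ms retry delay) gets exercised.
MockTransportService blue = (MockTransportService) internalCluster().getInstance(TransportService.class, "blue-node"); // assumed node name
MockTransportService red = (MockTransportService) internalCluster().getInstance(TransportService.class, "red-node");   // assumed node name
final AtomicBoolean failedOnce = new AtomicBoolean();
blue.addSendBehavior(red, (connection, requestId, action, request, options) -> {
    if (PeerRecoveryTargetService.Actions.FILE_CHUNK.equals(action) && failedOnce.compareAndSet(false, true)) {
        // simulate the kind of transient connection failure the retry settings above are meant to absorb
        throw new ConnectTransportException(connection.getNode(), "simulated transient failure");
    }
    connection.sendRequest(requestId, action, request, options);
});
try {
    // ... trigger the replica recovery under test ...
} finally {
    blue.clearAllRules(); // always restore normal transport behavior so later steps are unaffected
}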
Use of org.opensearch.test.transport.MockTransportService in project OpenSearch by opensearch-project.
The class IndexRecoveryIT, method testRecoverLocallyUpToGlobalCheckpoint.
public void testRecoverLocallyUpToGlobalCheckpoint() throws Exception {
internalCluster().ensureAtLeastNumDataNodes(2);
List<String> nodes = randomSubsetOf(
    2,
    StreamSupport.stream(clusterService().state().nodes().getDataNodes().spliterator(), false)
        .map(node -> node.value.getName())
        .collect(Collectors.toSet())
);
String indexName = "test-index";
createIndex(
    indexName,
    Settings.builder()
        .put("index.number_of_shards", 1)
        .put("index.number_of_replicas", 1)
        .put(IndexService.GLOBAL_CHECKPOINT_SYNC_INTERVAL_SETTING.getKey(), "12h")
        .put("index.routing.allocation.include._name", String.join(",", nodes))
        .build()
);
ensureGreen(indexName);
int numDocs = randomIntBetween(0, 100);
indexRandom(
    randomBoolean(),
    false,
    randomBoolean(),
    IntStream.range(0, numDocs).mapToObj(n -> client().prepareIndex(indexName).setSource("num", n)).collect(toList())
);
// refresh up front to avoid a refresh kicking in while we are failing the shard
client().admin().indices().prepareRefresh(indexName).get();
String failingNode = randomFrom(nodes);
PlainActionFuture<StartRecoveryRequest> startRecoveryRequestFuture = new PlainActionFuture<>();
// Peer recovery fails if the primary does not see the recovering replica in the replication group (when the cluster state
// update on the primary is delayed). To verify the local recovery stats, we have to manually remember this value in the
// first try because the local recovery happens once and its stats is reset when the recovery fails.
SetOnce<Integer> localRecoveredOps = new SetOnce<>();
for (String node : nodes) {
MockTransportService transportService = (MockTransportService) internalCluster().getInstance(TransportService.class, node);
transportService.addSendBehavior((connection, requestId, action, request, options) -> {
if (action.equals(PeerRecoverySourceService.Actions.START_RECOVERY)) {
final RecoveryState recoveryState = internalCluster().getInstance(IndicesService.class, failingNode)
    .getShardOrNull(new ShardId(resolveIndex(indexName), 0))
    .recoveryState();
assertThat(recoveryState.getTranslog().recoveredOperations(), equalTo(recoveryState.getTranslog().totalLocal()));
if (startRecoveryRequestFuture.isDone()) {
assertThat(recoveryState.getTranslog().totalLocal(), equalTo(0));
recoveryState.getTranslog().totalLocal(localRecoveredOps.get());
recoveryState.getTranslog().incrementRecoveredOperations(localRecoveredOps.get());
} else {
localRecoveredOps.set(recoveryState.getTranslog().totalLocal());
startRecoveryRequestFuture.onResponse((StartRecoveryRequest) request);
}
}
if (action.equals(PeerRecoveryTargetService.Actions.FILE_CHUNK)) {
RetentionLeases retentionLeases = internalCluster().getInstance(IndicesService.class, node)
    .indexServiceSafe(resolveIndex(indexName))
    .getShard(0)
    .getRetentionLeases();
throw new AssertionError("expected an operation-based recovery; retention leases [" + Strings.toString(retentionLeases) + "]");
}
connection.sendRequest(requestId, action, request, options);
});
}
IndexShard shard = internalCluster().getInstance(IndicesService.class, failingNode).getShardOrNull(new ShardId(resolveIndex(indexName), 0));
final long lastSyncedGlobalCheckpoint = shard.getLastSyncedGlobalCheckpoint();
final long localCheckpointOfSafeCommit;
try (Engine.IndexCommitRef safeCommitRef = shard.acquireSafeIndexCommit()) {
localCheckpointOfSafeCommit = SequenceNumbers.loadSeqNoInfoFromLuceneCommit(safeCommitRef.getIndexCommit().getUserData().entrySet()).localCheckpoint;
}
final long maxSeqNo = shard.seqNoStats().getMaxSeqNo();
shard.failShard("test", new IOException("simulated"));
StartRecoveryRequest startRecoveryRequest = startRecoveryRequestFuture.actionGet();
logger.info("--> start recovery request: starting seq_no {}, commit {}", startRecoveryRequest.startingSeqNo(), startRecoveryRequest.metadataSnapshot().getCommitUserData());
SequenceNumbers.CommitInfo commitInfoAfterLocalRecovery = SequenceNumbers.loadSeqNoInfoFromLuceneCommit(startRecoveryRequest.metadataSnapshot().getCommitUserData().entrySet());
assertThat(commitInfoAfterLocalRecovery.localCheckpoint, equalTo(lastSyncedGlobalCheckpoint));
assertThat(commitInfoAfterLocalRecovery.maxSeqNo, equalTo(lastSyncedGlobalCheckpoint));
assertThat(startRecoveryRequest.startingSeqNo(), equalTo(lastSyncedGlobalCheckpoint + 1));
ensureGreen(indexName);
assertThat((long) localRecoveredOps.get(), equalTo(lastSyncedGlobalCheckpoint - localCheckpointOfSafeCommit));
for (RecoveryState recoveryState : client().admin().indices().prepareRecoveries().get().shardRecoveryStates().get(indexName)) {
if (startRecoveryRequest.targetNode().equals(recoveryState.getTargetNode())) {
assertThat("expect an operation-based recovery", recoveryState.getIndex().fileDetails(), empty());
assertThat("total recovered translog operations must include both local and remote recovery", recoveryState.getTranslog().recoveredOperations(), greaterThanOrEqualTo(Math.toIntExact(maxSeqNo - localCheckpointOfSafeCommit)));
}
}
for (String node : nodes) {
MockTransportService transportService = (MockTransportService) internalCluster().getInstance(TransportService.class, node);
transportService.clearAllRules();
}
}
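The capture trick this test relies on can be isolated into a short sketch. The node name below is a placeholder, and only APIs already used above (addSendBehavior, PlainActionFuture, the START_RECOVERY action) are assumed.

// Hedged sketch: record the first START_RECOVERY request seen on the wire without modifying it,
// so assertions can later be made on startingSeqNo() and the commit metadata it carries.
PlainActionFuture<StartRecoveryRequest> captured = new PlainActionFuture<>();
MockTransportService service =
    (MockTransportService) internalCluster().getInstance(TransportService.class, "data-node-0"); // assumed node name
service.addSendBehavior((connection, requestId, action, request, options) -> {
    if (PeerRecoverySourceService.Actions.START_RECOVERY.equals(action) && captured.isDone() == false) {
        captured.onResponse((StartRecoveryRequest) request);
    }
    connection.sendRequest(requestId, action, request, options);
});
// ... fail the shard, then ...
StartRecoveryRequest request = captured.actionGet();
service.clearAllRules();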
Use of org.opensearch.test.transport.MockTransportService in project OpenSearch by opensearch-project.
The class CorruptedFileIT, method testCorruptionOnNetworkLayerFinalizingRecovery.
/**
* This test triggers a corrupt index exception during finalization: if an empty commit point is transferred
* during recovery, we don't know the version of the segments_N file because it contains no segments we could
* take it from. This simulates recoveries from old indices, or indices without checksums, and makes sure that
* if we fail during finalization we also check whether the primary is ok. Without the relevant checks this
* test fails with a RED cluster.
*/
public void testCorruptionOnNetworkLayerFinalizingRecovery() throws ExecutionException, InterruptedException, IOException {
internalCluster().ensureAtLeastNumDataNodes(2);
NodesStatsResponse nodeStats = client().admin().cluster().prepareNodesStats().get();
List<NodeStats> dataNodeStats = new ArrayList<>();
for (NodeStats stat : nodeStats.getNodes()) {
if (stat.getNode().isDataNode()) {
dataNodeStats.add(stat);
}
}
assertThat(dataNodeStats.size(), greaterThanOrEqualTo(2));
Collections.shuffle(dataNodeStats, random());
NodeStats primariesNode = dataNodeStats.get(0);
NodeStats unluckyNode = dataNodeStats.get(1);
assertAcked(
    prepareCreate("test").setSettings(
        Settings.builder()
            .put(IndexMetadata.SETTING_NUMBER_OF_REPLICAS, "0")
            .put(IndexMetadata.SETTING_NUMBER_OF_SHARDS, 1)
            .put("index.routing.allocation.include._name", primariesNode.getNode().getName())
            .put(EnableAllocationDecider.INDEX_ROUTING_REBALANCE_ENABLE_SETTING.getKey(), EnableAllocationDecider.Rebalance.NONE)
            // keep on retrying
            .put("index.allocation.max_retries", Integer.MAX_VALUE)
    )
);
// allocated with empty commit
ensureGreen();
final AtomicBoolean corrupt = new AtomicBoolean(true);
final CountDownLatch hasCorrupted = new CountDownLatch(1);
for (NodeStats dataNode : dataNodeStats) {
MockTransportService mockTransportService = (MockTransportService) internalCluster()
    .getInstance(TransportService.class, dataNode.getNode().getName());
mockTransportService.addSendBehavior(
    internalCluster().getInstance(TransportService.class, unluckyNode.getNode().getName()),
    (connection, requestId, action, request, options) -> {
if (corrupt.get() && action.equals(PeerRecoveryTargetService.Actions.FILE_CHUNK)) {
RecoveryFileChunkRequest req = (RecoveryFileChunkRequest) request;
byte[] array = BytesRef.deepCopyOf(req.content().toBytesRef()).bytes;
int i = randomIntBetween(0, req.content().length() - 1);
// flip one byte in the content
array[i] = (byte) ~array[i];
hasCorrupted.countDown();
}
connection.sendRequest(requestId, action, request, options);
});
}
Settings build = Settings.builder()
    .put(IndexMetadata.SETTING_NUMBER_OF_REPLICAS, "1")
    .put("index.routing.allocation.include._name", primariesNode.getNode().getName() + "," + unluckyNode.getNode().getName())
    .build();
client().admin().indices().prepareUpdateSettings("test").setSettings(build).get();
client().admin().cluster().prepareReroute().get();
hasCorrupted.await();
corrupt.set(false);
ensureGreen();
}
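Stripped of the index setup, the corruption window above follows a gate-and-latch pattern. The sketch below is illustrative only: the service variables are placeholders and the byte-mangling step is elided.

// Hedged sketch: corrupt FILE_CHUNK traffic only until the first corrupted chunk is observed,
// then let the retried recovery run over a clean connection.
final AtomicBoolean corrupt = new AtomicBoolean(true);
final CountDownLatch hasCorrupted = new CountDownLatch(1);
sourceService.addSendBehavior(targetService, (connection, requestId, action, request, options) -> { // placeholder services
    if (corrupt.get() && PeerRecoveryTargetService.Actions.FILE_CHUNK.equals(action)) {
        // ... mangle the chunk bytes here ...
        hasCorrupted.countDown();
    }
    connection.sendRequest(requestId, action, request, options);
});
// trigger the replica allocation, then:
hasCorrupted.await(); // at least one chunk went out corrupted
corrupt.set(false);   // stop corrupting so the retried recovery can succeed
ensureGreen();        // the cluster must recover despite the earlier corruption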
Use of org.opensearch.test.transport.MockTransportService in project OpenSearch by opensearch-project.
The class CorruptedFileIT, method testCorruptionOnNetworkLayer.
/**
* Tests corruption that happens on the network layer: the primary must not be affected by corruption that
* happens on the way to the replica. The file on disk stays uncorrupted.
*/
public void testCorruptionOnNetworkLayer() throws ExecutionException, InterruptedException {
int numDocs = scaledRandomIntBetween(100, 1000);
internalCluster().ensureAtLeastNumDataNodes(2);
if (cluster().numDataNodes() < 3) {
internalCluster().startDataOnlyNode();
}
NodesStatsResponse nodeStats = client().admin().cluster().prepareNodesStats().get();
List<NodeStats> dataNodeStats = new ArrayList<>();
for (NodeStats stat : nodeStats.getNodes()) {
if (stat.getNode().isDataNode()) {
dataNodeStats.add(stat);
}
}
assertThat(dataNodeStats.size(), greaterThanOrEqualTo(2));
Collections.shuffle(dataNodeStats, random());
NodeStats primariesNode = dataNodeStats.get(0);
NodeStats unluckyNode = dataNodeStats.get(1);
assertAcked(
    prepareCreate("test").setSettings(
        Settings.builder()
            .put(IndexMetadata.SETTING_NUMBER_OF_REPLICAS, "0")
            // don't go crazy here, it must recover fast
            .put(IndexMetadata.SETTING_NUMBER_OF_SHARDS, between(1, 4))
            .put(MockFSIndexStore.INDEX_CHECK_INDEX_ON_CLOSE_SETTING.getKey(), false)
            .put("index.routing.allocation.include._name", primariesNode.getNode().getName())
            .put(EnableAllocationDecider.INDEX_ROUTING_REBALANCE_ENABLE_SETTING.getKey(), EnableAllocationDecider.Rebalance.NONE)
    )
);
ensureGreen();
IndexRequestBuilder[] builders = new IndexRequestBuilder[numDocs];
for (int i = 0; i < builders.length; i++) {
builders[i] = client().prepareIndex("test").setSource("field", "value");
}
indexRandom(true, builders);
ensureGreen();
// we have to flush at least once here since we don't corrupt the translog
assertAllSuccessful(client().admin().indices().prepareFlush().setForce(true).execute().actionGet());
SearchResponse countResponse = client().prepareSearch().setSize(0).get();
assertHitCount(countResponse, numDocs);
final boolean truncate = randomBoolean();
for (NodeStats dataNode : dataNodeStats) {
MockTransportService mockTransportService = (MockTransportService) internalCluster()
    .getInstance(TransportService.class, dataNode.getNode().getName());
mockTransportService.addSendBehavior(
    internalCluster().getInstance(TransportService.class, unluckyNode.getNode().getName()),
    (connection, requestId, action, request, options) -> {
if (action.equals(PeerRecoveryTargetService.Actions.FILE_CHUNK)) {
RecoveryFileChunkRequest req = (RecoveryFileChunkRequest) request;
if (truncate && req.length() > 1) {
BytesRef bytesRef = req.content().toBytesRef();
BytesArray array = new BytesArray(bytesRef.bytes, bytesRef.offset, (int) req.length() - 1);
request = new RecoveryFileChunkRequest(
    req.recoveryId(),
    req.requestSeqNo(),
    req.shardId(),
    req.metadata(),
    req.position(),
    array,
    req.lastChunk(),
    req.totalTranslogOps(),
    req.sourceThrottleTimeInNanos()
);
} else {
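// toBytesRef() must hand out the request's backing array rather than a copy; otherwise flipping
// a byte below would not corrupt the bytes that actually go over the wire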
assert req.content().toBytesRef().bytes == req.content().toBytesRef().bytes : "no internal reference!!";
final byte[] array = req.content().toBytesRef().bytes;
int i = randomIntBetween(0, req.content().length() - 1);
// flip one byte in the content
array[i] = (byte) ~array[i];
}
}
connection.sendRequest(requestId, action, request, options);
});
}
Settings build = Settings.builder().put(IndexMetadata.SETTING_NUMBER_OF_REPLICAS, "1").put("index.routing.allocation.include._name", "*").build();
client().admin().indices().prepareUpdateSettings("test").setSettings(build).get();
client().admin().cluster().prepareReroute().get();
ClusterHealthResponse actionGet = client().admin().cluster().health(Requests.clusterHealthRequest("test").waitForGreenStatus()).actionGet();
if (actionGet.isTimedOut()) {
logger.info("ensureGreen timed out, cluster state:\n{}\n{}", client().admin().cluster().prepareState().get().getState(), client().admin().cluster().preparePendingClusterTasks().get());
assertThat("timed out waiting for green state", actionGet.isTimedOut(), equalTo(false));
}
// we are green, so the primaries were not corrupted.
// ensure that no shard is actually allocated on the unlucky node
ClusterStateResponse clusterStateResponse = client().admin().cluster().prepareState().get();
for (IndexShardRoutingTable table : clusterStateResponse.getState().getRoutingTable().index("test")) {
for (ShardRouting routing : table) {
if (unluckyNode.getNode().getId().equals(routing.currentNodeId())) {
assertThat(routing.state(), not(equalTo(ShardRoutingState.STARTED)));
assertThat(routing.state(), not(equalTo(ShardRoutingState.RELOCATING)));
}
}
}
final int numIterations = scaledRandomIntBetween(5, 20);
for (int i = 0; i < numIterations; i++) {
SearchResponse response = client().prepareSearch().setSize(numDocs).get();
assertHitCount(response, numDocs);
}
}
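Unlike the IndexRecoveryIT test earlier, this test leaves its send behaviors installed. A cleanup loop in the style of testRecoverLocallyUpToGlobalCheckpoint, sketched below and reusing only names from this test, keeps any later operations on a clean transport.

// Hedged sketch, not part of the test above: remove the interceptors from every node they were
// installed on once the corruption scenario is finished.
for (NodeStats dataNode : dataNodeStats) {
    MockTransportService svc =
        (MockTransportService) internalCluster().getInstance(TransportService.class, dataNode.getNode().getName());
    svc.clearAllRules();
}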
Use of org.opensearch.test.transport.MockTransportService in project OpenSearch by opensearch-project.
The class CrossClusterSearchUnavailableClusterIT, method startTransport.
private static MockTransportService startTransport(final String id, final List<DiscoveryNode> knownNodes, final Version version, final ThreadPool threadPool) {
boolean success = false;
final Settings s = Settings.builder().put("node.name", id).build();
ClusterName clusterName = ClusterName.CLUSTER_NAME_SETTING.get(s);
MockTransportService newService = MockTransportService.createNewService(s, version, threadPool, null);
try {
newService.registerRequestHandler(ClusterSearchShardsAction.NAME, ThreadPool.Names.SAME, ClusterSearchShardsRequest::new, (request, channel, task) -> {
channel.sendResponse(new ClusterSearchShardsResponse(new ClusterSearchShardsGroup[0], knownNodes.toArray(new DiscoveryNode[0]), Collections.emptyMap()));
});
newService.registerRequestHandler(SearchAction.NAME, ThreadPool.Names.SAME, SearchRequest::new, (request, channel, task) -> {
InternalSearchResponse response = new InternalSearchResponse(
    new SearchHits(new SearchHit[0], new TotalHits(0, TotalHits.Relation.EQUAL_TO), Float.NaN),
    InternalAggregations.EMPTY,
    null,
    null,
    false,
    null,
    1
);
SearchResponse searchResponse = new SearchResponse(response, null, 1, 1, 0, 100, ShardSearchFailure.EMPTY_ARRAY, SearchResponse.Clusters.EMPTY);
channel.sendResponse(searchResponse);
});
newService.registerRequestHandler(ClusterStateAction.NAME, ThreadPool.Names.SAME, ClusterStateRequest::new, (request, channel, task) -> {
DiscoveryNodes.Builder builder = DiscoveryNodes.builder();
for (DiscoveryNode node : knownNodes) {
builder.add(node);
}
ClusterState build = ClusterState.builder(clusterName).nodes(builder.build()).build();
channel.sendResponse(new ClusterStateResponse(clusterName, build, false));
});
newService.start();
newService.acceptIncomingRequests();
success = true;
return newService;
} finally {
if (success == false) {
newService.close();
}
}
}
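A hedged usage sketch for the helper above: the thread-pool name, the "remote_seed" id, and the remote-cluster wiring comment are assumptions, while startTransport, getLocalDiscoNode, and boundAddress come from this file and from TransportService.

// Hypothetical caller: start a fake remote-cluster node, register it, and tear everything down.
ThreadPool threadPool = new TestThreadPool("cross-cluster-sketch");
List<DiscoveryNode> knownNodes = new CopyOnWriteArrayList<>();
try (MockTransportService remoteSeed = startTransport("remote_seed", knownNodes, Version.CURRENT, threadPool)) {
    knownNodes.add(remoteSeed.getLocalDiscoNode());
    // ... point cluster.remote.<alias>.seeds at remoteSeed.boundAddress() and issue searches ...
} finally {
    ThreadPool.terminate(threadPool, 10, TimeUnit.SECONDS);
}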