Search in sources :

Example 1 with RecoveryFileChunkRequest

use of org.elasticsearch.indices.recovery.RecoveryFileChunkRequest in project elasticsearch by elastic.

the class TruncatedRecoveryIT method testCancelRecoveryAndResume.

/**
     * This test tries to truncate some of larger files in the index to trigger leftovers on the recovery
     * target. This happens during recovery when the last chunk of the file is transferred to the replica
     * we just throw an exception to make sure the recovery fails and we leave some half baked files on the target.
     * Later we allow full recovery to ensure we can still recover and don't run into corruptions.
     */
public void testCancelRecoveryAndResume() throws Exception {
    assertTrue(client().admin().cluster().prepareUpdateSettings().setTransientSettings(Settings.builder().put(CHUNK_SIZE_SETTING.getKey(), new ByteSizeValue(randomIntBetween(50, 300), ByteSizeUnit.BYTES))).get().isAcknowledged());
    NodesStatsResponse nodeStats = client().admin().cluster().prepareNodesStats().get();
    List<NodeStats> dataNodeStats = new ArrayList<>();
    for (NodeStats stat : nodeStats.getNodes()) {
        if (stat.getNode().isDataNode()) {
            dataNodeStats.add(stat);
        }
    }
    assertThat(dataNodeStats.size(), greaterThanOrEqualTo(2));
    Collections.shuffle(dataNodeStats, random());
    // we use 2 nodes a lucky and unlucky one
    // the lucky one holds the primary
    // the unlucky one gets the replica and the truncated leftovers
    NodeStats primariesNode = dataNodeStats.get(0);
    NodeStats unluckyNode = dataNodeStats.get(1);
    // create the index and prevent allocation on any other nodes than the lucky one
    // we have no replicas so far and make sure that we allocate the primary on the lucky node
    assertAcked(prepareCreate("test").addMapping("type1", "field1", "type=text", "the_id", "type=text").setSettings(Settings.builder().put(IndexMetaData.SETTING_NUMBER_OF_REPLICAS, 0).put(IndexMetaData.SETTING_NUMBER_OF_SHARDS, numberOfShards()).put("index.routing.allocation.include._name", // only allocate on the lucky node
    primariesNode.getNode().getName())));
    // index some docs and check if they are coming back
    int numDocs = randomIntBetween(100, 200);
    List<IndexRequestBuilder> builder = new ArrayList<>();
    for (int i = 0; i < numDocs; i++) {
        String id = Integer.toString(i);
        builder.add(client().prepareIndex("test", "type1", id).setSource("field1", English.intToEnglish(i), "the_id", id));
    }
    indexRandom(true, builder);
    for (int i = 0; i < numDocs; i++) {
        String id = Integer.toString(i);
        assertHitCount(client().prepareSearch().setQuery(QueryBuilders.termQuery("the_id", id)).get(), 1);
    }
    ensureGreen();
    // ensure we have flushed segments and make them a big one via optimize
    client().admin().indices().prepareFlush().setForce(true).get();
    client().admin().indices().prepareForceMerge().setMaxNumSegments(1).setFlush(true).get();
    final CountDownLatch latch = new CountDownLatch(1);
    final AtomicBoolean truncate = new AtomicBoolean(true);
    for (NodeStats dataNode : dataNodeStats) {
        MockTransportService mockTransportService = ((MockTransportService) internalCluster().getInstance(TransportService.class, dataNode.getNode().getName()));
        mockTransportService.addDelegate(internalCluster().getInstance(TransportService.class, unluckyNode.getNode().getName()), new MockTransportService.DelegateTransport(mockTransportService.original()) {

            @Override
            protected void sendRequest(Connection connection, long requestId, String action, TransportRequest request, TransportRequestOptions options) throws IOException {
                if (action.equals(PeerRecoveryTargetService.Actions.FILE_CHUNK)) {
                    RecoveryFileChunkRequest req = (RecoveryFileChunkRequest) request;
                    logger.debug("file chunk [{}] lastChunk: {}", req, req.lastChunk());
                    if ((req.name().endsWith("cfs") || req.name().endsWith("fdt")) && req.lastChunk() && truncate.get()) {
                        latch.countDown();
                        throw new RuntimeException("Caused some truncated files for fun and profit");
                    }
                }
                super.sendRequest(connection, requestId, action, request, options);
            }
        });
    }
    //
    logger.info("--> bumping replicas to 1");
    client().admin().indices().prepareUpdateSettings("test").setSettings(Settings.builder().put(IndexMetaData.SETTING_NUMBER_OF_REPLICAS, 1).put(// now allow allocation on all nodes
    "index.routing.allocation.include._name", primariesNode.getNode().getName() + "," + unluckyNode.getNode().getName())).get();
    latch.await();
    // at this point we got some truncated left overs on the replica on the unlucky node
    // now we are allowing the recovery to allocate again and finish to see if we wipe the truncated files
    truncate.compareAndSet(true, false);
    ensureGreen("test");
    for (int i = 0; i < numDocs; i++) {
        String id = Integer.toString(i);
        assertHitCount(client().prepareSearch().setQuery(QueryBuilders.termQuery("the_id", id)).get(), 1);
    }
}
Also used : TransportRequest(org.elasticsearch.transport.TransportRequest) MockTransportService(org.elasticsearch.test.transport.MockTransportService) RecoveryFileChunkRequest(org.elasticsearch.indices.recovery.RecoveryFileChunkRequest) ByteSizeValue(org.elasticsearch.common.unit.ByteSizeValue) ArrayList(java.util.ArrayList) IOException(java.io.IOException) CountDownLatch(java.util.concurrent.CountDownLatch) NodesStatsResponse(org.elasticsearch.action.admin.cluster.node.stats.NodesStatsResponse) IndexRequestBuilder(org.elasticsearch.action.index.IndexRequestBuilder) NodeStats(org.elasticsearch.action.admin.cluster.node.stats.NodeStats) AtomicBoolean(java.util.concurrent.atomic.AtomicBoolean) MockTransportService(org.elasticsearch.test.transport.MockTransportService) TransportService(org.elasticsearch.transport.TransportService) TransportRequestOptions(org.elasticsearch.transport.TransportRequestOptions)

Example 2 with RecoveryFileChunkRequest

use of org.elasticsearch.indices.recovery.RecoveryFileChunkRequest in project elasticsearch by elastic.

the class CorruptedFileIT method testCorruptionOnNetworkLayerFinalizingRecovery.

/**
     * This test triggers a corrupt index exception during finalization size if an empty commit point is transferred
     * during recovery we don't know the version of the segments_N file because it has no segments we can take it from.
     * This simulates recoveries from old indices or even without checksums and makes sure if we fail during finalization
     * we also check if the primary is ok. Without the relevant checks this test fails with a RED cluster
     */
public void testCorruptionOnNetworkLayerFinalizingRecovery() throws ExecutionException, InterruptedException, IOException {
    internalCluster().ensureAtLeastNumDataNodes(2);
    NodesStatsResponse nodeStats = client().admin().cluster().prepareNodesStats().get();
    List<NodeStats> dataNodeStats = new ArrayList<>();
    for (NodeStats stat : nodeStats.getNodes()) {
        if (stat.getNode().isDataNode()) {
            dataNodeStats.add(stat);
        }
    }
    assertThat(dataNodeStats.size(), greaterThanOrEqualTo(2));
    Collections.shuffle(dataNodeStats, random());
    NodeStats primariesNode = dataNodeStats.get(0);
    NodeStats unluckyNode = dataNodeStats.get(1);
    assertAcked(prepareCreate("test").setSettings(Settings.builder().put(IndexMetaData.SETTING_NUMBER_OF_REPLICAS, "0").put(IndexMetaData.SETTING_NUMBER_OF_SHARDS, 1).put("index.routing.allocation.include._name", primariesNode.getNode().getName()).put(EnableAllocationDecider.INDEX_ROUTING_REBALANCE_ENABLE_SETTING.getKey(), EnableAllocationDecider.Rebalance.NONE).put("index.allocation.max_retries", // keep on retrying
    Integer.MAX_VALUE)));
    // allocated with empty commit
    ensureGreen();
    final AtomicBoolean corrupt = new AtomicBoolean(true);
    final CountDownLatch hasCorrupted = new CountDownLatch(1);
    for (NodeStats dataNode : dataNodeStats) {
        MockTransportService mockTransportService = ((MockTransportService) internalCluster().getInstance(TransportService.class, dataNode.getNode().getName()));
        mockTransportService.addDelegate(internalCluster().getInstance(TransportService.class, unluckyNode.getNode().getName()), new MockTransportService.DelegateTransport(mockTransportService.original()) {

            @Override
            protected void sendRequest(Connection connection, long requestId, String action, TransportRequest request, TransportRequestOptions options) throws IOException {
                if (corrupt.get() && action.equals(PeerRecoveryTargetService.Actions.FILE_CHUNK)) {
                    RecoveryFileChunkRequest req = (RecoveryFileChunkRequest) request;
                    byte[] array = BytesRef.deepCopyOf(req.content().toBytesRef()).bytes;
                    int i = randomIntBetween(0, req.content().length() - 1);
                    // flip one byte in the content
                    array[i] = (byte) ~array[i];
                    hasCorrupted.countDown();
                }
                super.sendRequest(connection, requestId, action, request, options);
            }
        });
    }
    Settings build = Settings.builder().put(IndexMetaData.SETTING_NUMBER_OF_REPLICAS, "1").put("index.routing.allocation.include._name", primariesNode.getNode().getName() + "," + unluckyNode.getNode().getName()).build();
    client().admin().indices().prepareUpdateSettings("test").setSettings(build).get();
    client().admin().cluster().prepareReroute().get();
    hasCorrupted.await();
    corrupt.set(false);
    ensureGreen();
}
Also used : TransportRequest(org.elasticsearch.transport.TransportRequest) MockTransportService(org.elasticsearch.test.transport.MockTransportService) RecoveryFileChunkRequest(org.elasticsearch.indices.recovery.RecoveryFileChunkRequest) CopyOnWriteArrayList(java.util.concurrent.CopyOnWriteArrayList) ArrayList(java.util.ArrayList) CollectionUtils.iterableAsArrayList(org.elasticsearch.common.util.CollectionUtils.iterableAsArrayList) IOException(java.io.IOException) CountDownLatch(java.util.concurrent.CountDownLatch) NodesStatsResponse(org.elasticsearch.action.admin.cluster.node.stats.NodesStatsResponse) NodeStats(org.elasticsearch.action.admin.cluster.node.stats.NodeStats) AtomicBoolean(java.util.concurrent.atomic.AtomicBoolean) MockTransportService(org.elasticsearch.test.transport.MockTransportService) TransportService(org.elasticsearch.transport.TransportService) TransportRequestOptions(org.elasticsearch.transport.TransportRequestOptions) Settings(org.elasticsearch.common.settings.Settings) IndexSettings(org.elasticsearch.index.IndexSettings)

Example 3 with RecoveryFileChunkRequest

use of org.elasticsearch.indices.recovery.RecoveryFileChunkRequest in project elasticsearch by elastic.

the class CorruptedFileIT method testCorruptionOnNetworkLayer.

/**
     * Tests corruption that happens on the network layer and that the primary does not get affected by corruption that happens on the way
     * to the replica. The file on disk stays uncorrupted
     */
public void testCorruptionOnNetworkLayer() throws ExecutionException, InterruptedException {
    int numDocs = scaledRandomIntBetween(100, 1000);
    internalCluster().ensureAtLeastNumDataNodes(2);
    if (cluster().numDataNodes() < 3) {
        internalCluster().startNode(Settings.builder().put(Node.NODE_DATA_SETTING.getKey(), true).put(Node.NODE_MASTER_SETTING.getKey(), false));
    }
    NodesStatsResponse nodeStats = client().admin().cluster().prepareNodesStats().get();
    List<NodeStats> dataNodeStats = new ArrayList<>();
    for (NodeStats stat : nodeStats.getNodes()) {
        if (stat.getNode().isDataNode()) {
            dataNodeStats.add(stat);
        }
    }
    assertThat(dataNodeStats.size(), greaterThanOrEqualTo(2));
    Collections.shuffle(dataNodeStats, random());
    NodeStats primariesNode = dataNodeStats.get(0);
    NodeStats unluckyNode = dataNodeStats.get(1);
    assertAcked(prepareCreate("test").setSettings(Settings.builder().put(IndexMetaData.SETTING_NUMBER_OF_REPLICAS, "0").put(IndexMetaData.SETTING_NUMBER_OF_SHARDS, // don't go crazy here it must recovery fast
    between(1, 4)).put(MockFSIndexStore.INDEX_CHECK_INDEX_ON_CLOSE_SETTING.getKey(), false).put("index.routing.allocation.include._name", primariesNode.getNode().getName()).put(EnableAllocationDecider.INDEX_ROUTING_REBALANCE_ENABLE_SETTING.getKey(), EnableAllocationDecider.Rebalance.NONE)));
    ensureGreen();
    IndexRequestBuilder[] builders = new IndexRequestBuilder[numDocs];
    for (int i = 0; i < builders.length; i++) {
        builders[i] = client().prepareIndex("test", "type").setSource("field", "value");
    }
    indexRandom(true, builders);
    ensureGreen();
    assertAllSuccessful(client().admin().indices().prepareFlush().setForce(true).execute().actionGet());
    // we have to flush at least once here since we don't corrupt the translog
    SearchResponse countResponse = client().prepareSearch().setSize(0).get();
    assertHitCount(countResponse, numDocs);
    final boolean truncate = randomBoolean();
    for (NodeStats dataNode : dataNodeStats) {
        MockTransportService mockTransportService = ((MockTransportService) internalCluster().getInstance(TransportService.class, dataNode.getNode().getName()));
        mockTransportService.addDelegate(internalCluster().getInstance(TransportService.class, unluckyNode.getNode().getName()), new MockTransportService.DelegateTransport(mockTransportService.original()) {

            @Override
            protected void sendRequest(Connection connection, long requestId, String action, TransportRequest request, TransportRequestOptions options) throws IOException {
                if (action.equals(PeerRecoveryTargetService.Actions.FILE_CHUNK)) {
                    RecoveryFileChunkRequest req = (RecoveryFileChunkRequest) request;
                    if (truncate && req.length() > 1) {
                        BytesRef bytesRef = req.content().toBytesRef();
                        BytesArray array = new BytesArray(bytesRef.bytes, bytesRef.offset, (int) req.length() - 1);
                        request = new RecoveryFileChunkRequest(req.recoveryId(), req.shardId(), req.metadata(), req.position(), array, req.lastChunk(), req.totalTranslogOps(), req.sourceThrottleTimeInNanos());
                    } else {
                        assert req.content().toBytesRef().bytes == req.content().toBytesRef().bytes : "no internal reference!!";
                        final byte[] array = req.content().toBytesRef().bytes;
                        int i = randomIntBetween(0, req.content().length() - 1);
                        // flip one byte in the content
                        array[i] = (byte) ~array[i];
                    }
                }
                super.sendRequest(connection, requestId, action, request, options);
            }
        });
    }
    Settings build = Settings.builder().put(IndexMetaData.SETTING_NUMBER_OF_REPLICAS, "1").put("index.routing.allocation.include._name", "*").build();
    client().admin().indices().prepareUpdateSettings("test").setSettings(build).get();
    client().admin().cluster().prepareReroute().get();
    ClusterHealthResponse actionGet = client().admin().cluster().health(Requests.clusterHealthRequest("test").waitForGreenStatus()).actionGet();
    if (actionGet.isTimedOut()) {
        logger.info("ensureGreen timed out, cluster state:\n{}\n{}", client().admin().cluster().prepareState().get().getState(), client().admin().cluster().preparePendingClusterTasks().get());
        assertThat("timed out waiting for green state", actionGet.isTimedOut(), equalTo(false));
    }
    // we are green so primaries got not corrupted.
    // ensure that no shard is actually allocated on the unlucky node
    ClusterStateResponse clusterStateResponse = client().admin().cluster().prepareState().get();
    for (IndexShardRoutingTable table : clusterStateResponse.getState().getRoutingTable().index("test")) {
        for (ShardRouting routing : table) {
            if (unluckyNode.getNode().getId().equals(routing.currentNodeId())) {
                assertThat(routing.state(), not(equalTo(ShardRoutingState.STARTED)));
                assertThat(routing.state(), not(equalTo(ShardRoutingState.RELOCATING)));
            }
        }
    }
    final int numIterations = scaledRandomIntBetween(5, 20);
    for (int i = 0; i < numIterations; i++) {
        SearchResponse response = client().prepareSearch().setSize(numDocs).get();
        assertHitCount(response, numDocs);
    }
}
Also used : IndexShardRoutingTable(org.elasticsearch.cluster.routing.IndexShardRoutingTable) MockTransportService(org.elasticsearch.test.transport.MockTransportService) ClusterHealthResponse(org.elasticsearch.action.admin.cluster.health.ClusterHealthResponse) RecoveryFileChunkRequest(org.elasticsearch.indices.recovery.RecoveryFileChunkRequest) CopyOnWriteArrayList(java.util.concurrent.CopyOnWriteArrayList) ArrayList(java.util.ArrayList) CollectionUtils.iterableAsArrayList(org.elasticsearch.common.util.CollectionUtils.iterableAsArrayList) NodesStatsResponse(org.elasticsearch.action.admin.cluster.node.stats.NodesStatsResponse) NodeStats(org.elasticsearch.action.admin.cluster.node.stats.NodeStats) TransportRequestOptions(org.elasticsearch.transport.TransportRequestOptions) BytesRef(org.apache.lucene.util.BytesRef) Settings(org.elasticsearch.common.settings.Settings) IndexSettings(org.elasticsearch.index.IndexSettings) BytesArray(org.elasticsearch.common.bytes.BytesArray) TransportRequest(org.elasticsearch.transport.TransportRequest) ClusterStateResponse(org.elasticsearch.action.admin.cluster.state.ClusterStateResponse) IOException(java.io.IOException) SearchResponse(org.elasticsearch.action.search.SearchResponse) IndexRequestBuilder(org.elasticsearch.action.index.IndexRequestBuilder) MockTransportService(org.elasticsearch.test.transport.MockTransportService) TransportService(org.elasticsearch.transport.TransportService) ShardRouting(org.elasticsearch.cluster.routing.ShardRouting)

Aggregations

IOException (java.io.IOException)3 ArrayList (java.util.ArrayList)3 NodeStats (org.elasticsearch.action.admin.cluster.node.stats.NodeStats)3 NodesStatsResponse (org.elasticsearch.action.admin.cluster.node.stats.NodesStatsResponse)3 RecoveryFileChunkRequest (org.elasticsearch.indices.recovery.RecoveryFileChunkRequest)3 MockTransportService (org.elasticsearch.test.transport.MockTransportService)3 TransportRequest (org.elasticsearch.transport.TransportRequest)3 TransportRequestOptions (org.elasticsearch.transport.TransportRequestOptions)3 TransportService (org.elasticsearch.transport.TransportService)3 CopyOnWriteArrayList (java.util.concurrent.CopyOnWriteArrayList)2 CountDownLatch (java.util.concurrent.CountDownLatch)2 AtomicBoolean (java.util.concurrent.atomic.AtomicBoolean)2 IndexRequestBuilder (org.elasticsearch.action.index.IndexRequestBuilder)2 Settings (org.elasticsearch.common.settings.Settings)2 CollectionUtils.iterableAsArrayList (org.elasticsearch.common.util.CollectionUtils.iterableAsArrayList)2 IndexSettings (org.elasticsearch.index.IndexSettings)2 BytesRef (org.apache.lucene.util.BytesRef)1 ClusterHealthResponse (org.elasticsearch.action.admin.cluster.health.ClusterHealthResponse)1 ClusterStateResponse (org.elasticsearch.action.admin.cluster.state.ClusterStateResponse)1 SearchResponse (org.elasticsearch.action.search.SearchResponse)1