Search in sources :

Example 1 with ReceiveTimeoutTransportException

use of org.opensearch.transport.ReceiveTimeoutTransportException in project OpenSearch by opensearch-project.

the class AsyncShardFetch method processAsyncFetch.

/**
 * Called by the response handler of the async action to fetch data. Verifies that it's still working
 * on the same fetching round, otherwise the results are discarded. It then goes and fills the relevant data for
 * the shard (responses + failures), issuing a reroute at the end of it to make sure there will be another round
 * of allocations taking this new data into account.
 * <p>
 * Responses and failures whose node has no entry in the cache are skipped. If this fetch is already
 * closed, nothing is processed and no reroute is issued.
 *
 * @param responses     the per-node responses of this fetch, may be {@code null}
 * @param failures      the per-node failures of this fetch, may be {@code null}
 * @param fetchingRound the round these results belong to; results are only applied to cache entries
 *                      that are still on this same round
 */
protected synchronized void processAsyncFetch(List<T> responses, List<FailedNodeException> failures, long fetchingRound) {
    if (closed) {
        // we are closed, no need to process this async fetch at all
        logger.trace("{} ignoring fetched [{}] results, already closed", shardId, type);
        return;
    }
    logger.trace("{} processing fetched [{}] results", shardId, type);
    if (responses != null) {
        for (T response : responses) {
            NodeEntry<T> nodeEntry = cache.get(response.getNode().getId());
            if (nodeEntry != null) {
                if (nodeEntry.getFetchingRound() != fetchingRound) {
                    assert nodeEntry.getFetchingRound() > fetchingRound : "node entries only replaced by newer rounds";
                    // argument order follows the placeholders: "[{}]" is the fetch type, "node {}" is the node id
                    logger.trace("{} received response for [{}] from node {} for an older fetching round (expected: {} but was: {})", shardId, type, nodeEntry.getNodeId(), nodeEntry.getFetchingRound(), fetchingRound);
                } else if (nodeEntry.isFailed()) {
                    logger.trace("{} node {} has failed for [{}] (failure [{}])", shardId, nodeEntry.getNodeId(), type, nodeEntry.getFailure());
                } else {
                    // if the entry is there, for the right fetching round and not marked as failed already, process it
                    logger.trace("{} marking {} as done for [{}], result is [{}]", shardId, nodeEntry.getNodeId(), type, response);
                    nodeEntry.doneFetching(response);
                }
            }
        }
    }
    if (failures != null) {
        for (FailedNodeException failure : failures) {
            logger.trace("{} processing failure {} for [{}]", shardId, failure, type);
            NodeEntry<T> nodeEntry = cache.get(failure.nodeId());
            if (nodeEntry != null) {
                if (nodeEntry.getFetchingRound() != fetchingRound) {
                    assert nodeEntry.getFetchingRound() > fetchingRound : "node entries only replaced by newer rounds";
                    // argument order follows the placeholders: "[{}]" is the fetch type, "node {}" is the node id
                    logger.trace("{} received failure for [{}] from node {} for an older fetching round (expected: {} but was: {})", shardId, type, nodeEntry.getNodeId(), nodeEntry.getFetchingRound(), fetchingRound);
                } else if (nodeEntry.isFailed() == false) {
                    // if the entry is there, for the right fetching round and not marked as failed already, process it
                    Throwable unwrappedCause = ExceptionsHelper.unwrapCause(failure.getCause());
                    // if the request got rejected or timed out, we need to try it again next time...
                    if (unwrappedCause instanceof OpenSearchRejectedExecutionException || unwrappedCause instanceof ReceiveTimeoutTransportException || unwrappedCause instanceof OpenSearchTimeoutException) {
                        nodeEntry.restartFetching();
                    } else {
                        // any other failure is recorded on the entry so the node is marked as failed
                        logger.warn(() -> new ParameterizedMessage("{}: failed to list shard for {} on node [{}]", shardId, type, failure.nodeId()), failure);
                        nodeEntry.doneFetching(failure.getCause());
                    }
                }
            }
        }
    }
    // always trigger a reroute once the results have been applied
    reroute(shardId, "post_response");
}
Also used : ReceiveTimeoutTransportException(org.opensearch.transport.ReceiveTimeoutTransportException) OpenSearchTimeoutException(org.opensearch.OpenSearchTimeoutException) OpenSearchRejectedExecutionException(org.opensearch.common.util.concurrent.OpenSearchRejectedExecutionException) FailedNodeException(org.opensearch.action.FailedNodeException) ParameterizedMessage(org.apache.logging.log4j.message.ParameterizedMessage)

Example 2 with ReceiveTimeoutTransportException

use of org.opensearch.transport.ReceiveTimeoutTransportException in project OpenSearch by opensearch-project.

the class InternalClusterInfoService method refresh.

/**
 * Refreshes the ClusterInfo in a blocking fashion: kicks off the node-stats and indices-stats
 * updates, waits up to {@code fetchTimeout} for each of them, then hands the resulting
 * ClusterInfo to every registered listener.
 *
 * @return the ClusterInfo that was passed to the listeners
 */
public final ClusterInfo refresh() {
    logger.trace("refreshing cluster info");
    final CountDownLatch nodeLatch = updateNodeStats(new ActionListener<NodesStatsResponse>() {

        @Override
        public void onResponse(NodesStatsResponse nodesStatsResponse) {
            final ImmutableOpenMap.Builder<String, DiskUsage> leastAvailable = ImmutableOpenMap.builder();
            final ImmutableOpenMap.Builder<String, DiskUsage> mostAvailable = ImmutableOpenMap.builder();
            fillDiskUsagePerNode(logger, adjustNodesStats(nodesStatsResponse.getNodes()), leastAvailable, mostAvailable);
            leastAvailableSpaceUsages = leastAvailable.build();
            mostAvailableSpaceUsages = mostAvailable.build();
        }

        @Override
        public void onFailure(Exception e) {
            if (e instanceof ReceiveTimeoutTransportException) {
                // on a timeout the previously collected usages are left untouched
                logger.error("NodeStatsAction timed out for ClusterInfoUpdateJob", e);
                return;
            }
            if ((e instanceof ClusterBlockException) == false) {
                logger.warn("Failed to execute NodeStatsAction for ClusterInfoUpdateJob", e);
            } else if (logger.isTraceEnabled()) {
                logger.trace("Failed to execute NodeStatsAction for ClusterInfoUpdateJob", e);
            }
            // we empty the usages list, to be safe - we don't know what's going on.
            leastAvailableSpaceUsages = ImmutableOpenMap.of();
            mostAvailableSpaceUsages = ImmutableOpenMap.of();
        }
    });
    final CountDownLatch indicesLatch = updateIndicesStats(new ActionListener<IndicesStatsResponse>() {

        @Override
        public void onResponse(IndicesStatsResponse indicesStatsResponse) {
            final ShardStats[] shardStats = indicesStatsResponse.getShards();
            final ImmutableOpenMap.Builder<String, Long> shardSizes = ImmutableOpenMap.builder();
            final ImmutableOpenMap.Builder<ShardRouting, String> dataPaths = ImmutableOpenMap.builder();
            final Map<ClusterInfo.NodeAndPath, ClusterInfo.ReservedSpace.Builder> reservedBuilders = new HashMap<>();
            buildShardLevelInfo(logger, shardStats, shardSizes, dataPaths, reservedBuilders);
            // turn every per-path builder into its finished ReservedSpace
            final ImmutableOpenMap.Builder<ClusterInfo.NodeAndPath, ClusterInfo.ReservedSpace> reservedSpace = ImmutableOpenMap.builder();
            for (Map.Entry<ClusterInfo.NodeAndPath, ClusterInfo.ReservedSpace.Builder> reserved : reservedBuilders.entrySet()) {
                reservedSpace.put(reserved.getKey(), reserved.getValue().build());
            }
            indicesStatsSummary = new IndicesStatsSummary(shardSizes.build(), dataPaths.build(), reservedSpace.build());
        }

        @Override
        public void onFailure(Exception e) {
            if (e instanceof ReceiveTimeoutTransportException) {
                // on a timeout the previously collected summary is left untouched
                logger.error("IndicesStatsAction timed out for ClusterInfoUpdateJob", e);
                return;
            }
            if ((e instanceof ClusterBlockException) == false) {
                logger.warn("Failed to execute IndicesStatsAction for ClusterInfoUpdateJob", e);
            } else if (logger.isTraceEnabled()) {
                logger.trace("Failed to execute IndicesStatsAction for ClusterInfoUpdateJob", e);
            }
            // we empty the usages list, to be safe - we don't know what's going on.
            indicesStatsSummary = IndicesStatsSummary.EMPTY;
        }
    });
    // wait for the node-level stats, but never longer than the fetch timeout
    try {
        final boolean nodeStatsArrived = nodeLatch.await(fetchTimeout.getMillis(), TimeUnit.MILLISECONDS);
        if (nodeStatsArrived == false) {
            logger.warn("Failed to update node information for ClusterInfoUpdateJob within {} timeout", fetchTimeout);
        }
    } catch (InterruptedException e) {
        // restore interrupt status
        Thread.currentThread().interrupt();
    }
    // wait for the shard-level stats, again bounded by the fetch timeout
    try {
        final boolean indicesStatsArrived = indicesLatch.await(fetchTimeout.getMillis(), TimeUnit.MILLISECONDS);
        if (indicesStatsArrived == false) {
            logger.warn("Failed to update shard information for ClusterInfoUpdateJob within {} timeout", fetchTimeout);
        }
    } catch (InterruptedException e) {
        // restore interrupt status
        Thread.currentThread().interrupt();
    }
    final ClusterInfo currentInfo = getClusterInfo();
    boolean notifiedAny = false;
    for (final Consumer<ClusterInfo> subscriber : listeners) {
        notifiedAny = true;
        try {
            logger.trace("notifying [{}] of new cluster info", subscriber);
            subscriber.accept(currentInfo);
        } catch (Exception e) {
            // a failing listener must not prevent the remaining listeners from being notified
            logger.info(new ParameterizedMessage("failed to notify [{}] of new cluster info", subscriber), e);
        }
    }
    assert notifiedAny : "expected to notify at least one listener";
    return currentInfo;
}
Also used : OpenSearchRejectedExecutionException(org.opensearch.common.util.concurrent.OpenSearchRejectedExecutionException) ImmutableOpenMap(org.opensearch.common.collect.ImmutableOpenMap) AbstractRunnable(org.opensearch.common.util.concurrent.AbstractRunnable) ThreadPool(org.opensearch.threadpool.ThreadPool) Level(org.apache.logging.log4j.Level) HashMap(java.util.HashMap) IndicesOptions(org.opensearch.action.support.IndicesOptions) DiskThresholdSettings(org.opensearch.cluster.routing.allocation.DiskThresholdSettings) ParameterizedMessage(org.apache.logging.log4j.message.ParameterizedMessage) AtomicReference(java.util.concurrent.atomic.AtomicReference) DiscoveryNode(org.opensearch.cluster.node.DiscoveryNode) LatchedActionListener(org.opensearch.action.LatchedActionListener) Property(org.opensearch.common.settings.Setting.Property) Map(java.util.Map) ActionListener(org.opensearch.action.ActionListener) ClusterSettings(org.opensearch.common.settings.ClusterSettings) StoreStats(org.opensearch.index.store.StoreStats) Client(org.opensearch.client.Client) Setting(org.opensearch.common.settings.Setting) TimeValue(org.opensearch.common.unit.TimeValue) ClusterBlockException(org.opensearch.cluster.block.ClusterBlockException) Settings(org.opensearch.common.settings.Settings) IndicesStatsRequest(org.opensearch.action.admin.indices.stats.IndicesStatsRequest) ShardRouting(org.opensearch.cluster.routing.ShardRouting) TimeUnit(java.util.concurrent.TimeUnit) Consumer(java.util.function.Consumer) CountDownLatch(java.util.concurrent.CountDownLatch) List(java.util.List) Logger(org.apache.logging.log4j.Logger) NodesStatsResponse(org.opensearch.action.admin.cluster.node.stats.NodesStatsResponse) ClusterService(org.opensearch.cluster.service.ClusterService) NodeStats(org.opensearch.action.admin.cluster.node.stats.NodeStats) NodesStatsRequest(org.opensearch.action.admin.cluster.node.stats.NodesStatsRequest) 
IndicesStatsResponse(org.opensearch.action.admin.indices.stats.IndicesStatsResponse) ShardStats(org.opensearch.action.admin.indices.stats.ShardStats) LogManager(org.apache.logging.log4j.LogManager) ReceiveTimeoutTransportException(org.opensearch.transport.ReceiveTimeoutTransportException) FsInfo(org.opensearch.monitor.fs.FsInfo) CopyOnWriteArrayList(java.util.concurrent.CopyOnWriteArrayList) IndicesStatsResponse(org.opensearch.action.admin.indices.stats.IndicesStatsResponse) CountDownLatch(java.util.concurrent.CountDownLatch) ClusterBlockException(org.opensearch.cluster.block.ClusterBlockException) OpenSearchRejectedExecutionException(org.opensearch.common.util.concurrent.OpenSearchRejectedExecutionException) ClusterBlockException(org.opensearch.cluster.block.ClusterBlockException) ReceiveTimeoutTransportException(org.opensearch.transport.ReceiveTimeoutTransportException) NodesStatsResponse(org.opensearch.action.admin.cluster.node.stats.NodesStatsResponse) ReceiveTimeoutTransportException(org.opensearch.transport.ReceiveTimeoutTransportException) ParameterizedMessage(org.apache.logging.log4j.message.ParameterizedMessage) ImmutableOpenMap(org.opensearch.common.collect.ImmutableOpenMap) HashMap(java.util.HashMap) Map(java.util.Map)

Example 3 with ReceiveTimeoutTransportException

use of org.opensearch.transport.ReceiveTimeoutTransportException in project OpenSearch by opensearch-project.

the class ClientTimeoutIT method simulateTimeoutAtTransport.

/**
 * Installs a send behavior on the transport services of both given nodes so that any request
 * whose action name starts with {@code transportActionName}, sent from one of the two nodes
 * to the other, fails with a {@link ReceiveTimeoutTransportException}. All other requests
 * are forwarded on the connection unchanged.
 *
 * @param dataNode            name of the first node
 * @param anotherDataNode     name of the second node
 * @param transportActionName prefix of the transport action names that should time out
 */
private void simulateTimeoutAtTransport(String dataNode, String anotherDataNode, String transportActionName) {
    final TransportService firstService = internalCluster().getInstance(TransportService.class, dataNode);
    final MockTransportService firstMock = (MockTransportService) firstService;
    final StubbableTransport.SendRequestBehavior timeoutOnMatch = (connection, requestId, action, request, options) -> {
        if (action.startsWith(transportActionName) == false) {
            // not the action under test: send it as usual
            connection.sendRequest(requestId, action, request, options);
            return;
        }
        throw new ReceiveTimeoutTransportException(connection.getNode(), action, "simulate timeout");
    };
    // requests going from dataNode to anotherDataNode
    final TransportService targetOfFirst = internalCluster().getInstance(TransportService.class, anotherDataNode);
    firstMock.addSendBehavior(targetOfFirst, timeoutOnMatch);
    // requests going from anotherDataNode to dataNode
    final TransportService secondService = internalCluster().getInstance(TransportService.class, anotherDataNode);
    final MockTransportService secondMock = (MockTransportService) secondService;
    final TransportService targetOfSecond = internalCluster().getInstance(TransportService.class, dataNode);
    secondMock.addSendBehavior(targetOfSecond, timeoutOnMatch);
}
Also used : CoreMatchers.is(org.hamcrest.CoreMatchers.is) StubbableTransport(org.opensearch.test.transport.StubbableTransport) MockTransportService(org.opensearch.test.transport.MockTransportService) ArrayList(java.util.ArrayList) ListTasksResponse(org.opensearch.action.admin.cluster.node.tasks.list.ListTasksResponse) NodesInfoAction(org.opensearch.action.admin.cluster.node.info.NodesInfoAction) Matchers.lessThan(org.hamcrest.Matchers.lessThan) RecoveryAction(org.opensearch.action.admin.indices.recovery.RecoveryAction) OpenSearchAssertions.assertAcked(org.opensearch.test.hamcrest.OpenSearchAssertions.assertAcked) TimeValue(org.opensearch.common.unit.TimeValue) Collection(java.util.Collection) NodesStatsAction(org.opensearch.action.admin.cluster.node.stats.NodesStatsAction) Settings(org.opensearch.common.settings.Settings) TransportService(org.opensearch.transport.TransportService) Plugin(org.opensearch.plugins.Plugin) NodesInfoResponse(org.opensearch.action.admin.cluster.node.info.NodesInfoResponse) RecoveryResponse(org.opensearch.action.admin.indices.recovery.RecoveryResponse) NodeInfo(org.opensearch.action.admin.cluster.node.info.NodeInfo) IndicesStatsAction(org.opensearch.action.admin.indices.stats.IndicesStatsAction) Matchers.equalTo(org.hamcrest.Matchers.equalTo) NodesStatsResponse(org.opensearch.action.admin.cluster.node.stats.NodesStatsResponse) NodeStats(org.opensearch.action.admin.cluster.node.stats.NodeStats) IndicesStatsResponse(org.opensearch.action.admin.indices.stats.IndicesStatsResponse) OpenSearchIntegTestCase(org.opensearch.test.OpenSearchIntegTestCase) Collections(java.util.Collections) Matchers.containsString(org.hamcrest.Matchers.containsString) ReceiveTimeoutTransportException(org.opensearch.transport.ReceiveTimeoutTransportException) ReceiveTimeoutTransportException(org.opensearch.transport.ReceiveTimeoutTransportException) MockTransportService(org.opensearch.test.transport.MockTransportService) 
MockTransportService(org.opensearch.test.transport.MockTransportService) TransportService(org.opensearch.transport.TransportService) StubbableTransport(org.opensearch.test.transport.StubbableTransport)

Example 4 with ReceiveTimeoutTransportException

use of org.opensearch.transport.ReceiveTimeoutTransportException in project OpenSearch by opensearch-project.

the class TransportBroadcastByNodeActionTests method testResultWithTimeouts.

/**
 * Starts the broadcast action, answers the captured per-node requests with either a simulated
 * receive timeout (for one randomly chosen node id) or a node response containing a mix of
 * successful and failed shard results, and checks that the final response reports matching
 * total / successful / failed shard counts.
 */
public void testResultWithTimeouts() throws ExecutionException, InterruptedException {
    Request request = new Request(new String[] { TEST_INDEX });
    PlainActionFuture<Response> future = new PlainActionFuture<>();
    action.new AsyncAction(null, request, future).start();
    Map<String, List<CapturingTransport.CapturedRequest>> requestsByNode = transport.getCapturedRequestsByTargetNodeAndClear();
    ShardsIterator allShards = clusterService.state().getRoutingTable().allShards(new String[] { TEST_INDEX });
    // group the shards by the node they are currently on
    Map<String, List<ShardRouting>> shardsByNode = new HashMap<>();
    for (ShardRouting routing : allShards) {
        shardsByNode.computeIfAbsent(routing.currentNodeId(), nodeId -> new ArrayList<>()).add(routing);
    }
    int expectedTotal = 0;
    int expectedSuccessful = 0;
    int expectedFailed = 0;
    // NOTE(review): if randomIntBetween includes its upper bound, the picked id may not match any
    // captured node, in which case no timeout is simulated in that run - confirm this is intended
    String timedOutNodeId = "node_" + randomIntBetween(0, requestsByNode.size());
    for (Map.Entry<String, List<CapturingTransport.CapturedRequest>> captured : requestsByNode.entrySet()) {
        final String nodeId = captured.getKey();
        List<BroadcastShardOperationFailedException> shardFailures = new ArrayList<>();
        long requestId = captured.getValue().get(0).requestId;
        final List<ShardRouting> nodeShards = shardsByNode.get(nodeId);
        if (nodeId.equals(timedOutNodeId)) {
            // simulate node timeout
            final int shardCount = nodeShards.size();
            expectedTotal += shardCount;
            expectedFailed += shardCount;
            transport.handleError(requestId, new ReceiveTimeoutTransportException(clusterService.state().getRoutingNodes().node(nodeId).node(), "indices:admin/test", "time_out_simulated"));
            continue;
        }
        List<TransportBroadcastByNodeAction.EmptyResult> successes = new ArrayList<>();
        expectedTotal += nodeShards.size();
        for (ShardRouting routing : nodeShards) {
            if (rarely()) {
                // simulate operation failure
                shardFailures.add(new BroadcastShardOperationFailedException(routing.shardId(), "operation indices:admin/test failed"));
            } else {
                successes.add(TransportBroadcastByNodeAction.EmptyResult.INSTANCE);
            }
        }
        // both lists are fresh for this node, so their sizes are this node's contribution
        expectedFailed += shardFailures.size();
        expectedSuccessful += successes.size();
        TransportBroadcastByNodeAction.NodeResponse nodeResponse = action.new NodeResponse(nodeId, nodeShards.size(), successes, shardFailures);
        transport.handleResponse(requestId, nodeResponse);
    }
    Response response = future.get();
    assertEquals("total shards", expectedTotal, response.getTotalShards());
    assertEquals("successful shards", expectedSuccessful, response.getSuccessfulShards());
    assertEquals("failed shards", expectedFailed, response.getFailedShards());
    assertEquals("accumulated exceptions", expectedFailed, response.getShardFailures().length);
}
Also used : HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) CoreMatchers.containsString(org.hamcrest.CoreMatchers.containsString) HasToString.hasToString(org.hamcrest.object.HasToString.hasToString) ShardsIterator(org.opensearch.cluster.routing.ShardsIterator) ReceiveTimeoutTransportException(org.opensearch.transport.ReceiveTimeoutTransportException) List(java.util.List) ArrayList(java.util.ArrayList) BroadcastShardOperationFailedException(org.opensearch.action.support.broadcast.BroadcastShardOperationFailedException) CapturingTransport(org.opensearch.test.transport.CapturingTransport) BroadcastRequest(org.opensearch.action.support.broadcast.BroadcastRequest) IndicesRequest(org.opensearch.action.IndicesRequest) TransportResponse(org.opensearch.transport.TransportResponse) BroadcastResponse(org.opensearch.action.support.broadcast.BroadcastResponse) PlainActionFuture(org.opensearch.action.support.PlainActionFuture) ShardRouting(org.opensearch.cluster.routing.ShardRouting) TestShardRouting(org.opensearch.cluster.routing.TestShardRouting) Map(java.util.Map) HashMap(java.util.HashMap) Collections.emptyMap(java.util.Collections.emptyMap)

Aggregations

ReceiveTimeoutTransportException (org.opensearch.transport.ReceiveTimeoutTransportException)4 ArrayList (java.util.ArrayList)2 HashMap (java.util.HashMap)2 List (java.util.List)2 Map (java.util.Map)2 ParameterizedMessage (org.apache.logging.log4j.message.ParameterizedMessage)2 NodeStats (org.opensearch.action.admin.cluster.node.stats.NodeStats)2 NodesStatsResponse (org.opensearch.action.admin.cluster.node.stats.NodesStatsResponse)2 IndicesStatsResponse (org.opensearch.action.admin.indices.stats.IndicesStatsResponse)2 ShardRouting (org.opensearch.cluster.routing.ShardRouting)2 Settings (org.opensearch.common.settings.Settings)2 TimeValue (org.opensearch.common.unit.TimeValue)2 OpenSearchRejectedExecutionException (org.opensearch.common.util.concurrent.OpenSearchRejectedExecutionException)2 Collection (java.util.Collection)1 Collections (java.util.Collections)1 Collections.emptyMap (java.util.Collections.emptyMap)1 CopyOnWriteArrayList (java.util.concurrent.CopyOnWriteArrayList)1 CountDownLatch (java.util.concurrent.CountDownLatch)1 TimeUnit (java.util.concurrent.TimeUnit)1 AtomicReference (java.util.concurrent.atomic.AtomicReference)1