use of org.opensearch.transport.ReceiveTimeoutTransportException in project OpenSearch by opensearch-project.
the class AsyncShardFetch method processAsyncFetch.
/**
* Called by the response handler of the async action to fetch data. Verifies that its still working
* on the same cache generation, otherwise the results are discarded. It then goes and fills the relevant data for
* the shard (response + failures), issuing a reroute at the end of it to make sure there will be another round
* of allocations taking this new data into account.
*/
protected synchronized void processAsyncFetch(List<T> responses, List<FailedNodeException> failures, long fetchingRound) {
if (closed) {
// we are closed, no need to process this async fetch at all
logger.trace("{} ignoring fetched [{}] results, already closed", shardId, type);
return;
}
logger.trace("{} processing fetched [{}] results", shardId, type);
if (responses != null) {
for (T response : responses) {
NodeEntry<T> nodeEntry = cache.get(response.getNode().getId());
if (nodeEntry != null) {
if (nodeEntry.getFetchingRound() != fetchingRound) {
assert nodeEntry.getFetchingRound() > fetchingRound : "node entries only replaced by newer rounds";
logger.trace("{} received response for [{}] from node {} for an older fetching round (expected: {} but was: {})", shardId, nodeEntry.getNodeId(), type, nodeEntry.getFetchingRound(), fetchingRound);
} else if (nodeEntry.isFailed()) {
logger.trace("{} node {} has failed for [{}] (failure [{}])", shardId, nodeEntry.getNodeId(), type, nodeEntry.getFailure());
} else {
// if the entry is there, for the right fetching round and not marked as failed already, process it
logger.trace("{} marking {} as done for [{}], result is [{}]", shardId, nodeEntry.getNodeId(), type, response);
nodeEntry.doneFetching(response);
}
}
}
}
if (failures != null) {
for (FailedNodeException failure : failures) {
logger.trace("{} processing failure {} for [{}]", shardId, failure, type);
NodeEntry<T> nodeEntry = cache.get(failure.nodeId());
if (nodeEntry != null) {
if (nodeEntry.getFetchingRound() != fetchingRound) {
assert nodeEntry.getFetchingRound() > fetchingRound : "node entries only replaced by newer rounds";
logger.trace("{} received failure for [{}] from node {} for an older fetching round (expected: {} but was: {})", shardId, nodeEntry.getNodeId(), type, nodeEntry.getFetchingRound(), fetchingRound);
} else if (nodeEntry.isFailed() == false) {
// if the entry is there, for the right fetching round and not marked as failed already, process it
Throwable unwrappedCause = ExceptionsHelper.unwrapCause(failure.getCause());
// if the request got rejected or timed out, we need to try it again next time...
if (unwrappedCause instanceof OpenSearchRejectedExecutionException || unwrappedCause instanceof ReceiveTimeoutTransportException || unwrappedCause instanceof OpenSearchTimeoutException) {
nodeEntry.restartFetching();
} else {
logger.warn(() -> new ParameterizedMessage("{}: failed to list shard for {} on node [{}]", shardId, type, failure.nodeId()), failure);
nodeEntry.doneFetching(failure.getCause());
}
}
}
}
}
reroute(shardId, "post_response");
}
use of org.opensearch.transport.ReceiveTimeoutTransportException in project OpenSearch by opensearch-project.
the class InternalClusterInfoService method refresh.
/**
* Refreshes the ClusterInfo in a blocking fashion
*/
public final ClusterInfo refresh() {
logger.trace("refreshing cluster info");
final CountDownLatch nodeLatch = updateNodeStats(new ActionListener<NodesStatsResponse>() {
@Override
public void onResponse(NodesStatsResponse nodesStatsResponse) {
ImmutableOpenMap.Builder<String, DiskUsage> leastAvailableUsagesBuilder = ImmutableOpenMap.builder();
ImmutableOpenMap.Builder<String, DiskUsage> mostAvailableUsagesBuilder = ImmutableOpenMap.builder();
fillDiskUsagePerNode(logger, adjustNodesStats(nodesStatsResponse.getNodes()), leastAvailableUsagesBuilder, mostAvailableUsagesBuilder);
leastAvailableSpaceUsages = leastAvailableUsagesBuilder.build();
mostAvailableSpaceUsages = mostAvailableUsagesBuilder.build();
}
@Override
public void onFailure(Exception e) {
if (e instanceof ReceiveTimeoutTransportException) {
logger.error("NodeStatsAction timed out for ClusterInfoUpdateJob", e);
} else {
if (e instanceof ClusterBlockException) {
if (logger.isTraceEnabled()) {
logger.trace("Failed to execute NodeStatsAction for ClusterInfoUpdateJob", e);
}
} else {
logger.warn("Failed to execute NodeStatsAction for ClusterInfoUpdateJob", e);
}
// we empty the usages list, to be safe - we don't know what's going on.
leastAvailableSpaceUsages = ImmutableOpenMap.of();
mostAvailableSpaceUsages = ImmutableOpenMap.of();
}
}
});
final CountDownLatch indicesLatch = updateIndicesStats(new ActionListener<IndicesStatsResponse>() {
@Override
public void onResponse(IndicesStatsResponse indicesStatsResponse) {
final ShardStats[] stats = indicesStatsResponse.getShards();
final ImmutableOpenMap.Builder<String, Long> shardSizeByIdentifierBuilder = ImmutableOpenMap.builder();
final ImmutableOpenMap.Builder<ShardRouting, String> dataPathByShardRoutingBuilder = ImmutableOpenMap.builder();
final Map<ClusterInfo.NodeAndPath, ClusterInfo.ReservedSpace.Builder> reservedSpaceBuilders = new HashMap<>();
buildShardLevelInfo(logger, stats, shardSizeByIdentifierBuilder, dataPathByShardRoutingBuilder, reservedSpaceBuilders);
final ImmutableOpenMap.Builder<ClusterInfo.NodeAndPath, ClusterInfo.ReservedSpace> rsrvdSpace = ImmutableOpenMap.builder();
reservedSpaceBuilders.forEach((nodeAndPath, builder) -> rsrvdSpace.put(nodeAndPath, builder.build()));
indicesStatsSummary = new IndicesStatsSummary(shardSizeByIdentifierBuilder.build(), dataPathByShardRoutingBuilder.build(), rsrvdSpace.build());
}
@Override
public void onFailure(Exception e) {
if (e instanceof ReceiveTimeoutTransportException) {
logger.error("IndicesStatsAction timed out for ClusterInfoUpdateJob", e);
} else {
if (e instanceof ClusterBlockException) {
if (logger.isTraceEnabled()) {
logger.trace("Failed to execute IndicesStatsAction for ClusterInfoUpdateJob", e);
}
} else {
logger.warn("Failed to execute IndicesStatsAction for ClusterInfoUpdateJob", e);
}
// we empty the usages list, to be safe - we don't know what's going on.
indicesStatsSummary = IndicesStatsSummary.EMPTY;
}
}
});
try {
if (nodeLatch.await(fetchTimeout.getMillis(), TimeUnit.MILLISECONDS) == false) {
logger.warn("Failed to update node information for ClusterInfoUpdateJob within {} timeout", fetchTimeout);
}
} catch (InterruptedException e) {
// restore interrupt status
Thread.currentThread().interrupt();
}
try {
if (indicesLatch.await(fetchTimeout.getMillis(), TimeUnit.MILLISECONDS) == false) {
logger.warn("Failed to update shard information for ClusterInfoUpdateJob within {} timeout", fetchTimeout);
}
} catch (InterruptedException e) {
// restore interrupt status
Thread.currentThread().interrupt();
}
ClusterInfo clusterInfo = getClusterInfo();
boolean anyListeners = false;
for (final Consumer<ClusterInfo> listener : listeners) {
anyListeners = true;
try {
logger.trace("notifying [{}] of new cluster info", listener);
listener.accept(clusterInfo);
} catch (Exception e) {
logger.info(new ParameterizedMessage("failed to notify [{}] of new cluster info", listener), e);
}
}
assert anyListeners : "expected to notify at least one listener";
return clusterInfo;
}
use of org.opensearch.transport.ReceiveTimeoutTransportException in project OpenSearch by opensearch-project.
the class ClientTimeoutIT method simulateTimeoutAtTransport.
private void simulateTimeoutAtTransport(String dataNode, String anotherDataNode, String transportActionName) {
MockTransportService mockTransportService = ((MockTransportService) internalCluster().getInstance(TransportService.class, dataNode));
StubbableTransport.SendRequestBehavior sendBehaviour = (connection, requestId, action, request, options) -> {
if (action.startsWith(transportActionName)) {
throw new ReceiveTimeoutTransportException(connection.getNode(), action, "simulate timeout");
}
connection.sendRequest(requestId, action, request, options);
};
mockTransportService.addSendBehavior(internalCluster().getInstance(TransportService.class, anotherDataNode), sendBehaviour);
MockTransportService mockTransportServiceAnotherNode = ((MockTransportService) internalCluster().getInstance(TransportService.class, anotherDataNode));
mockTransportServiceAnotherNode.addSendBehavior(internalCluster().getInstance(TransportService.class, dataNode), sendBehaviour);
}
use of org.opensearch.transport.ReceiveTimeoutTransportException in project OpenSearch by opensearch-project.
the class TransportBroadcastByNodeActionTests method testResultWithTimeouts.
public void testResultWithTimeouts() throws ExecutionException, InterruptedException {
Request request = new Request(new String[] { TEST_INDEX });
PlainActionFuture<Response> listener = new PlainActionFuture<>();
action.new AsyncAction(null, request, listener).start();
Map<String, List<CapturingTransport.CapturedRequest>> capturedRequests = transport.getCapturedRequestsByTargetNodeAndClear();
ShardsIterator shardIt = clusterService.state().getRoutingTable().allShards(new String[] { TEST_INDEX });
Map<String, List<ShardRouting>> map = new HashMap<>();
for (ShardRouting shard : shardIt) {
if (!map.containsKey(shard.currentNodeId())) {
map.put(shard.currentNodeId(), new ArrayList<>());
}
map.get(shard.currentNodeId()).add(shard);
}
int totalShards = 0;
int totalSuccessfulShards = 0;
int totalFailedShards = 0;
String failedNodeId = "node_" + randomIntBetween(0, capturedRequests.size());
for (Map.Entry<String, List<CapturingTransport.CapturedRequest>> entry : capturedRequests.entrySet()) {
List<BroadcastShardOperationFailedException> exceptions = new ArrayList<>();
long requestId = entry.getValue().get(0).requestId;
if (entry.getKey().equals(failedNodeId)) {
// simulate node timeout
totalShards += map.get(entry.getKey()).size();
totalFailedShards += map.get(entry.getKey()).size();
transport.handleError(requestId, new ReceiveTimeoutTransportException(clusterService.state().getRoutingNodes().node(entry.getKey()).node(), "indices:admin/test", "time_out_simulated"));
} else {
List<ShardRouting> shards = map.get(entry.getKey());
List<TransportBroadcastByNodeAction.EmptyResult> shardResults = new ArrayList<>();
for (ShardRouting shard : shards) {
totalShards++;
if (rarely()) {
// simulate operation failure
totalFailedShards++;
exceptions.add(new BroadcastShardOperationFailedException(shard.shardId(), "operation indices:admin/test failed"));
} else {
shardResults.add(TransportBroadcastByNodeAction.EmptyResult.INSTANCE);
}
}
totalSuccessfulShards += shardResults.size();
TransportBroadcastByNodeAction.NodeResponse nodeResponse = action.new NodeResponse(entry.getKey(), shards.size(), shardResults, exceptions);
transport.handleResponse(requestId, nodeResponse);
}
}
Response response = listener.get();
assertEquals("total shards", totalShards, response.getTotalShards());
assertEquals("successful shards", totalSuccessfulShards, response.getSuccessfulShards());
assertEquals("failed shards", totalFailedShards, response.getFailedShards());
assertEquals("accumulated exceptions", totalFailedShards, response.getShardFailures().length);
}
Aggregations