Use of io.crate.exceptions.TaskMissing in project crate by crate.
From the class TransportDistributedResultAction, method retryOrFailureResponse:
private CompletableFuture<DistributedResultResponse> retryOrFailureResponse(DistributedResultRequest request,
                                                                            @Nullable Iterator<TimeValue> retryDelay) {
    if (retryDelay == null) {
        retryDelay = backoffPolicy.iterator();
    }
    if (retryDelay.hasNext()) {
        TimeValue delay = retryDelay.next();
        if (LOGGER.isTraceEnabled()) {
            LOGGER.trace("scheduling retry to start node operation for jobId: {} in {}ms",
                         request.jobId(), delay.getMillis());
        }
        NodeOperationRunnable operationRunnable = new NodeOperationRunnable(request, retryDelay);
        scheduler.schedule(operationRunnable::run, delay.getMillis(), TimeUnit.MILLISECONDS);
        return operationRunnable;
    } else {
        if (LOGGER.isTraceEnabled()) {
            LOGGER.trace("Received a result for job={} but couldn't find a RootTask for it", request.jobId());
        }
        List<String> excludedNodeIds = Collections.singletonList(clusterService.localNode().getId());

        /* The upstream (DistributingConsumer) forwards failures to other downstreams and eventually considers its job done.
         * But it cannot inform the handler-merge about a failure because the JobResponse is sent eagerly.
         *
         * The handler local-merge would get stuck if not all its upstreams send their requests, so we need to invoke
         * a kill to make sure that doesn't happen.
         */
        KillJobsRequest killRequest = new KillJobsRequest(
            List.of(request.jobId()),
            User.CRATE_USER.name(),
            "Received data for job=" + request.jobId() + " but there is no job context present. " +
            "This can happen due to bad network latency or if individual nodes are unresponsive due to high load");
        killJobsAction.broadcast(killRequest, new ActionListener<>() {

            @Override
            public void onResponse(Long numKilled) {
            }

            @Override
            public void onFailure(Exception e) {
                LOGGER.debug("Could not kill " + request.jobId(), e);
            }
        }, excludedNodeIds);
        return CompletableFuture.failedFuture(new TaskMissing(TaskMissing.Type.ROOT, request.jobId()));
    }
}
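
For comparison, the retry-then-fail pattern above can be reduced to a small self-contained sketch. Everything below is illustrative and assumed (the RetryDemo class, the delay values, and the IllegalStateException standing in for TaskMissing are not Crate code): attempts are rescheduled while the backoff iterator still has delays left, and once it is exhausted the future is completed exceptionally, mirroring the fall-through to CompletableFuture.failedFuture(new TaskMissing(...)).

import java.util.Iterator;
import java.util.List;
import java.util.concurrent.CompletableFuture;
import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;

public class RetryDemo {

    private static final ScheduledExecutorService SCHEDULER = Executors.newSingleThreadScheduledExecutor();

    // Retry while the backoff iterator has delays left, otherwise fail the future eagerly.
    static CompletableFuture<String> retryOrFail(Iterator<Long> retryDelaysMillis) {
        if (retryDelaysMillis.hasNext()) {
            long delay = retryDelaysMillis.next();
            CompletableFuture<String> future = new CompletableFuture<>();
            // A real implementation would re-run the node operation here instead of completing with a dummy value.
            SCHEDULER.schedule(() -> { future.complete("retried after " + delay + "ms"); }, delay, TimeUnit.MILLISECONDS);
            return future;
        }
        // No retries left: comparable to CompletableFuture.failedFuture(new TaskMissing(...)) above.
        return CompletableFuture.failedFuture(new IllegalStateException("no task for job"));
    }

    public static void main(String[] args) throws Exception {
        System.out.println(retryOrFail(List.of(50L, 100L, 200L).iterator()).get());
        SCHEDULER.shutdown();
    }
}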