Use of io.trino.execution.TaskState in project trino by trinodb.
The class FaultTolerantStageScheduler, method updateTaskStatus.
/**
 * Processes a status update for a single task of this stage.
 *
 * <p>Updates whose state is not done are ignored. For a done task the method, while holding
 * this object's monitor:
 * <ul>
 * <li>removes the task from {@code runningTasks};</li>
 * <li>replaces {@code taskFinishedFuture} with a fresh future if other tasks are still running,
 * or with {@code null} otherwise;</li>
 * <li>removes and releases the node lease recorded for the task;</li>
 * <li>unless the partition is already finished or the scheduler is closed, reacts to the
 * terminal state (records completion, logs, or schedules a retry / records a failure).</li>
 * </ul>
 * After the monitor is released, a recorded failure is passed to {@code fail} and the previous
 * {@code taskFinishedFuture} (if any and not yet done) is completed.
 *
 * <p>Any {@link Throwable} raised while doing the above is caught and passed to {@code fail}.
 *
 * @param taskStatus the latest status reported for the task
 * @param exchangeSinkInstanceHandle sink instance handle of the task; must be present when the
 *         task is {@code FINISHED} and {@code sinkExchange} is present
 */
private void updateTaskStatus(TaskStatus taskStatus, Optional<ExchangeSinkInstanceHandle> exchangeSinkInstanceHandle) {
    TaskState state = taskStatus.getState();
    if (!state.isDone()) {
        return;
    }
    try {
        RuntimeException failure = null;
        SettableFuture<Void> previousTaskFinishedFuture;
        synchronized (this) {
            TaskId taskId = taskStatus.getTaskId();
            runningTasks.remove(taskId);

            // Capture the current future so it can be completed after the lock is released,
            // and install a new one only if there is still something running.
            previousTaskFinishedFuture = taskFinishedFuture;
            if (!runningTasks.isEmpty()) {
                taskFinishedFuture = SettableFuture.create();
            } else {
                taskFinishedFuture = null;
            }

            NodeAllocator.NodeLease nodeLease = requireNonNull(runningNodes.remove(taskId), () -> "node not found for task id: " + taskId);
            nodeLease.release();

            int partitionId = taskId.getPartitionId();
            if (!finishedPartitions.contains(partitionId) && !closed) {
                switch (state) {
                    case FINISHED:
                        finishedPartitions.add(partitionId);
                        if (sinkExchange.isPresent()) {
                            checkArgument(exchangeSinkInstanceHandle.isPresent(), "exchangeSinkInstanceHandle is expected to be present");
                            sinkExchange.get().sinkFinished(exchangeSinkInstanceHandle.get());
                        }
                        // Abort every remote task recorded for this partition
                        // (presumably other attempts of the same partition -- confirm).
                        partitionToRemoteTaskMap.get(partitionId).forEach(RemoteTask::abort);
                        break;
                    case CANCELED:
                        log.debug("Task cancelled: %s", taskId);
                        break;
                    case ABORTED:
                        log.debug("Task aborted: %s", taskId);
                        break;
                    case FAILED:
                        // orElseGet: build the fallback failure only when the task reported none,
                        // instead of constructing an exception for every failed task.
                        ExecutionFailureInfo failureInfo = taskStatus.getFailures().stream()
                                .findFirst()
                                .map(this::rewriteTransportFailure)
                                .orElseGet(() -> toFailure(new TrinoException(GENERIC_INTERNAL_ERROR, "A task failed for an unknown reason")));
                        log.warn(failureInfo.toException(), "Task failed: %s", taskId);
                        ErrorCode errorCode = failureInfo.getErrorCode();
                        int taskRemainingAttempts = remainingAttemptsPerTask.getOrDefault(partitionId, maxRetryAttemptsPerTask);
                        // Retry only while both the overall and the per-partition budgets allow it,
                        // and never for failures classified as USER_ERROR.
                        if (remainingRetryAttemptsOverall > 0 && taskRemainingAttempts > 0 && (errorCode == null || errorCode.getType() != USER_ERROR)) {
                            remainingRetryAttemptsOverall--;
                            remainingAttemptsPerTask.put(partitionId, taskRemainingAttempts - 1);
                            // update memory limits for next attempt
                            MemoryRequirements memoryLimits = partitionMemoryRequirements.get(partitionId);
                            verify(memoryLimits != null);
                            MemoryRequirements newMemoryLimits = partitionMemoryEstimator.getNextRetryMemoryRequirements(session, memoryLimits, errorCode);
                            partitionMemoryRequirements.put(partitionId, newMemoryLimits);
                            // reschedule
                            queuedPartitions.add(partitionId);
                            log.debug("Retrying partition %s for stage %s", partitionId, stage.getStageId());
                        } else {
                            failure = failureInfo.toException();
                        }
                        break;
                    default:
                        throw new IllegalArgumentException("Unexpected task state: " + state);
                }
            }
        }
        if (failure != null) {
            // must be called outside the lock
            fail(failure);
        }
        if (previousTaskFinishedFuture != null && !previousTaskFinishedFuture.isDone()) {
            previousTaskFinishedFuture.set(null);
        }
    } catch (Throwable t) {
        fail(t);
    }
}
Use of io.trino.execution.TaskState in project trino by trinodb.
The class TaskResource, method getResults.
/**
 * Serves the pages of one output buffer of a task, starting at the given token, as an
 * asynchronous response.
 *
 * <p>The request returns immediately if {@code injectFailure} reports that it handled it.
 * Otherwise the buffer read is bounded by a randomized wait time, after which
 * {@code BufferResult.emptyResults(...)} is substituted. The response carries the task instance
 * id, the current and next token, the buffer-complete flag and a task-failed flag. The
 * task-failed flag reflects the task state sampled <em>before</em> the results were requested.
 * A second, slightly longer timeout on the async response answers with {@code NO_CONTENT}.
 *
 * @param taskId id of the task to read from; must not be null
 * @param bufferId id of the output buffer to read; must not be null
 * @param token token of the first page requested
 * @param maxSize value of the {@code TRINO_MAX_SIZE} header, forwarded to the task manager
 * @param asyncResponse the suspended response to complete
 */
@ResourceSecurity(INTERNAL_ONLY)
@GET
@Path("{taskId}/results/{bufferId}/{token}")
@Produces(TRINO_PAGES)
public void getResults(
        @PathParam("taskId") TaskId taskId,
        @PathParam("bufferId") OutputBufferId bufferId,
        @PathParam("token") long token,
        @HeaderParam(TRINO_MAX_SIZE) DataSize maxSize,
        @Suspended AsyncResponse asyncResponse) {
    requireNonNull(taskId, "taskId is null");
    requireNonNull(bufferId, "bufferId is null");

    boolean failureInjected = injectFailure(taskManager.getTraceToken(taskId), taskId, RequestType.GET_RESULTS, asyncResponse);
    if (failureInjected) {
        return;
    }

    // Sample the task state up front; the flag is reused by both response paths below.
    TaskState currentState = taskManager.getTaskStatus(taskId).getState();
    boolean taskFailed = currentState == TaskState.ABORTED || currentState == TaskState.FAILED;

    long startNanos = System.nanoTime();
    ListenableFuture<BufferResult> pendingResults = taskManager.getTaskResults(taskId, bufferId, token, maxSize);
    Duration waitTime = randomizeWaitTime(DEFAULT_MAX_WAIT_TIME);
    // Soft timeout: fall back to an empty result for the same token once waitTime elapses.
    ListenableFuture<BufferResult> boundedResults = addTimeout(
            pendingResults,
            () -> BufferResult.emptyResults(taskManager.getTaskInstanceId(taskId), token, false),
            waitTime,
            timeoutExecutor);

    ListenableFuture<Response> responseFuture = Futures.transform(boundedResults, bufferResult -> {
        List<Slice> pages = bufferResult.getSerializedPages();
        // No pages -> 204 without a body; otherwise 200 with the pages as the entity.
        Status httpStatus = Status.NO_CONTENT;
        GenericEntity<?> body = null;
        if (!pages.isEmpty()) {
            body = new GenericEntity<>(pages, new TypeToken<List<Slice>>() {
            }.getType());
            httpStatus = Status.OK;
        }
        return Response.status(httpStatus)
                .entity(body)
                .header(TRINO_TASK_INSTANCE_ID, bufferResult.getTaskInstanceId())
                .header(TRINO_PAGE_TOKEN, bufferResult.getToken())
                .header(TRINO_PAGE_NEXT_TOKEN, bufferResult.getNextToken())
                .header(TRINO_BUFFER_COMPLETE, bufferResult.isBufferComplete())
                .header(TRINO_TASK_FAILED, taskFailed)
                .build();
    }, directExecutor());

    // For hard timeout, add an additional time to max wait for thread scheduling contention and GC
    long hardTimeoutMillis = waitTime.toMillis() + ADDITIONAL_WAIT_TIME.toMillis();
    Duration hardTimeout = new Duration(hardTimeoutMillis, MILLISECONDS);
    bindAsyncResponse(asyncResponse, responseFuture, responseExecutor)
            .withTimeout(
                    hardTimeout,
                    Response.status(Status.NO_CONTENT)
                            .header(TRINO_TASK_INSTANCE_ID, taskManager.getTaskInstanceId(taskId))
                            .header(TRINO_PAGE_TOKEN, token)
                            .header(TRINO_PAGE_NEXT_TOKEN, token)
                            .header(TRINO_BUFFER_COMPLETE, false)
                            .header(TRINO_TASK_FAILED, taskFailed)
                            .build());

    // Timing stats: time until the buffer read produced a response, and total request time.
    responseFuture.addListener(() -> readFromOutputBufferTime.add(Duration.nanosSince(startNanos)), directExecutor());
    asyncResponse.register((CompletionCallback) throwable -> resultsRequestTime.add(Duration.nanosSince(startNanos)));
}
Use of io.trino.execution.TaskState in project trino by trinodb.
The class PipelinedStageExecution, method updateTaskStatus.
/**
 * Applies a task status update to this stage execution.
 *
 * <p>Nothing happens if the stage state read from {@code stateMachine} is already done.
 * Otherwise:
 * <ul>
 * <li>{@code FAILED}: the stage is failed with the task's first reported failure, or with a
 * generic internal error if the task reported none;</li>
 * <li>{@code CANCELED} / {@code ABORTED}: the stage is failed with an internal error, because
 * the stage itself is not done;</li>
 * <li>{@code FLUSHING}: the task is added to {@code flushingTasks};</li>
 * <li>{@code FINISHED}: the task is added to {@code finishedTasks} and removed from
 * {@code flushingTasks};</li>
 * <li>any other state: no bookkeeping.</li>
 * </ul>
 * Afterwards, if the stage state read at the start of the call was {@code SCHEDULED},
 * {@code RUNNING} or {@code FLUSHING}, the state machine is asked to move to running,
 * flushing and/or finished as the task bookkeeping allows.
 *
 * @param taskStatus the latest status reported for a task of this stage
 */
private synchronized void updateTaskStatus(TaskStatus taskStatus) {
    State stageState = stateMachine.getState();
    if (stageState.isDone()) {
        return;
    }
    TaskState taskState = taskStatus.getState();
    switch (taskState) {
        case FAILED:
            // orElseGet: only build the fallback exception when the task reported no failure.
            RuntimeException failure = taskStatus.getFailures().stream()
                    .findFirst()
                    .map(this::rewriteTransportFailure)
                    .map(ExecutionFailureInfo::toException)
                    .orElseGet(() -> new TrinoException(GENERIC_INTERNAL_ERROR, "A task failed for an unknown reason"));
            fail(failure);
            break;
        case CANCELED:
            // A task should only be in the canceled state if the STAGE is cancelled
            fail(new TrinoException(GENERIC_INTERNAL_ERROR, "A task is in the CANCELED state but stage is " + stageState));
            break;
        case ABORTED:
            // A task should only be in the aborted state if the STAGE is done (ABORTED or FAILED)
            fail(new TrinoException(GENERIC_INTERNAL_ERROR, "A task is in the ABORTED state but stage is " + stageState));
            break;
        case FLUSHING:
            flushingTasks.add(taskStatus.getTaskId());
            break;
        case FINISHED:
            finishedTasks.add(taskStatus.getTaskId());
            flushingTasks.remove(taskStatus.getTaskId());
            break;
        default:
            // Other task states need no bookkeeping here.
            break;
    }
    // Uses the stage state captured at the top of the method, not a re-read value.
    if (stageState == SCHEDULED || stageState == RUNNING || stageState == FLUSHING) {
        if (taskState == TaskState.RUNNING) {
            stateMachine.transitionToRunning();
        }
        if (isFlushing()) {
            stateMachine.transitionToFlushing();
        }
        if (finishedTasks.containsAll(allTasks)) {
            stateMachine.transitionToFinished();
        }
    }
}
Aggregations