Search in sources :

Example 1 with TaskState

use of io.trino.execution.TaskState in project trino by trinodb.

The method updateTaskStatus of the class FaultTolerantStageScheduler.

/**
 * Reacts to a task status update for this stage. Statuses whose state is not done are ignored.
 * <p>
 * For a done task, while synchronized on this scheduler, the task is removed from the running
 * tasks, the task-finished future is swapped out, and the node lease held for the task is released.
 * Unless the partition has already finished or the scheduler is closed, the outcome is then
 * recorded: a FINISHED task marks its partition finished and aborts the tasks registered for that
 * partition, a FAILED task is either re-queued for another attempt or turned into a failure, and
 * CANCELED / ABORTED tasks are only logged.
 * <p>
 * Any throwable raised while processing the update is passed to {@code fail}.
 *
 * @param taskStatus status reported for the task
 * @param exchangeSinkInstanceHandle sink instance handle for the task; checked to be present when
 *         the task finished and this stage has a sink exchange
 */
private void updateTaskStatus(TaskStatus taskStatus, Optional<ExchangeSinkInstanceHandle> exchangeSinkInstanceHandle) {
    TaskState state = taskStatus.getState();
    if (!state.isDone()) {
        // only done tasks require bookkeeping
        return;
    }
    try {
        RuntimeException failure = null;
        SettableFuture<Void> future;
        synchronized (this) {
            TaskId taskId = taskStatus.getTaskId();
            runningTasks.remove(taskId);
            // capture the current future so it can be completed after the lock is released;
            // a fresh one is installed only while other tasks are still running
            future = taskFinishedFuture;
            if (!runningTasks.isEmpty()) {
                taskFinishedFuture = SettableFuture.create();
            } else {
                taskFinishedFuture = null;
            }
            NodeAllocator.NodeLease nodeLease = requireNonNull(runningNodes.remove(taskId), () -> "node not found for task id: " + taskId);
            nodeLease.release();
            int partitionId = taskId.getPartitionId();
            // outcomes for already finished partitions, or after close, are not recorded
            if (!finishedPartitions.contains(partitionId) && !closed) {
                switch(state) {
                    case FINISHED:
                        finishedPartitions.add(partitionId);
                        if (sinkExchange.isPresent()) {
                            checkArgument(exchangeSinkInstanceHandle.isPresent(), "exchangeSinkInstanceHandle is expected to be present");
                            sinkExchange.get().sinkFinished(exchangeSinkInstanceHandle.get());
                        }
                        // abort every task registered for the partition that just finished
                        partitionToRemoteTaskMap.get(partitionId).forEach(RemoteTask::abort);
                        break;
                    case CANCELED:
                        log.debug("Task cancelled: %s", taskId);
                        break;
                    case ABORTED:
                        log.debug("Task aborted: %s", taskId);
                        break;
                    case FAILED:
                        // build the generic fallback failure lazily: it is only needed when the task reported no failures
                        ExecutionFailureInfo failureInfo = taskStatus.getFailures().stream()
                                .findFirst()
                                .map(this::rewriteTransportFailure)
                                .orElseGet(() -> toFailure(new TrinoException(GENERIC_INTERNAL_ERROR, "A task failed for an unknown reason")));
                        log.warn(failureInfo.toException(), "Task failed: %s", taskId);
                        ErrorCode errorCode = failureInfo.getErrorCode();
                        int taskRemainingAttempts = remainingAttemptsPerTask.getOrDefault(partitionId, maxRetryAttemptsPerTask);
                        // retry only while both retry budgets allow it and the error is not a user error
                        if (remainingRetryAttemptsOverall > 0 && taskRemainingAttempts > 0 && (errorCode == null || errorCode.getType() != USER_ERROR)) {
                            remainingRetryAttemptsOverall--;
                            remainingAttemptsPerTask.put(partitionId, taskRemainingAttempts - 1);
                            // update memory limits for next attempt
                            MemoryRequirements memoryLimits = partitionMemoryRequirements.get(partitionId);
                            verify(memoryLimits != null);
                            MemoryRequirements newMemoryLimits = partitionMemoryEstimator.getNextRetryMemoryRequirements(session, memoryLimits, errorCode);
                            partitionMemoryRequirements.put(partitionId, newMemoryLimits);
                            // reschedule
                            queuedPartitions.add(partitionId);
                            log.debug("Retrying partition %s for stage %s", partitionId, stage.getStageId());
                        } else {
                            failure = failureInfo.toException();
                        }
                        break;
                    default:
                        throw new IllegalArgumentException("Unexpected task state: " + state);
                }
            }
        }
        if (failure != null) {
            // must be called outside the lock
            fail(failure);
        }
        // complete the future captured under the lock, if there was one and it is still pending
        if (future != null && !future.isDone()) {
            future.set(null);
        }
    } catch (Throwable t) {
        fail(t);
    }
}
Also used : TaskId(io.trino.execution.TaskId) MemoryRequirements(io.trino.execution.scheduler.PartitionMemoryEstimator.MemoryRequirements) RemoteTask(io.trino.execution.RemoteTask) ExecutionFailureInfo(io.trino.execution.ExecutionFailureInfo) TrinoException(io.trino.spi.TrinoException) ErrorCode(io.trino.spi.ErrorCode) MoreFutures.asVoid(io.airlift.concurrent.MoreFutures.asVoid) TaskState(io.trino.execution.TaskState)

Example 2 with TaskState

use of io.trino.execution.TaskState in project trino by trinodb.

The method getResults of the class TaskResource.

/**
 * Asynchronously returns serialized pages from one output buffer of a task.
 * <p>
 * The buffer read is bounded by a randomized wait time; when it elapses an empty result carrying
 * the requested token is used instead. The response is {@code 200 OK} with the pages as entity, or
 * {@code 204 No Content} when there are no pages, and always carries the task instance id, the
 * page tokens, the buffer-complete flag and the task-failed flag as headers. A second, slightly
 * longer timeout on the async response itself answers with {@code 204 No Content}.
 *
 * @param taskId task to read from; must not be null
 * @param bufferId output buffer to read from; must not be null
 * @param token token passed through to the task manager and echoed in the timeout responses
 * @param maxSize value of the max-size request header, passed through to the task manager
 * @param asyncResponse suspended response that is resumed with the result
 */
@ResourceSecurity(INTERNAL_ONLY)
@GET
@Path("{taskId}/results/{bufferId}/{token}")
@Produces(TRINO_PAGES)
public void getResults(
        @PathParam("taskId") TaskId taskId,
        @PathParam("bufferId") OutputBufferId bufferId,
        @PathParam("token") long token,
        @HeaderParam(TRINO_MAX_SIZE) DataSize maxSize,
        @Suspended AsyncResponse asyncResponse) {
    requireNonNull(taskId, "taskId is null");
    requireNonNull(bufferId, "bufferId is null");

    if (injectFailure(taskManager.getTraceToken(taskId), taskId, RequestType.GET_RESULTS, asyncResponse)) {
        return;
    }

    // the task state is sampled once, before the buffer is read
    TaskState currentState = taskManager.getTaskStatus(taskId).getState();
    boolean taskFailed = currentState == TaskState.ABORTED || currentState == TaskState.FAILED;

    long start = System.nanoTime();
    ListenableFuture<BufferResult> pendingResult = taskManager.getTaskResults(taskId, bufferId, token, maxSize);
    Duration waitTime = randomizeWaitTime(DEFAULT_MAX_WAIT_TIME);
    // fall back to an empty result for the same token once the wait time elapses
    ListenableFuture<BufferResult> boundedResult = addTimeout(
            pendingResult,
            () -> BufferResult.emptyResults(taskManager.getTaskInstanceId(taskId), token, false),
            waitTime,
            timeoutExecutor);

    ListenableFuture<Response> responseFuture = Futures.transform(
            boundedResult,
            result -> {
                List<Slice> pages = result.getSerializedPages();
                GenericEntity<?> entity;
                Status status;
                if (!pages.isEmpty()) {
                    entity = new GenericEntity<>(pages, new TypeToken<List<Slice>>() {
                    }.getType());
                    status = Status.OK;
                } else {
                    // no pages: answer without an entity
                    entity = null;
                    status = Status.NO_CONTENT;
                }
                return Response.status(status)
                        .entity(entity)
                        .header(TRINO_TASK_INSTANCE_ID, result.getTaskInstanceId())
                        .header(TRINO_PAGE_TOKEN, result.getToken())
                        .header(TRINO_PAGE_NEXT_TOKEN, result.getNextToken())
                        .header(TRINO_BUFFER_COMPLETE, result.isBufferComplete())
                        .header(TRINO_TASK_FAILED, taskFailed)
                        .build();
            },
            directExecutor());

    // For hard timeout, add an additional time to max wait for thread scheduling contention and GC
    Duration timeout = new Duration(waitTime.toMillis() + ADDITIONAL_WAIT_TIME.toMillis(), MILLISECONDS);
    bindAsyncResponse(asyncResponse, responseFuture, responseExecutor)
            .withTimeout(
                    timeout,
                    Response.status(Status.NO_CONTENT)
                            .header(TRINO_TASK_INSTANCE_ID, taskManager.getTaskInstanceId(taskId))
                            .header(TRINO_PAGE_TOKEN, token)
                            .header(TRINO_PAGE_NEXT_TOKEN, token)
                            .header(TRINO_BUFFER_COMPLETE, false)
                            .header(TRINO_TASK_FAILED, taskFailed)
                            .build());

    // timing stats: time until the buffer read completes, and time until the whole request completes
    responseFuture.addListener(() -> readFromOutputBufferTime.add(Duration.nanosSince(start)), directExecutor());
    asyncResponse.register((CompletionCallback) throwable -> resultsRequestTime.add(Duration.nanosSince(start)));
}
Also used : Status(javax.ws.rs.core.Response.Status) TaskStatus(io.trino.execution.TaskStatus) ResourceSecurity(io.trino.server.security.ResourceSecurity) Produces(javax.ws.rs.Produces) Iterables.transform(com.google.common.collect.Iterables.transform) TRINO_PAGES(io.trino.TrinoMediaTypes.TRINO_PAGES) Path(javax.ws.rs.Path) OutputBufferId(io.trino.execution.buffer.OutputBuffers.OutputBufferId) Duration(io.airlift.units.Duration) MediaType(javax.ws.rs.core.MediaType) QueryParam(javax.ws.rs.QueryParam) Consumes(javax.ws.rs.Consumes) BoundedExecutor(io.airlift.concurrent.BoundedExecutor) DefaultValue(javax.ws.rs.DefaultValue) HeaderParam(javax.ws.rs.HeaderParam) INTERNAL_ONLY(io.trino.server.security.ResourceSecurity.AccessType.INTERNAL_ONLY) FailureInjector(io.trino.execution.FailureInjector) DELETE(javax.ws.rs.DELETE) Context(javax.ws.rs.core.Context) TRINO_PAGE_TOKEN(io.trino.server.InternalHeaders.TRINO_PAGE_TOKEN) AsyncResponse(javax.ws.rs.container.AsyncResponse) GenericEntity(javax.ws.rs.core.GenericEntity) MILLISECONDS(java.util.concurrent.TimeUnit.MILLISECONDS) TaskId(io.trino.execution.TaskId) Suspended(javax.ws.rs.container.Suspended) InjectedFailure(io.trino.execution.FailureInjector.InjectedFailure) MoreExecutors.directExecutor(com.google.common.util.concurrent.MoreExecutors.directExecutor) TRINO_TASK_INSTANCE_ID(io.trino.server.InternalHeaders.TRINO_TASK_INSTANCE_ID) SessionPropertyManager(io.trino.metadata.SessionPropertyManager) DataSize(io.airlift.units.DataSize) List(java.util.List) Response(javax.ws.rs.core.Response) CompletionCallback(javax.ws.rs.container.CompletionCallback) TaskInfo(io.trino.execution.TaskInfo) Optional(java.util.Optional) UriInfo(javax.ws.rs.core.UriInfo) TRINO_CURRENT_VERSION(io.trino.server.InternalHeaders.TRINO_CURRENT_VERSION) AsyncResponseHandler.bindAsyncResponse(io.airlift.jaxrs.AsyncResponseHandler.bindAsyncResponse) Session(io.trino.Session) Nested(org.weakref.jmx.Nested) PathParam(javax.ws.rs.PathParam) 
ListenableFuture(com.google.common.util.concurrent.ListenableFuture) Slice(io.airlift.slice.Slice) TRINO_TASK_FAILED(io.trino.server.InternalHeaders.TRINO_TASK_FAILED) Logger(io.airlift.log.Logger) GET(javax.ws.rs.GET) SqlTaskManager(io.trino.execution.SqlTaskManager) TypeToken(com.google.common.reflect.TypeToken) TRINO_PAGE_NEXT_TOKEN(io.trino.server.InternalHeaders.TRINO_PAGE_NEXT_TOKEN) TRINO_MAX_SIZE(io.trino.server.InternalHeaders.TRINO_MAX_SIZE) Inject(javax.inject.Inject) BufferResult(io.trino.execution.buffer.BufferResult) ImmutableList(com.google.common.collect.ImmutableList) Managed(org.weakref.jmx.Managed) TRINO_MAX_WAIT(io.trino.server.InternalHeaders.TRINO_MAX_WAIT) ThreadLocalRandom(java.util.concurrent.ThreadLocalRandom) Objects.requireNonNull(java.util.Objects.requireNonNull) TaskState(io.trino.execution.TaskState) ScheduledExecutorService(java.util.concurrent.ScheduledExecutorService) MoreFutures.addTimeout(io.airlift.concurrent.MoreFutures.addTimeout) TimeStat(io.airlift.stats.TimeStat) Status(javax.ws.rs.core.Response.Status) TRINO_BUFFER_COMPLETE(io.trino.server.InternalHeaders.TRINO_BUFFER_COMPLETE) POST(javax.ws.rs.POST) Executor(java.util.concurrent.Executor) TaskStatus(io.trino.execution.TaskStatus) Futures(com.google.common.util.concurrent.Futures) SECONDS(java.util.concurrent.TimeUnit.SECONDS) Duration(io.airlift.units.Duration) AsyncResponse(javax.ws.rs.container.AsyncResponse) Response(javax.ws.rs.core.Response) AsyncResponseHandler.bindAsyncResponse(io.airlift.jaxrs.AsyncResponseHandler.bindAsyncResponse) BufferResult(io.trino.execution.buffer.BufferResult) Slice(io.airlift.slice.Slice) List(java.util.List) ImmutableList(com.google.common.collect.ImmutableList) TaskState(io.trino.execution.TaskState) Path(javax.ws.rs.Path) Produces(javax.ws.rs.Produces) GET(javax.ws.rs.GET) ResourceSecurity(io.trino.server.security.ResourceSecurity)

Example 3 with TaskState

use of io.trino.execution.TaskState in project trino by trinodb.

The method updateTaskStatus of the class PipelinedStageExecution.

/**
 * Applies a task status update to this stage execution. Updates are ignored once the stage state
 * is done.
 * <p>
 * A FAILED, CANCELED or ABORTED task fails the stage; FLUSHING and FINISHED tasks are recorded in
 * the corresponding task sets. Afterwards, if the stage state read at method entry was SCHEDULED,
 * RUNNING or FLUSHING, the state machine is asked to transition to running, flushing and/or
 * finished depending on the task state and the recorded task sets.
 *
 * @param taskStatus status reported for a task of this stage
 */
private synchronized void updateTaskStatus(TaskStatus taskStatus) {
    State stageState = stateMachine.getState();
    if (stageState.isDone()) {
        return;
    }
    TaskState taskState = taskStatus.getState();
    switch(taskState) {
        case FAILED:
            // build the generic fallback exception lazily: it is only needed when the task reported no failures
            RuntimeException failure = taskStatus.getFailures().stream()
                    .findFirst()
                    .map(this::rewriteTransportFailure)
                    .map(ExecutionFailureInfo::toException)
                    .orElseGet(() -> new TrinoException(GENERIC_INTERNAL_ERROR, "A task failed for an unknown reason"));
            fail(failure);
            break;
        case CANCELED:
            // A task should only be in the canceled state if the STAGE is cancelled
            fail(new TrinoException(GENERIC_INTERNAL_ERROR, "A task is in the CANCELED state but stage is " + stageState));
            break;
        case ABORTED:
            // A task should only be in the aborted state if the STAGE is done (ABORTED or FAILED)
            fail(new TrinoException(GENERIC_INTERNAL_ERROR, "A task is in the ABORTED state but stage is " + stageState));
            break;
        case FLUSHING:
            flushingTasks.add(taskStatus.getTaskId());
            break;
        case FINISHED:
            // a finished task is no longer flushing
            finishedTasks.add(taskStatus.getTaskId());
            flushingTasks.remove(taskStatus.getTaskId());
            break;
        default:
            // remaining task states need no per-task bookkeeping here
    }
    // note: stageState is the value read at method entry, before any fail(...) call above
    if (stageState == SCHEDULED || stageState == RUNNING || stageState == FLUSHING) {
        if (taskState == TaskState.RUNNING) {
            stateMachine.transitionToRunning();
        }
        if (isFlushing()) {
            stateMachine.transitionToFlushing();
        }
        if (finishedTasks.containsAll(allTasks)) {
            stateMachine.transitionToFinished();
        }
    }
}
Also used : Preconditions.checkState(com.google.common.base.Preconditions.checkState) TaskState(io.trino.execution.TaskState) TrinoException(io.trino.spi.TrinoException) TaskState(io.trino.execution.TaskState)

Aggregations

TaskState (io.trino.execution.TaskState)3 TaskId (io.trino.execution.TaskId)2 TrinoException (io.trino.spi.TrinoException)2 Preconditions.checkState (com.google.common.base.Preconditions.checkState)1 ImmutableList (com.google.common.collect.ImmutableList)1 Iterables.transform (com.google.common.collect.Iterables.transform)1 TypeToken (com.google.common.reflect.TypeToken)1 Futures (com.google.common.util.concurrent.Futures)1 ListenableFuture (com.google.common.util.concurrent.ListenableFuture)1 MoreExecutors.directExecutor (com.google.common.util.concurrent.MoreExecutors.directExecutor)1 BoundedExecutor (io.airlift.concurrent.BoundedExecutor)1 MoreFutures.addTimeout (io.airlift.concurrent.MoreFutures.addTimeout)1 MoreFutures.asVoid (io.airlift.concurrent.MoreFutures.asVoid)1 AsyncResponseHandler.bindAsyncResponse (io.airlift.jaxrs.AsyncResponseHandler.bindAsyncResponse)1 Logger (io.airlift.log.Logger)1 Slice (io.airlift.slice.Slice)1 TimeStat (io.airlift.stats.TimeStat)1 DataSize (io.airlift.units.DataSize)1 Duration (io.airlift.units.Duration)1 Session (io.trino.Session)1