
Example 1 with RemoteTask

Use of io.trino.execution.RemoteTask in project trino by trinodb.

From class FaultTolerantStageScheduler, method schedule().

public synchronized void schedule() throws Exception {
    if (failure != null) {
        propagateIfPossible(failure, Exception.class);
        throw new RuntimeException(failure);
    }
    if (closed) {
        return;
    }
    if (isFinished()) {
        return;
    }
    if (!blocked.isDone()) {
        return;
    }
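    // lazily create the task source; wait for all source exchange handles to become available first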
    if (taskSource == null) {
        Map<PlanFragmentId, ListenableFuture<List<ExchangeSourceHandle>>> sourceHandles = sourceExchanges.entrySet().stream().collect(toImmutableMap(Map.Entry::getKey, entry -> toListenableFuture(entry.getValue().getSourceHandles())));
        List<ListenableFuture<List<ExchangeSourceHandle>>> blockedFutures = sourceHandles.values().stream().filter(future -> !future.isDone()).collect(toImmutableList());
        if (!blockedFutures.isEmpty()) {
            blocked = asVoid(allAsList(blockedFutures));
            return;
        }
        Multimap<PlanFragmentId, ExchangeSourceHandle> exchangeSources = sourceHandles.entrySet().stream().collect(flatteningToImmutableListMultimap(Map.Entry::getKey, entry -> getFutureValue(entry.getValue()).stream()));
        taskSource = taskSourceFactory.create(session, stage.getFragment(), sourceExchanges, exchangeSources, stage::recordGetSplitTime, sourceBucketToPartitionMap, sourceBucketNodeMap);
    }
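    // main scheduling loop: pull task descriptors, acquire nodes, and launch remote tasks until nothing is queued or we have to wait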
    while (!queuedPartitions.isEmpty() || !taskSource.isFinished()) {
        while (queuedPartitions.isEmpty() && !taskSource.isFinished()) {
            List<TaskDescriptor> tasks = taskSource.getMoreTasks();
            for (TaskDescriptor task : tasks) {
                queuedPartitions.add(task.getPartitionId());
                allPartitions.add(task.getPartitionId());
                taskDescriptorStorage.put(stage.getStageId(), task);
                sinkExchange.ifPresent(exchange -> {
                    ExchangeSinkHandle exchangeSinkHandle = exchange.addSink(task.getPartitionId());
                    partitionToExchangeSinkHandleMap.put(task.getPartitionId(), exchangeSinkHandle);
                });
            }
            if (taskSource.isFinished()) {
                sinkExchange.ifPresent(Exchange::noMoreSinks);
            }
        }
        if (queuedPartitions.isEmpty()) {
            break;
        }
        int partition = queuedPartitions.peek();
        Optional<TaskDescriptor> taskDescriptorOptional = taskDescriptorStorage.get(stage.getStageId(), partition);
        if (taskDescriptorOptional.isEmpty()) {
            // query has been terminated
            return;
        }
        TaskDescriptor taskDescriptor = taskDescriptorOptional.get();
        MemoryRequirements memoryRequirements = partitionMemoryRequirements.computeIfAbsent(partition, ignored -> partitionMemoryEstimator.getInitialMemoryRequirements(session, taskDescriptor.getNodeRequirements().getMemory()));
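        // acquire a node lease matching the task's node and memory requirements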
        if (nodeLease == null) {
            NodeRequirements nodeRequirements = taskDescriptor.getNodeRequirements();
            nodeRequirements = nodeRequirements.withMemory(memoryRequirements.getRequiredMemory());
            nodeLease = nodeAllocator.acquire(nodeRequirements);
        }
        if (!nodeLease.getNode().isDone()) {
            blocked = asVoid(nodeLease.getNode());
            return;
        }
        NodeInfo node = getFutureValue(nodeLease.getNode());
        queuedPartitions.poll();
        Multimap<PlanNodeId, Split> tableScanSplits = taskDescriptor.getSplits();
        Multimap<PlanNodeId, Split> remoteSplits = createRemoteSplits(taskDescriptor.getExchangeSourceHandles());
        Multimap<PlanNodeId, Split> taskSplits = ImmutableListMultimap.<PlanNodeId, Split>builder().putAll(tableScanSplits).putAll(remoteSplits).build();
        int attemptId = getNextAttemptIdForPartition(partition);
        OutputBuffers outputBuffers;
        Optional<ExchangeSinkInstanceHandle> exchangeSinkInstanceHandle;
        if (sinkExchange.isPresent()) {
            ExchangeSinkHandle sinkHandle = partitionToExchangeSinkHandleMap.get(partition);
            exchangeSinkInstanceHandle = Optional.of(sinkExchange.get().instantiateSink(sinkHandle, attemptId));
            outputBuffers = createSpoolingExchangeOutputBuffers(exchangeSinkInstanceHandle.get());
        } else {
            exchangeSinkInstanceHandle = Optional.empty();
            // stage will be consumed by the coordinator using direct exchange
            outputBuffers = createInitialEmptyOutputBuffers(PARTITIONED).withBuffer(new OutputBuffers.OutputBufferId(0), 0).withNoMoreBufferIds();
        }
        Set<PlanNodeId> allSourcePlanNodeIds = ImmutableSet.<PlanNodeId>builder().addAll(stage.getFragment().getPartitionedSources()).addAll(stage.getFragment().getRemoteSourceNodes().stream().map(RemoteSourceNode::getId).iterator()).build();
        RemoteTask task = stage.createTask(node.getNode(), partition, attemptId, sinkBucketToPartitionMap, outputBuffers, taskSplits, allSourcePlanNodeIds.stream().collect(toImmutableListMultimap(Function.identity(), planNodeId -> Lifespan.taskWide())), allSourcePlanNodeIds).orElseThrow(() -> new VerifyException("stage execution is expected to be active"));
        partitionToRemoteTaskMap.put(partition, task);
        runningTasks.put(task.getTaskId(), task);
        runningNodes.put(task.getTaskId(), nodeLease);
        nodeLease = null;
        if (taskFinishedFuture == null) {
            taskFinishedFuture = SettableFuture.create();
        }
        taskLifecycleListener.taskCreated(stage.getFragment().getId(), task);
        task.addStateChangeListener(taskStatus -> updateTaskStatus(taskStatus, exchangeSinkInstanceHandle));
        task.start();
    }
    if (taskFinishedFuture != null && !taskFinishedFuture.isDone()) {
        blocked = taskFinishedFuture;
    }
}
Also used : ArrayListMultimap(com.google.common.collect.ArrayListMultimap) SettableFuture(com.google.common.util.concurrent.SettableFuture) RemoteSourceNode(io.trino.sql.planner.plan.RemoteSourceNode) Preconditions.checkArgument(com.google.common.base.Preconditions.checkArgument) Throwables.propagateIfPossible(com.google.common.base.Throwables.propagateIfPossible) ImmutableListMultimap.toImmutableListMultimap(com.google.common.collect.ImmutableListMultimap.toImmutableListMultimap) MemoryRequirements(io.trino.execution.scheduler.PartitionMemoryEstimator.MemoryRequirements) PlanNodeId(io.trino.sql.planner.plan.PlanNodeId) Map(java.util.Map) SpoolingExchangeInput(io.trino.split.RemoteSplit.SpoolingExchangeInput) REMOTE_HOST_GONE(io.trino.spi.StandardErrorCode.REMOTE_HOST_GONE) Futures.immediateVoidFuture(com.google.common.util.concurrent.Futures.immediateVoidFuture) ImmutableSet(com.google.common.collect.ImmutableSet) ExchangeSinkInstanceHandle(io.trino.spi.exchange.ExchangeSinkInstanceHandle) OutputBuffers.createSpoolingExchangeOutputBuffers(io.trino.execution.buffer.OutputBuffers.createSpoolingExchangeOutputBuffers) ImmutableMap(com.google.common.collect.ImmutableMap) ExecutionFailureInfo(io.trino.execution.ExecutionFailureInfo) Futures.allAsList(com.google.common.util.concurrent.Futures.allAsList) ImmutableList.toImmutableList(com.google.common.collect.ImmutableList.toImmutableList) MoreFutures.toListenableFuture(io.airlift.concurrent.MoreFutures.toListenableFuture) Set(java.util.Set) TrinoException(io.trino.spi.TrinoException) GONE(io.trino.failuredetector.FailureDetector.State.GONE) GuardedBy(javax.annotation.concurrent.GuardedBy) TaskId(io.trino.execution.TaskId) ExchangeSinkHandle(io.trino.spi.exchange.ExchangeSinkHandle) List(java.util.List) ImmutableMap.toImmutableMap(com.google.common.collect.ImmutableMap.toImmutableMap) Split(io.trino.metadata.Split) ImmutableListMultimap(com.google.common.collect.ImmutableListMultimap) Optional(java.util.Optional) Queue(java.util.Queue) PlanFragmentId(io.trino.sql.planner.plan.PlanFragmentId) OutputBuffers.createInitialEmptyOutputBuffers(io.trino.execution.buffer.OutputBuffers.createInitialEmptyOutputBuffers) ExchangeSourceHandle(io.trino.spi.exchange.ExchangeSourceHandle) Session(io.trino.Session) ImmutableListMultimap.flatteningToImmutableListMultimap(com.google.common.collect.ImmutableListMultimap.flatteningToImmutableListMultimap) ListenableFuture(com.google.common.util.concurrent.ListenableFuture) StageId(io.trino.execution.StageId) Logger(io.airlift.log.Logger) HashMap(java.util.HashMap) Multimap(com.google.common.collect.Multimap) ErrorCode(io.trino.spi.ErrorCode) Function(java.util.function.Function) Failures.toFailure(io.trino.util.Failures.toFailure) RemoteSplit(io.trino.split.RemoteSplit) HashSet(java.util.HashSet) ImmutableList(com.google.common.collect.ImmutableList) Verify.verify(com.google.common.base.Verify.verify) USER_ERROR(io.trino.spi.ErrorType.USER_ERROR) Objects.requireNonNull(java.util.Objects.requireNonNull) TaskState(io.trino.execution.TaskState) Lifespan(io.trino.execution.Lifespan) Exchange(io.trino.spi.exchange.Exchange) VerifyException(com.google.common.base.VerifyException) SqlStage(io.trino.execution.SqlStage) FailureDetector(io.trino.failuredetector.FailureDetector) RemoteTask(io.trino.execution.RemoteTask) TaskStatus(io.trino.execution.TaskStatus) MoreFutures.getFutureValue(io.airlift.concurrent.MoreFutures.getFutureValue) GENERIC_INTERNAL_ERROR(io.trino.spi.StandardErrorCode.GENERIC_INTERNAL_ERROR) 
MoreFutures.asVoid(io.airlift.concurrent.MoreFutures.asVoid) PARTITIONED(io.trino.execution.buffer.OutputBuffers.BufferType.PARTITIONED) Futures.nonCancellationPropagating(com.google.common.util.concurrent.Futures.nonCancellationPropagating) OutputBuffers(io.trino.execution.buffer.OutputBuffers) ArrayDeque(java.util.ArrayDeque) REMOTE_CONNECTOR_ID(io.trino.operator.ExchangeOperator.REMOTE_CONNECTOR_ID)
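
The schedule() method above never waits inline: whenever a dependency (exchange source handles, a node lease, or a running task) is not ready, it records the pending future in the blocked field and returns, and the caller re-invokes schedule() once that future completes. The following is a minimal, self-contained sketch of that pattern using only Guava futures; the Scheduler class and acquireDependency() are hypothetical stand-ins, not Trino APIs.

import com.google.common.util.concurrent.Futures;
import com.google.common.util.concurrent.ListenableFuture;
import com.google.common.util.concurrent.SettableFuture;

// Hypothetical sketch of the "record a blocking future and return" pattern.
public class Scheduler {
    // a completed future means "not blocked"; the caller re-invokes schedule() when it fires
    private ListenableFuture<Void> blocked = Futures.immediateVoidFuture();

    public synchronized void schedule() {
        if (!blocked.isDone()) {
            // still waiting on the previously recorded dependency
            return;
        }
        ListenableFuture<Void> dependency = acquireDependency();
        if (!dependency.isDone()) {
            // remember what we are waiting for and yield to the caller
            blocked = dependency;
            return;
        }
        // dependency is ready: proceed with the actual scheduling work
    }

    private ListenableFuture<Void> acquireDependency() {
        // placeholder: in FaultTolerantStageScheduler this is exchange source handles,
        // a node lease, or the completion of a running task
        return SettableFuture.create();
    }
}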

Example 2 with RemoteTask

Use of io.trino.execution.RemoteTask in project trino by trinodb.

From class FaultTolerantStageScheduler, method updateTaskStatus().

private void updateTaskStatus(TaskStatus taskStatus, Optional<ExchangeSinkInstanceHandle> exchangeSinkInstanceHandle) {
    TaskState state = taskStatus.getState();
    if (!state.isDone()) {
        return;
    }
    try {
        RuntimeException failure = null;
        SettableFuture<Void> future;
        synchronized (this) {
            TaskId taskId = taskStatus.getTaskId();
            runningTasks.remove(taskId);
            future = taskFinishedFuture;
            if (!runningTasks.isEmpty()) {
                taskFinishedFuture = SettableFuture.create();
            } else {
                taskFinishedFuture = null;
            }
            NodeAllocator.NodeLease nodeLease = requireNonNull(runningNodes.remove(taskId), () -> "node not found for task id: " + taskId);
            nodeLease.release();
            int partitionId = taskId.getPartitionId();
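            // ignore updates for partitions that already finished, and do nothing once the scheduler is closed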
            if (!finishedPartitions.contains(partitionId) && !closed) {
                switch(state) {
                    case FINISHED:
                        finishedPartitions.add(partitionId);
                        if (sinkExchange.isPresent()) {
                            checkArgument(exchangeSinkInstanceHandle.isPresent(), "exchangeSinkInstanceHandle is expected to be present");
                            sinkExchange.get().sinkFinished(exchangeSinkInstanceHandle.get());
                        }
                        partitionToRemoteTaskMap.get(partitionId).forEach(RemoteTask::abort);
                        break;
                    case CANCELED:
                        log.debug("Task cancelled: %s", taskId);
                        break;
                    case ABORTED:
                        log.debug("Task aborted: %s", taskId);
                        break;
                    case FAILED:
                        ExecutionFailureInfo failureInfo = taskStatus.getFailures().stream().findFirst().map(this::rewriteTransportFailure).orElse(toFailure(new TrinoException(GENERIC_INTERNAL_ERROR, "A task failed for an unknown reason")));
                        log.warn(failureInfo.toException(), "Task failed: %s", taskId);
                        ErrorCode errorCode = failureInfo.getErrorCode();
                        int taskRemainingAttempts = remainingAttemptsPerTask.getOrDefault(partitionId, maxRetryAttemptsPerTask);
                        if (remainingRetryAttemptsOverall > 0 && taskRemainingAttempts > 0 && (errorCode == null || errorCode.getType() != USER_ERROR)) {
                            remainingRetryAttemptsOverall--;
                            remainingAttemptsPerTask.put(partitionId, taskRemainingAttempts - 1);
                            // update memory limits for next attempt
                            MemoryRequirements memoryLimits = partitionMemoryRequirements.get(partitionId);
                            verify(memoryLimits != null);
                            MemoryRequirements newMemoryLimits = partitionMemoryEstimator.getNextRetryMemoryRequirements(session, memoryLimits, errorCode);
                            partitionMemoryRequirements.put(partitionId, newMemoryLimits);
                            // reschedule
                            queuedPartitions.add(partitionId);
                            log.debug("Retrying partition %s for stage %s", partitionId, stage.getStageId());
                        } else {
                            failure = failureInfo.toException();
                        }
                        break;
                    default:
                        throw new IllegalArgumentException("Unexpected task state: " + state);
                }
            }
        }
        if (failure != null) {
            // must be called outside the lock
            fail(failure);
        }
        if (future != null && !future.isDone()) {
            future.set(null);
        }
    } catch (Throwable t) {
        fail(t);
    }
}
Also used : TaskId(io.trino.execution.TaskId) MemoryRequirements(io.trino.execution.scheduler.PartitionMemoryEstimator.MemoryRequirements) RemoteTask(io.trino.execution.RemoteTask) ExecutionFailureInfo(io.trino.execution.ExecutionFailureInfo) TrinoException(io.trino.spi.TrinoException) ErrorCode(io.trino.spi.ErrorCode) MoreFutures.asVoid(io.airlift.concurrent.MoreFutures.asVoid) TaskState(io.trino.execution.TaskState)
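
The FAILED branch above retries a partition only while both the overall retry budget and the per-task budget are positive and the error is not a user error. The sketch below isolates that decision in a self-contained class; the name RetryBudget and the userError flag are illustrative only, not Trino API.

import java.util.HashMap;
import java.util.Map;

// Hypothetical sketch of the retry-budget decision made in the FAILED branch.
public class RetryBudget {
    private final int maxRetryAttemptsPerTask;
    private int remainingRetryAttemptsOverall;
    private final Map<Integer, Integer> remainingAttemptsPerTask = new HashMap<>();

    public RetryBudget(int maxRetryAttemptsPerTask, int maxRetryAttemptsOverall) {
        this.maxRetryAttemptsPerTask = maxRetryAttemptsPerTask;
        this.remainingRetryAttemptsOverall = maxRetryAttemptsOverall;
    }

    // returns true if the failed partition should be re-queued for another attempt
    public synchronized boolean tryRetry(int partitionId, boolean userError) {
        int taskRemainingAttempts = remainingAttemptsPerTask.getOrDefault(partitionId, maxRetryAttemptsPerTask);
        if (userError || remainingRetryAttemptsOverall <= 0 || taskRemainingAttempts <= 0) {
            return false;
        }
        remainingRetryAttemptsOverall--;
        remainingAttemptsPerTask.put(partitionId, taskRemainingAttempts - 1);
        return true;
    }
}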

Example 3 with RemoteTask

Use of io.trino.execution.RemoteTask in project trino by trinodb.

From class FixedSourcePartitionedScheduler, method schedule().

@Override
public ScheduleResult schedule() {
    // schedule a task on every node in the distribution
    List<RemoteTask> newTasks = ImmutableList.of();
    if (scheduledTasks.isEmpty()) {
        ImmutableList.Builder<RemoteTask> newTasksBuilder = ImmutableList.builder();
        for (InternalNode node : nodes) {
            Optional<RemoteTask> task = stageExecution.scheduleTask(node, partitionIdAllocator.getNextId(), ImmutableMultimap.of(), ImmutableMultimap.of());
            if (task.isPresent()) {
                scheduledTasks.put(node, task.get());
                newTasksBuilder.add(task.get());
            }
        }
        newTasks = newTasksBuilder.build();
    }
    boolean allBlocked = true;
    List<ListenableFuture<Void>> blocked = new ArrayList<>();
    BlockedReason blockedReason = BlockedReason.NO_ACTIVE_DRIVER_GROUP;
    if (groupedLifespanScheduler.isPresent()) {
        // Start new driver groups on the first scheduler if necessary,
        // i.e. when previous ones have finished execution (not finished scheduling).
        // 
        // Invoke schedule method to get a new SettableFuture every time.
        // Reusing previously returned SettableFuture could lead to the ListenableFuture retaining too many listeners.
        blocked.add(groupedLifespanScheduler.get().schedule(sourceSchedulers.get(0)));
    }
    int splitsScheduled = 0;
    Iterator<SourceScheduler> schedulerIterator = sourceSchedulers.iterator();
    List<Lifespan> driverGroupsToStart = ImmutableList.of();
    boolean shouldInvokeNoMoreDriverGroups = false;
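    // for each source scheduler, start driver groups drained from the previous scheduler, then let it schedule splits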
    while (schedulerIterator.hasNext()) {
        SourceScheduler sourceScheduler = schedulerIterator.next();
        for (Lifespan lifespan : driverGroupsToStart) {
            sourceScheduler.startLifespan(lifespan, partitionHandleFor(lifespan));
        }
        if (shouldInvokeNoMoreDriverGroups) {
            sourceScheduler.noMoreLifespans();
        }
        ScheduleResult schedule = sourceScheduler.schedule();
        splitsScheduled += schedule.getSplitsScheduled();
        if (schedule.getBlockedReason().isPresent()) {
            blocked.add(schedule.getBlocked());
            blockedReason = blockedReason.combineWith(schedule.getBlockedReason().get());
        } else {
            verify(schedule.getBlocked().isDone(), "blockedReason not provided when scheduler is blocked");
            allBlocked = false;
        }
        driverGroupsToStart = sourceScheduler.drainCompletedLifespans();
        if (schedule.isFinished()) {
            stageExecution.schedulingComplete(sourceScheduler.getPlanNodeId());
            schedulerIterator.remove();
            sourceScheduler.close();
            shouldInvokeNoMoreDriverGroups = true;
        } else {
            shouldInvokeNoMoreDriverGroups = false;
        }
    }
    if (allBlocked) {
        return new ScheduleResult(sourceSchedulers.isEmpty(), newTasks, whenAnyComplete(blocked), blockedReason, splitsScheduled);
    } else {
        return new ScheduleResult(sourceSchedulers.isEmpty(), newTasks, splitsScheduled);
    }
}
Also used : BlockedReason(io.trino.execution.scheduler.ScheduleResult.BlockedReason) ImmutableList(com.google.common.collect.ImmutableList) ArrayList(java.util.ArrayList) RemoteTask(io.trino.execution.RemoteTask) SourcePartitionedScheduler.newSourcePartitionedSchedulerAsSourceScheduler(io.trino.execution.scheduler.SourcePartitionedScheduler.newSourcePartitionedSchedulerAsSourceScheduler) ListenableFuture(com.google.common.util.concurrent.ListenableFuture) InternalNode(io.trino.metadata.InternalNode) Lifespan(io.trino.execution.Lifespan)
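
When every source scheduler is blocked, the method returns a ScheduleResult whose blocking future completes as soon as any of the collected futures does (whenAnyComplete). A minimal sketch of that "unblock on first completion" behaviour with plain Guava is shown below; the helper name anyOf is made up for the illustration.

import com.google.common.util.concurrent.ListenableFuture;
import com.google.common.util.concurrent.MoreExecutors;
import com.google.common.util.concurrent.SettableFuture;
import java.util.List;

// Hypothetical sketch: complete the returned future when any input future completes.
public class BlockedFutures {
    public static ListenableFuture<Void> anyOf(List<ListenableFuture<Void>> blocked) {
        SettableFuture<Void> any = SettableFuture.create();
        for (ListenableFuture<Void> future : blocked) {
            // the first future to fire unblocks the result; later set() calls are no-ops
            future.addListener(() -> any.set(null), MoreExecutors.directExecutor());
        }
        return any;
    }
}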

Example 4 with RemoteTask

Use of io.trino.execution.RemoteTask in project trino by trinodb.

From class PipelinedStageExecution, method failTaskRemotely().

@Override
public synchronized void failTaskRemotely(TaskId taskId, Throwable failureCause) {
    RemoteTask task = requireNonNull(tasks.get(taskId.getPartitionId()), () -> "task not found: " + taskId);
    task.failRemotely(failureCause);
// not failing stage just yet; it will happen as a result of task failure
}
Also used : RemoteTask(io.trino.execution.RemoteTask)

Example 5 with RemoteTask

Use of io.trino.execution.RemoteTask in project trino by trinodb.

From class PipelinedStageExecution, method updateSourceTasksOutputBuffers().

private synchronized void updateSourceTasksOutputBuffers(Consumer<OutputBufferManager> updater) {
    for (PlanFragmentId sourceFragment : exchangeSources.keySet()) {
        OutputBufferManager outputBufferManager = outputBufferManagers.get(sourceFragment);
        updater.accept(outputBufferManager);
        for (RemoteTask sourceTask : sourceTasks.get(sourceFragment)) {
            sourceTask.setOutputBuffers(outputBufferManager.getOutputBuffers());
        }
    }
}
Also used : RemoteTask(io.trino.execution.RemoteTask) PlanFragmentId(io.trino.sql.planner.plan.PlanFragmentId)
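
The method applies a single update to each source fragment's OutputBufferManager and then pushes the resulting buffer set to every task of that source fragment. A simplified, generic version of the same fan-out is sketched below; FragmentId, BufferManager, and Task are hypothetical placeholders rather than the real Trino types.

import com.google.common.collect.Multimap;
import java.util.Map;
import java.util.function.BiConsumer;
import java.util.function.Consumer;

// Hypothetical sketch of the per-fragment update fan-out performed above.
public class OutputBufferFanOut<FragmentId, BufferManager, Task> {
    public void update(
            Map<FragmentId, BufferManager> managers,
            Multimap<FragmentId, Task> tasksByFragment,
            Consumer<BufferManager> updater,
            BiConsumer<Task, BufferManager> push) {
        for (Map.Entry<FragmentId, BufferManager> entry : managers.entrySet()) {
            // mutate the fragment's buffer manager first ...
            updater.accept(entry.getValue());
            // ... then propagate the new buffer state to every task of that fragment
            for (Task task : tasksByFragment.get(entry.getKey())) {
                push.accept(task, entry.getValue());
            }
        }
    }
}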

Aggregations

RemoteTask (io.trino.execution.RemoteTask): 26 usages
InternalNode (io.trino.metadata.InternalNode): 11 usages
Test (org.testng.annotations.Test): 11 usages
NodeTaskMap (io.trino.execution.NodeTaskMap): 9 usages
ImmutableList (com.google.common.collect.ImmutableList): 7 usages
MockRemoteTask (io.trino.execution.MockRemoteTaskFactory.MockRemoteTask): 7 usages
PartitionedSplitsInfo (io.trino.execution.PartitionedSplitsInfo): 7 usages
PipelinedStageExecution.createPipelinedStageExecution (io.trino.execution.scheduler.PipelinedStageExecution.createPipelinedStageExecution): 7 usages
SourcePartitionedScheduler.newSourcePartitionedSchedulerAsStageScheduler (io.trino.execution.scheduler.SourcePartitionedScheduler.newSourcePartitionedSchedulerAsStageScheduler): 7 usages
Split (io.trino.metadata.Split): 7 usages
PlanFragment (io.trino.sql.planner.PlanFragment): 7 usages
ListenableFuture (com.google.common.util.concurrent.ListenableFuture): 5 usages
List (java.util.List): 5 usages
ImmutableList.toImmutableList (com.google.common.collect.ImmutableList.toImmutableList): 4 usages
ImmutableSet (com.google.common.collect.ImmutableSet): 4 usages
Duration (io.airlift.units.Duration): 4 usages
Lifespan (io.trino.execution.Lifespan): 4 usages
TaskId (io.trino.execution.TaskId): 4 usages
Optional (java.util.Optional): 4 usages
Preconditions.checkArgument (com.google.common.base.Preconditions.checkArgument): 3 usages