Use of io.trino.execution.RemoteTask in project trino by trinodb.
The class FaultTolerantStageScheduler, method schedule().
public synchronized void schedule() throws Exception {
    if (failure != null) {
        propagateIfPossible(failure, Exception.class);
        throw new RuntimeException(failure);
    }
    if (closed) {
        return;
    }
    if (isFinished()) {
        return;
    }
    if (!blocked.isDone()) {
        return;
    }
    if (taskSource == null) {
        Map<PlanFragmentId, ListenableFuture<List<ExchangeSourceHandle>>> sourceHandles = sourceExchanges.entrySet().stream()
                .collect(toImmutableMap(Map.Entry::getKey, entry -> toListenableFuture(entry.getValue().getSourceHandles())));
        List<ListenableFuture<List<ExchangeSourceHandle>>> blockedFutures = sourceHandles.values().stream()
                .filter(future -> !future.isDone())
                .collect(toImmutableList());
        if (!blockedFutures.isEmpty()) {
            blocked = asVoid(allAsList(blockedFutures));
            return;
        }
        Multimap<PlanFragmentId, ExchangeSourceHandle> exchangeSources = sourceHandles.entrySet().stream()
                .collect(flatteningToImmutableListMultimap(Map.Entry::getKey, entry -> getFutureValue(entry.getValue()).stream()));
        taskSource = taskSourceFactory.create(
                session,
                stage.getFragment(),
                sourceExchanges,
                exchangeSources,
                stage::recordGetSplitTime,
                sourceBucketToPartitionMap,
                sourceBucketNodeMap);
    }
    while (!queuedPartitions.isEmpty() || !taskSource.isFinished()) {
        while (queuedPartitions.isEmpty() && !taskSource.isFinished()) {
            List<TaskDescriptor> tasks = taskSource.getMoreTasks();
            for (TaskDescriptor task : tasks) {
                queuedPartitions.add(task.getPartitionId());
                allPartitions.add(task.getPartitionId());
                taskDescriptorStorage.put(stage.getStageId(), task);
                sinkExchange.ifPresent(exchange -> {
                    ExchangeSinkHandle exchangeSinkHandle = exchange.addSink(task.getPartitionId());
                    partitionToExchangeSinkHandleMap.put(task.getPartitionId(), exchangeSinkHandle);
                });
            }
            if (taskSource.isFinished()) {
                sinkExchange.ifPresent(Exchange::noMoreSinks);
            }
        }
        if (queuedPartitions.isEmpty()) {
            break;
        }
        int partition = queuedPartitions.peek();
        Optional<TaskDescriptor> taskDescriptorOptional = taskDescriptorStorage.get(stage.getStageId(), partition);
        if (taskDescriptorOptional.isEmpty()) {
            // query has been terminated
            return;
        }
        TaskDescriptor taskDescriptor = taskDescriptorOptional.get();
        MemoryRequirements memoryRequirements = partitionMemoryRequirements.computeIfAbsent(
                partition,
                ignored -> partitionMemoryEstimator.getInitialMemoryRequirements(session, taskDescriptor.getNodeRequirements().getMemory()));
        if (nodeLease == null) {
            NodeRequirements nodeRequirements = taskDescriptor.getNodeRequirements();
            nodeRequirements = nodeRequirements.withMemory(memoryRequirements.getRequiredMemory());
            nodeLease = nodeAllocator.acquire(nodeRequirements);
        }
        if (!nodeLease.getNode().isDone()) {
            blocked = asVoid(nodeLease.getNode());
            return;
        }
        NodeInfo node = getFutureValue(nodeLease.getNode());
        queuedPartitions.poll();
        Multimap<PlanNodeId, Split> tableScanSplits = taskDescriptor.getSplits();
        Multimap<PlanNodeId, Split> remoteSplits = createRemoteSplits(taskDescriptor.getExchangeSourceHandles());
        Multimap<PlanNodeId, Split> taskSplits = ImmutableListMultimap.<PlanNodeId, Split>builder()
                .putAll(tableScanSplits)
                .putAll(remoteSplits)
                .build();
        int attemptId = getNextAttemptIdForPartition(partition);
        OutputBuffers outputBuffers;
        Optional<ExchangeSinkInstanceHandle> exchangeSinkInstanceHandle;
        if (sinkExchange.isPresent()) {
            ExchangeSinkHandle sinkHandle = partitionToExchangeSinkHandleMap.get(partition);
            exchangeSinkInstanceHandle = Optional.of(sinkExchange.get().instantiateSink(sinkHandle, attemptId));
            outputBuffers = createSpoolingExchangeOutputBuffers(exchangeSinkInstanceHandle.get());
        } else {
            exchangeSinkInstanceHandle = Optional.empty();
            // stage will be consumed by the coordinator using direct exchange
            outputBuffers = createInitialEmptyOutputBuffers(PARTITIONED)
                    .withBuffer(new OutputBuffers.OutputBufferId(0), 0)
                    .withNoMoreBufferIds();
        }
        Set<PlanNodeId> allSourcePlanNodeIds = ImmutableSet.<PlanNodeId>builder()
                .addAll(stage.getFragment().getPartitionedSources())
                .addAll(stage.getFragment().getRemoteSourceNodes().stream()
                        .map(RemoteSourceNode::getId)
                        .iterator())
                .build();
        RemoteTask task = stage.createTask(
                node.getNode(),
                partition,
                attemptId,
                sinkBucketToPartitionMap,
                outputBuffers,
                taskSplits,
                allSourcePlanNodeIds.stream().collect(toImmutableListMultimap(Function.identity(), planNodeId -> Lifespan.taskWide())),
                allSourcePlanNodeIds)
                .orElseThrow(() -> new VerifyException("stage execution is expected to be active"));
        partitionToRemoteTaskMap.put(partition, task);
        runningTasks.put(task.getTaskId(), task);
        runningNodes.put(task.getTaskId(), nodeLease);
        nodeLease = null;
        if (taskFinishedFuture == null) {
            taskFinishedFuture = SettableFuture.create();
        }
        taskLifecycleListener.taskCreated(stage.getFragment().getId(), task);
        task.addStateChangeListener(taskStatus -> updateTaskStatus(taskStatus, exchangeSinkInstanceHandle));
        task.start();
    }
    if (taskFinishedFuture != null && !taskFinishedFuture.isDone()) {
        blocked = taskFinishedFuture;
    }
}
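
The method returns early whenever something it needs is not ready, storing the blocking future in the blocked field so a later call can resume where it left off. Below is a minimal, standalone sketch of the first such step (gathering the unresolved source-handle futures and blocking on all of them) written against plain Guava futures; SourceHandle and the class name are illustrative stand-ins, not Trino types.

import static com.google.common.collect.ImmutableList.toImmutableList;

import java.util.List;

import com.google.common.util.concurrent.Futures;
import com.google.common.util.concurrent.ListenableFuture;

// Sketch of the "collect unfinished source-handle futures and block on all of them"
// step; SourceHandle is a hypothetical stand-in type.
public class PendingSourceHandles {
    public record SourceHandle(String id) {}

    public static ListenableFuture<?> blockUntilResolved(List<ListenableFuture<List<SourceHandle>>> handleFutures) {
        List<ListenableFuture<List<SourceHandle>>> pending = handleFutures.stream()
                .filter(future -> !future.isDone())
                .collect(toImmutableList());
        if (pending.isEmpty()) {
            // everything is already resolved; the caller can proceed immediately
            return Futures.immediateVoidFuture();
        }
        // mirrors blocked = asVoid(allAsList(blockedFutures)): the caller stores this
        // future, returns early, and re-invokes schedule() once it completes
        return Futures.allAsList(pending);
    }
}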
Use of io.trino.execution.RemoteTask in project trino by trinodb.
The class FaultTolerantStageScheduler, method updateTaskStatus().
private void updateTaskStatus(TaskStatus taskStatus, Optional<ExchangeSinkInstanceHandle> exchangeSinkInstanceHandle) {
    TaskState state = taskStatus.getState();
    if (!state.isDone()) {
        return;
    }
    try {
        RuntimeException failure = null;
        SettableFuture<Void> future;
        synchronized (this) {
            TaskId taskId = taskStatus.getTaskId();
            runningTasks.remove(taskId);
            future = taskFinishedFuture;
            if (!runningTasks.isEmpty()) {
                taskFinishedFuture = SettableFuture.create();
            } else {
                taskFinishedFuture = null;
            }
            NodeAllocator.NodeLease nodeLease = requireNonNull(runningNodes.remove(taskId), () -> "node not found for task id: " + taskId);
            nodeLease.release();
            int partitionId = taskId.getPartitionId();
            if (!finishedPartitions.contains(partitionId) && !closed) {
                switch (state) {
                    case FINISHED:
                        finishedPartitions.add(partitionId);
                        if (sinkExchange.isPresent()) {
                            checkArgument(exchangeSinkInstanceHandle.isPresent(), "exchangeSinkInstanceHandle is expected to be present");
                            sinkExchange.get().sinkFinished(exchangeSinkInstanceHandle.get());
                        }
                        partitionToRemoteTaskMap.get(partitionId).forEach(RemoteTask::abort);
                        break;
                    case CANCELED:
                        log.debug("Task cancelled: %s", taskId);
                        break;
                    case ABORTED:
                        log.debug("Task aborted: %s", taskId);
                        break;
                    case FAILED:
                        ExecutionFailureInfo failureInfo = taskStatus.getFailures().stream()
                                .findFirst()
                                .map(this::rewriteTransportFailure)
                                .orElse(toFailure(new TrinoException(GENERIC_INTERNAL_ERROR, "A task failed for an unknown reason")));
                        log.warn(failureInfo.toException(), "Task failed: %s", taskId);
                        ErrorCode errorCode = failureInfo.getErrorCode();
                        int taskRemainingAttempts = remainingAttemptsPerTask.getOrDefault(partitionId, maxRetryAttemptsPerTask);
                        if (remainingRetryAttemptsOverall > 0 && taskRemainingAttempts > 0 && (errorCode == null || errorCode.getType() != USER_ERROR)) {
                            remainingRetryAttemptsOverall--;
                            remainingAttemptsPerTask.put(partitionId, taskRemainingAttempts - 1);
                            // update memory limits for next attempt
                            MemoryRequirements memoryLimits = partitionMemoryRequirements.get(partitionId);
                            verify(memoryLimits != null);
                            MemoryRequirements newMemoryLimits = partitionMemoryEstimator.getNextRetryMemoryRequirements(session, memoryLimits, errorCode);
                            partitionMemoryRequirements.put(partitionId, newMemoryLimits);
                            // reschedule
                            queuedPartitions.add(partitionId);
                            log.debug("Retrying partition %s for stage %s", partitionId, stage.getStageId());
                        } else {
                            failure = failureInfo.toException();
                        }
                        break;
                    default:
                        throw new IllegalArgumentException("Unexpected task state: " + state);
                }
            }
        }
        if (failure != null) {
            // must be called outside the lock
            fail(failure);
        }
        if (future != null && !future.isDone()) {
            future.set(null);
        }
    } catch (Throwable t) {
        fail(t);
    }
}
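
The FAILED branch carries the fault-tolerance logic: it checks an overall retry budget and a per-partition budget, never retries user errors, raises the memory estimate for the next attempt, and requeues the partition. The following is a minimal, self-contained sketch of just that budget bookkeeping; all names are illustrative and none of the types are Trino's.

import java.util.ArrayDeque;
import java.util.HashMap;
import java.util.Map;
import java.util.Queue;

// Sketch of the retry bookkeeping from the FAILED branch: one overall budget,
// one per-partition budget, and a requeue when a retry is allowed.
public class RetryBudget {
    private final int maxRetryAttemptsPerTask;
    private int remainingRetryAttemptsOverall;
    private final Map<Integer, Integer> remainingAttemptsPerTask = new HashMap<>();
    private final Queue<Integer> queuedPartitions = new ArrayDeque<>();

    public RetryBudget(int maxRetryAttemptsPerTask, int maxRetryAttemptsOverall) {
        this.maxRetryAttemptsPerTask = maxRetryAttemptsPerTask;
        this.remainingRetryAttemptsOverall = maxRetryAttemptsOverall;
    }

    // returns true if the failed partition was requeued for another attempt;
    // user errors are never retried, matching the errorCode check above
    public synchronized boolean tryRequeue(int partitionId, boolean userError) {
        int taskRemainingAttempts = remainingAttemptsPerTask.getOrDefault(partitionId, maxRetryAttemptsPerTask);
        if (remainingRetryAttemptsOverall > 0 && taskRemainingAttempts > 0 && !userError) {
            remainingRetryAttemptsOverall--;
            remainingAttemptsPerTask.put(partitionId, taskRemainingAttempts - 1);
            queuedPartitions.add(partitionId);
            return true;
        }
        return false;
    }
}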
Use of io.trino.execution.RemoteTask in project trino by trinodb.
The class FixedSourcePartitionedScheduler, method schedule().
@Override
public ScheduleResult schedule() {
    // schedule a task on every node in the distribution
    List<RemoteTask> newTasks = ImmutableList.of();
    if (scheduledTasks.isEmpty()) {
        ImmutableList.Builder<RemoteTask> newTasksBuilder = ImmutableList.builder();
        for (InternalNode node : nodes) {
            Optional<RemoteTask> task = stageExecution.scheduleTask(node, partitionIdAllocator.getNextId(), ImmutableMultimap.of(), ImmutableMultimap.of());
            if (task.isPresent()) {
                scheduledTasks.put(node, task.get());
                newTasksBuilder.add(task.get());
            }
        }
        newTasks = newTasksBuilder.build();
    }
    boolean allBlocked = true;
    List<ListenableFuture<Void>> blocked = new ArrayList<>();
    BlockedReason blockedReason = BlockedReason.NO_ACTIVE_DRIVER_GROUP;
    if (groupedLifespanScheduler.isPresent()) {
        // Start new driver groups on the first scheduler if necessary,
        // i.e. when previous ones have finished execution (not finished scheduling).
        //
        // Invoke schedule method to get a new SettableFuture every time.
        // Reusing previously returned SettableFuture could lead to the ListenableFuture retaining too many listeners.
        blocked.add(groupedLifespanScheduler.get().schedule(sourceSchedulers.get(0)));
    }
    int splitsScheduled = 0;
    Iterator<SourceScheduler> schedulerIterator = sourceSchedulers.iterator();
    List<Lifespan> driverGroupsToStart = ImmutableList.of();
    boolean shouldInvokeNoMoreDriverGroups = false;
    while (schedulerIterator.hasNext()) {
        SourceScheduler sourceScheduler = schedulerIterator.next();
        for (Lifespan lifespan : driverGroupsToStart) {
            sourceScheduler.startLifespan(lifespan, partitionHandleFor(lifespan));
        }
        if (shouldInvokeNoMoreDriverGroups) {
            sourceScheduler.noMoreLifespans();
        }
        ScheduleResult schedule = sourceScheduler.schedule();
        splitsScheduled += schedule.getSplitsScheduled();
        if (schedule.getBlockedReason().isPresent()) {
            blocked.add(schedule.getBlocked());
            blockedReason = blockedReason.combineWith(schedule.getBlockedReason().get());
        } else {
            verify(schedule.getBlocked().isDone(), "blockedReason not provided when scheduler is blocked");
            allBlocked = false;
        }
        driverGroupsToStart = sourceScheduler.drainCompletedLifespans();
        if (schedule.isFinished()) {
            stageExecution.schedulingComplete(sourceScheduler.getPlanNodeId());
            schedulerIterator.remove();
            sourceScheduler.close();
            shouldInvokeNoMoreDriverGroups = true;
        } else {
            shouldInvokeNoMoreDriverGroups = false;
        }
    }
    if (allBlocked) {
        return new ScheduleResult(sourceSchedulers.isEmpty(), newTasks, whenAnyComplete(blocked), blockedReason, splitsScheduled);
    } else {
        return new ScheduleResult(sourceSchedulers.isEmpty(), newTasks, splitsScheduled);
    }
}
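
The while loop over sourceSchedulers follows a common shape: poll each sub-scheduler, accumulate its result, collect the futures of the blocked ones, and retire finished schedulers through the iterator so later passes only visit live ones. Here is a self-contained sketch of that shape; SubScheduler and its Result are hypothetical placeholders, not the Trino interfaces.

import java.util.Iterator;
import java.util.List;
import java.util.concurrent.CompletableFuture;

// Sketch of the poll-aggregate-retire loop; the schedulers list must be mutable
// because finished schedulers are removed through the iterator.
public class SubSchedulerLoop {
    public interface SubScheduler {
        Result schedule();

        void close();

        record Result(int splitsScheduled, boolean finished, CompletableFuture<Void> blocked) {}
    }

    public static int scheduleAll(List<SubScheduler> schedulers, List<CompletableFuture<Void>> blocked) {
        int splitsScheduled = 0;
        Iterator<SubScheduler> iterator = schedulers.iterator();
        while (iterator.hasNext()) {
            SubScheduler scheduler = iterator.next();
            SubScheduler.Result result = scheduler.schedule();
            splitsScheduled += result.splitsScheduled();
            if (!result.blocked().isDone()) {
                blocked.add(result.blocked());
            }
            if (result.finished()) {
                // remove via the iterator so the underlying list stays consistent
                iterator.remove();
                scheduler.close();
            }
        }
        return splitsScheduled;
    }
}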
Use of io.trino.execution.RemoteTask in project trino by trinodb.
The class PipelinedStageExecution, method failTaskRemotely().
@Override
public synchronized void failTaskRemotely(TaskId taskId, Throwable failureCause) {
    RemoteTask task = requireNonNull(tasks.get(taskId.getPartitionId()), () -> "task not found: " + taskId);
    task.failRemotely(failureCause);
    // not failing stage just yet; it will happen as a result of task failure
}
Use of io.trino.execution.RemoteTask in project trino by trinodb.
The class PipelinedStageExecution, method updateSourceTasksOutputBuffers().
private synchronized void updateSourceTasksOutputBuffers(Consumer<OutputBufferManager> updater) {
    for (PlanFragmentId sourceFragment : exchangeSources.keySet()) {
        OutputBufferManager outputBufferManager = outputBufferManagers.get(sourceFragment);
        updater.accept(outputBufferManager);
        for (RemoteTask sourceTask : sourceTasks.get(sourceFragment)) {
            sourceTask.setOutputBuffers(outputBufferManager.getOutputBuffers());
        }
    }
}
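
The updater Consumer is applied once to each source fragment's buffer manager, and the manager's refreshed buffer state is then pushed to every task of that fragment. A standalone sketch of the same fan-out with placeholder types (BufferManager and SourceTask are not Trino classes, and the buffer state is modeled as a plain String):

import java.util.List;
import java.util.Map;
import java.util.function.Consumer;

// Sketch of the per-fragment update fan-out: mutate the manager via the updater,
// then propagate its current state to all tasks of that fragment.
public class OutputBufferFanOut {
    public interface BufferManager {
        String getOutputBuffers();
    }

    public interface SourceTask {
        void setOutputBuffers(String outputBuffers);
    }

    public static void updateAll(
            Map<String, BufferManager> managersByFragment,
            Map<String, List<SourceTask>> tasksByFragment,
            Consumer<BufferManager> updater) {
        for (Map.Entry<String, BufferManager> entry : managersByFragment.entrySet()) {
            BufferManager manager = entry.getValue();
            updater.accept(manager);
            for (SourceTask task : tasksByFragment.getOrDefault(entry.getKey(), List.of())) {
                task.setOutputBuffers(manager.getOutputBuffers());
            }
        }
    }
}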