Use of com.facebook.presto.execution.StageId in project presto by prestodb.
In class PrestoSparkQueryExecutionFactory, method createStageInfo:
/**
 * Recursively builds the {@link StageInfo} tree for {@code plan} and all of its children.
 *
 * <p>The stage is always reported as {@code FINISHED}, with a placeholder self URI and
 * stage execution id 0. Memory figures are aggregated from the tasks registered for the
 * plan's fragment in {@code taskInfoMap}.
 *
 * @param queryId query the stages belong to
 * @param plan sub plan whose fragment becomes this stage; its children become sub stages
 * @param taskInfoMap task infos keyed by the fragment they ran
 * @return stage info for {@code plan}, with one nested stage info per child plan
 */
private static StageInfo createStageInfo(QueryId queryId, SubPlan plan, ListMultimap<PlanFragmentId, TaskInfo> taskInfoMap) {
    PlanFragmentId fragmentId = plan.getFragment().getId();
    StageId stageId = new StageId(queryId, fragmentId.getId());
    List<TaskInfo> stageTasks = taskInfoMap.get(fragmentId);

    // User memory is summed over the tasks; node total memory is the largest value seen, never below zero.
    long userMemoryBytes = stageTasks.stream()
            .mapToLong(task -> task.getStats().getUserMemoryReservationInBytes())
            .sum();
    long nodeTotalMemoryBytes = stageTasks.stream()
            .mapToLong(task -> task.getStats().getPeakNodeTotalMemoryInBytes())
            .reduce(0L, Math::max);

    StageExecutionInfo executionInfo = StageExecutionInfo.create(
            // TODO: figure out a way to know what exactly stage has caused a failure
            new StageExecutionId(stageId, 0),
            StageExecutionState.FINISHED,
            Optional.empty(),
            stageTasks,
            DateTime.now(),
            new Distribution().snapshot(),
            succinctBytes(userMemoryBytes),
            succinctBytes(nodeTotalMemoryBytes),
            1,
            1);

    // Children are converted depth-first, in the order the plan lists them.
    ImmutableList.Builder<StageInfo> childStages = ImmutableList.builder();
    for (SubPlan child : plan.getChildren()) {
        childStages.add(createStageInfo(queryId, child, taskInfoMap));
    }

    return new StageInfo(
            stageId,
            URI.create("http://fake.invalid/stage/" + stageId),
            Optional.of(plan.getFragment()),
            executionInfo,
            ImmutableList.of(),
            childStages.build(),
            false);
}
Use of com.facebook.presto.execution.StageId in project presto by prestodb.
In class PrestoSparkTaskExecutorFactory, method doCreate:
/**
 * Creates and starts the executor for one Presto-on-Spark task.
 *
 * <p>The method decodes the task descriptor, rebuilds the session, sets up the memory
 * accounting objects (memory pool, spill tracker, query and task contexts), wires the
 * shuffle / broadcast / in-memory inputs of every remote source of the fragment, plans the
 * local execution and starts it.
 *
 * <p>When spill is enabled a listener is attached to the memory pool. The listener records
 * the peak total reservation, schedules a memory revocation once the reservation crosses the
 * revoking threshold, and fails the task with an exceeded-memory-limit error when the
 * reservation is above the total limit while no revocation is in progress or pending.
 *
 * @param partitionId partition handled by this task; used as the task id within the stage
 * @param attemptNumber attempt number; not used by this method (see the TODO on the task id)
 * @param serializedTaskDescriptor JSON-encoded task descriptor
 * @param serializedTaskSources serialized task sources (splits) for this task
 * @param inputs shuffle, broadcast and in-memory inputs keyed by source fragment id
 * @param taskInfoCollector accumulator handed to the returned executor
 * @param shuffleStatsCollector accumulator for shuffle statistics
 * @param outputType class describing the kind of output the task produces
 * @return the started task executor
 * @throws IllegalArgumentException if the memory revoking target is greater than the revoking
 *         threshold, if a source fragment has no input, or if a single remote source is given
 *         more than one kind of input
 */
public <T extends PrestoSparkTaskOutput> IPrestoSparkTaskExecutor<T> doCreate(int partitionId, int attemptNumber, SerializedPrestoSparkTaskDescriptor serializedTaskDescriptor, Iterator<SerializedPrestoSparkTaskSource> serializedTaskSources, PrestoSparkTaskInputs inputs, CollectionAccumulator<SerializedTaskInfo> taskInfoCollector, CollectionAccumulator<PrestoSparkShuffleStats> shuffleStatsCollector, Class<T> outputType) {
    PrestoSparkTaskDescriptor taskDescriptor = taskDescriptorJsonCodec.fromJson(serializedTaskDescriptor.getBytes());
    ImmutableMap.Builder<String, TokenAuthenticator> extraAuthenticators = ImmutableMap.builder();
    authenticatorProviders.forEach(provider -> extraAuthenticators.putAll(provider.getTokenAuthenticators()));
    Session session = taskDescriptor.getSession().toSession(sessionPropertyManager, taskDescriptor.getExtraCredentials(), extraAuthenticators.build());
    PlanFragment fragment = taskDescriptor.getFragment();
    StageId stageId = new StageId(session.getQueryId(), fragment.getId().getId());
    // Clear the cache if the cache does not have broadcast table for current stageId.
    // We will only cache 1 HT at any time. If the stageId changes, we will drop the old cached HT
    prestoSparkBroadcastTableCacheManager.removeCachedTablesForStagesOtherThan(stageId);
    // TODO: include attemptId in taskId
    TaskId taskId = new TaskId(new StageExecutionId(stageId, 0), partitionId);
    List<TaskSource> taskSources = getTaskSources(serializedTaskSources);
    log.info("Task [%s] received %d splits.", taskId, taskSources.stream().mapToInt(taskSource -> taskSource.getSplits().size()).sum());
    OptionalLong totalSplitSize = computeAllSplitsSize(taskSources);
    if (totalSplitSize.isPresent()) {
        log.info("Total split size: %s bytes.", totalSplitSize.getAsLong());
    }
    // TODO: Remove this once we can display the plan on Spark UI.
    log.info(PlanPrinter.textPlanFragment(fragment, functionAndTypeManager, session, true));

    // Each limit is the smaller of the node-level configuration and the session-level value.
    DataSize maxUserMemory = new DataSize(min(nodeMemoryConfig.getMaxQueryMemoryPerNode().toBytes(), getQueryMaxMemoryPerNode(session).toBytes()), BYTE);
    DataSize maxTotalMemory = new DataSize(min(nodeMemoryConfig.getMaxQueryTotalMemoryPerNode().toBytes(), getQueryMaxTotalMemoryPerNode(session).toBytes()), BYTE);
    // A non-null session override takes precedence over the computed broadcast memory limit.
    DataSize maxBroadcastMemory = getSparkBroadcastJoinMaxMemoryOverride(session);
    if (maxBroadcastMemory == null) {
        maxBroadcastMemory = new DataSize(min(nodeMemoryConfig.getMaxQueryBroadcastMemory().toBytes(), getQueryMaxBroadcastMemory(session).toBytes()), BYTE);
    }

    MemoryPool memoryPool = new MemoryPool(new MemoryPoolId("spark-executor-memory-pool"), maxTotalMemory);
    SpillSpaceTracker spillSpaceTracker = new SpillSpaceTracker(maxQuerySpillPerNode);
    QueryContext queryContext = new QueryContext(session.getQueryId(), maxUserMemory, maxTotalMemory, maxBroadcastMemory, maxRevocableMemory, memoryPool, new TestingGcMonitor(), notificationExecutor, yieldExecutor, maxQuerySpillPerNode, spillSpaceTracker, memoryReservationSummaryJsonCodec);
    queryContext.setVerboseExceededMemoryLimitErrorsEnabled(isVerboseExceededMemoryLimitErrorsEnabled(session));
    queryContext.setHeapDumpOnExceededMemoryLimitEnabled(isHeapDumpOnExceededMemoryLimitEnabled(session));
    String heapDumpFilePath = Paths.get(getHeapDumpFileDirectory(session), format("%s_%s.hprof", session.getQueryId().getId(), stageId.getId())).toString();
    queryContext.setHeapDumpFilePath(heapDumpFilePath);
    TaskStateMachine taskStateMachine = new TaskStateMachine(taskId, notificationExecutor);
    TaskContext taskContext = queryContext.addTaskContext(
            taskStateMachine,
            session,
            // Plan has to be retained only if verbose memory exceeded errors are requested
            isVerboseExceededMemoryLimitErrorsEnabled(session) ? Optional.of(fragment.getRoot()) : Optional.empty(),
            perOperatorCpuTimerEnabled,
            cpuTimerEnabled,
            perOperatorAllocationTrackingEnabled,
            allocationTrackingEnabled,
            false);

    final double memoryRevokingThreshold = getMemoryRevokingThreshold(session);
    final double memoryRevokingTarget = getMemoryRevokingTarget(session);
    checkArgument(memoryRevokingTarget <= memoryRevokingThreshold, "memoryRevokingTarget should be less than or equal memoryRevokingThreshold, but got %s and %s respectively", memoryRevokingTarget, memoryRevokingThreshold);
    if (isSpillEnabled(session)) {
        memoryPool.addListener((pool, queryId, totalMemoryReservationBytes) -> {
            if (totalMemoryReservationBytes > queryContext.getPeakNodeTotalMemory()) {
                queryContext.setPeakNodeTotalMemory(totalMemoryReservationBytes);
            }
            // compareAndSet guarantees that at most one revocation request is scheduled at a time.
            if (totalMemoryReservationBytes > pool.getMaxBytes() * memoryRevokingThreshold && memoryRevokeRequestInProgress.compareAndSet(false, true)) {
                memoryRevocationExecutor.execute(() -> {
                    try {
                        // Revoke enough to bring the reservation down to the revoking target,
                        // minus whatever is already in the process of being revoked.
                        AtomicLong remainingBytesToRevoke = new AtomicLong(totalMemoryReservationBytes - (long) (memoryRevokingTarget * pool.getMaxBytes()));
                        remainingBytesToRevoke.addAndGet(-MemoryRevokingSchedulerUtils.getMemoryAlreadyBeingRevoked(ImmutableList.of(taskContext), remainingBytesToRevoke.get()));
                        taskContext.accept(new VoidTraversingQueryContextVisitor<AtomicLong>() {
                            @Override
                            public Void visitOperatorContext(OperatorContext operatorContext, AtomicLong remainingBytesToRevoke) {
                                if (remainingBytesToRevoke.get() > 0) {
                                    long revokedBytes = operatorContext.requestMemoryRevoking();
                                    if (revokedBytes > 0) {
                                        memoryRevokePending.set(true);
                                        remainingBytesToRevoke.addAndGet(-revokedBytes);
                                    }
                                }
                                return null;
                            }
                        }, remainingBytesToRevoke);
                    } catch (Exception e) {
                        log.error(e, "Error requesting memory revoking");
                    } finally {
                        // Reset the flag on every exit path. If it stayed set after a failure, no further
                        // revocation could ever be scheduled and the memory limit check below would be
                        // suppressed for the rest of the task.
                        memoryRevokeRequestInProgress.set(false);
                    }
                });
            }
            // Get the latest memory reservation info since it might have changed due to revoke
            long totalReservedMemory = pool.getQueryMemoryReservation(queryId) + pool.getQueryRevocableMemoryReservation(queryId);
            // If total memory usage is over maxTotalMemory and memory revoke request is not pending, fail the query with EXCEEDED_MEMORY_LIMIT error
            if (totalReservedMemory > maxTotalMemory.toBytes() && !memoryRevokeRequestInProgress.get() && !isMemoryRevokePending(taskContext)) {
                throw exceededLocalTotalMemoryLimit(maxTotalMemory, queryContext.getAdditionalFailureInfo(totalReservedMemory, 0) + format("Total reserved memory: %s, Total revocable memory: %s", succinctBytes(pool.getQueryMemoryReservation(queryId)), succinctBytes(pool.getQueryRevocableMemoryReservation(queryId))), isHeapDumpOnExceededMemoryLimitEnabled(session), Optional.ofNullable(heapDumpFilePath));
            }
        });
    }

    // Group the inputs of every remote source by kind. A source fragment must provide exactly one of
    // shuffle, broadcast or in-memory input.
    ImmutableMap.Builder<PlanNodeId, List<PrestoSparkShuffleInput>> shuffleInputs = ImmutableMap.builder();
    ImmutableMap.Builder<PlanNodeId, List<java.util.Iterator<PrestoSparkSerializedPage>>> pageInputs = ImmutableMap.builder();
    ImmutableMap.Builder<PlanNodeId, List<?>> broadcastInputs = ImmutableMap.builder();
    for (RemoteSourceNode remoteSource : fragment.getRemoteSourceNodes()) {
        List<PrestoSparkShuffleInput> remoteSourceRowInputs = new ArrayList<>();
        List<java.util.Iterator<PrestoSparkSerializedPage>> remoteSourcePageInputs = new ArrayList<>();
        List<List<?>> broadcastInputsList = new ArrayList<>();
        for (PlanFragmentId sourceFragmentId : remoteSource.getSourceFragmentIds()) {
            Iterator<Tuple2<MutablePartitionId, PrestoSparkMutableRow>> shuffleInput = inputs.getShuffleInputs().get(sourceFragmentId.toString());
            Broadcast<?> broadcastInput = inputs.getBroadcastInputs().get(sourceFragmentId.toString());
            List<PrestoSparkSerializedPage> inMemoryInput = inputs.getInMemoryInputs().get(sourceFragmentId.toString());
            if (shuffleInput != null) {
                checkArgument(broadcastInput == null, "single remote source is not expected to accept different kind of inputs");
                checkArgument(inMemoryInput == null, "single remote source is not expected to accept different kind of inputs");
                remoteSourceRowInputs.add(new PrestoSparkShuffleInput(sourceFragmentId.getId(), shuffleInput));
                continue;
            }
            if (broadcastInput != null) {
                checkArgument(inMemoryInput == null, "single remote source is not expected to accept different kind of inputs");
                // TODO: Enable NullifyingIterator once migrated to one task per JVM model
                // NullifyingIterator removes element from the list upon return
                // This allows GC to gradually reclaim memory
                // remoteSourcePageInputs.add(getNullifyingIterator(broadcastInput.value()));
                broadcastInputsList.add((List<?>) broadcastInput.value());
                continue;
            }
            if (inMemoryInput != null) {
                // for inmemory inputs pages can be released incrementally to save memory
                remoteSourcePageInputs.add(getNullifyingIterator(inMemoryInput));
                continue;
            }
            throw new IllegalArgumentException("Input not found for sourceFragmentId: " + sourceFragmentId);
        }
        if (!remoteSourceRowInputs.isEmpty()) {
            shuffleInputs.put(remoteSource.getId(), remoteSourceRowInputs);
        }
        if (!remoteSourcePageInputs.isEmpty()) {
            pageInputs.put(remoteSource.getId(), remoteSourcePageInputs);
        }
        if (!broadcastInputsList.isEmpty()) {
            broadcastInputs.put(remoteSource.getId(), broadcastInputsList);
        }
    }

    OutputBufferMemoryManager memoryManager = new OutputBufferMemoryManager(sinkMaxBufferSize.toBytes(), () -> queryContext.getTaskContextByTaskId(taskId).localSystemMemoryContext(), notificationExecutor);
    Optional<OutputPartitioning> preDeterminedPartition = Optional.empty();
    if (fragment.getPartitioningScheme().getPartitioning().getHandle().equals(FIXED_ARBITRARY_DISTRIBUTION)) {
        // For FIXED_ARBITRARY_DISTRIBUTION the output partition is derived from the task's own partition id.
        int partitionCount = getHashPartitionCount(session);
        preDeterminedPartition = Optional.of(new OutputPartitioning(new PreDeterminedPartitionFunction(partitionId % partitionCount, partitionCount), ImmutableList.of(), ImmutableList.of(), false, OptionalInt.empty()));
    }
    TempDataOperationContext tempDataOperationContext = new TempDataOperationContext(session.getSource(), session.getQueryId().getId(), session.getClientInfo(), Optional.of(session.getClientTags()), session.getIdentity());
    TempStorage tempStorage = tempStorageManager.getTempStorage(storageBasedBroadcastJoinStorage);
    Output<T> output = configureOutput(outputType, blockEncodingManager, memoryManager, getShuffleOutputTargetAverageRowSize(session), preDeterminedPartition, tempStorage, tempDataOperationContext, getStorageBasedBroadcastJoinWriteBufferSize(session));
    PrestoSparkOutputBuffer<?> outputBuffer = output.getOutputBuffer();
    LocalExecutionPlan localExecutionPlan = localExecutionPlanner.plan(
            taskContext,
            fragment.getRoot(),
            fragment.getPartitioningScheme(),
            fragment.getStageExecutionDescriptor(),
            fragment.getTableScanSchedulingOrder(),
            output.getOutputFactory(),
            new PrestoSparkRemoteSourceFactory(blockEncodingManager, shuffleInputs.build(), pageInputs.build(), broadcastInputs.build(), partitionId, shuffleStatsCollector, tempStorage, tempDataOperationContext, prestoSparkBroadcastTableCacheManager, stageId),
            taskDescriptor.getTableWriteInfo(),
            true);
    // Once the task reaches a done state no further rows will be produced; signal that to the output buffer.
    taskStateMachine.addStateChangeListener(state -> {
        if (state.isDone()) {
            outputBuffer.setNoMoreRows();
        }
    });
    PrestoSparkTaskExecution taskExecution = new PrestoSparkTaskExecution(taskStateMachine, taskContext, localExecutionPlan, taskExecutor, splitMonitor, notificationExecutor, memoryUpdateExecutor);
    taskExecution.start(taskSources);
    return new PrestoSparkTaskExecutor<>(taskContext, taskStateMachine, output.getOutputSupplier(), taskInfoCodec, taskInfoCollector, shuffleStatsCollector, executionExceptionFactory, output.getOutputBufferType(), outputBuffer, tempStorage, tempDataOperationContext);
}
Use of com.facebook.presto.execution.StageId in project presto by prestodb.
In class LegacySqlQueryScheduler, method schedule:
/**
 * Runs the scheduling loop for the query.
 *
 * <p>Only one invocation does work at a time: the {@code scheduling} flag is claimed with
 * {@code compareAndSet} and any caller that fails to claim it returns immediately.
 *
 * <p>The outer loop pulls sections that are ready for execution and creates an execution
 * schedule for each of them; the inner loop repeatedly asks every stage scheduler for more
 * work until one of the schedules finishes or a stage completes. The loop ends when there
 * are neither ready sections nor unfinished schedules, or when the thread is interrupted.
 *
 * <p>Any throwable raised while scheduling transitions the query to failed and is rethrown.
 * All stage schedulers touched by this invocation are closed in the {@code finally} block;
 * failures while closing also fail the query and are rethrown as suppressed exceptions of
 * a {@link RuntimeException}.
 */
private void schedule() {
    if (!scheduling.compareAndSet(false, true)) {
        // still scheduling the previous batch of stages
        return;
    }
    List<StageExecutionAndScheduler> scheduledStageExecutions = new ArrayList<>();
    try (SetThreadName ignored = new SetThreadName("Query-%s", queryStateMachine.getQueryId())) {
        Set<StageId> completedStages = new HashSet<>();
        List<ExecutionSchedule> sectionExecutionSchedules = new LinkedList<>();
        while (!Thread.currentThread().isInterrupted()) {
            // remove finished section
            sectionExecutionSchedules.removeIf(ExecutionSchedule::isFinished);
            // try to pull more section that are ready to be run
            List<StreamingPlanSection> sectionsReadyForExecution = getSectionsReadyForExecution();
            // all finished
            if (sectionsReadyForExecution.isEmpty() && sectionExecutionSchedules.isEmpty()) {
                break;
            }
            List<List<StageExecutionAndScheduler>> sectionStageExecutions = getStageExecutions(sectionsReadyForExecution);
            // remember every stage execution so its scheduler can be closed in the finally block
            sectionStageExecutions.forEach(scheduledStageExecutions::addAll);
            sectionStageExecutions.stream().map(executionInfos -> executionInfos.stream().collect(toImmutableList())).map(executionPolicy::createExecutionSchedule).forEach(sectionExecutionSchedules::add);
            while (sectionExecutionSchedules.stream().noneMatch(ExecutionSchedule::isFinished)) {
                List<ListenableFuture<?>> blockedStages = new ArrayList<>();
                List<StageExecutionAndScheduler> executionsToSchedule = sectionExecutionSchedules.stream().flatMap(schedule -> schedule.getStagesToSchedule().stream()).collect(toImmutableList());
                for (StageExecutionAndScheduler stageExecutionAndScheduler : executionsToSchedule) {
                    SqlStageExecution stageExecution = stageExecutionAndScheduler.getStageExecution();
                    stageExecution.beginScheduling();
                    // perform some scheduling work
                    ScheduleResult result = stageExecutionAndScheduler.getStageScheduler().schedule();
                    // Track leaf tasks if partial results are enabled
                    if (isPartialResultsEnabled(session) && stageExecutionAndScheduler.getStageExecution().getFragment().isLeaf()) {
                        for (RemoteTask task : result.getNewTasks()) {
                            partialResultQueryTaskTracker.trackTask(task);
                            task.addFinalTaskInfoListener(partialResultQueryTaskTracker::recordTaskFinish);
                        }
                    }
                    // modify parent and children based on the results of the scheduling
                    if (result.isFinished()) {
                        stageExecution.schedulingComplete();
                    } else if (!result.getBlocked().isDone()) {
                        blockedStages.add(result.getBlocked());
                    }
                    stageExecutionAndScheduler.getStageLinkage().processScheduleResults(stageExecution.getState(), result.getNewTasks());
                    schedulerStats.getSplitsScheduledPerIteration().add(result.getSplitsScheduled());
                    // record why the stage could not make progress, if it reported a reason
                    if (result.getBlockedReason().isPresent()) {
                        switch (result.getBlockedReason().get()) {
                            case WRITER_SCALING:
                                // no-op
                                break;
                            case WAITING_FOR_SOURCE:
                                schedulerStats.getWaitingForSource().update(1);
                                break;
                            case SPLIT_QUEUES_FULL:
                                schedulerStats.getSplitQueuesFull().update(1);
                                break;
                            case MIXED_SPLIT_QUEUES_FULL_AND_WAITING_FOR_SOURCE:
                                schedulerStats.getMixedSplitQueuesFullAndWaitingForSource().update(1);
                                break;
                            case NO_ACTIVE_DRIVER_GROUP:
                                schedulerStats.getNoActiveDriverGroup().update(1);
                                break;
                            default:
                                throw new UnsupportedOperationException("Unknown blocked reason: " + result.getBlockedReason().get());
                        }
                    }
                }
                // make sure to update stage linkage at least once per loop to catch async state changes (e.g., partial cancel)
                boolean stageFinishedExecution = false;
                for (StageExecutionAndScheduler stageExecutionInfo : scheduledStageExecutions) {
                    SqlStageExecution stageExecution = stageExecutionInfo.getStageExecution();
                    StageId stageId = stageExecution.getStageExecutionId().getStageId();
                    // completedStages ensures the linkage is notified of a finished stage only once
                    if (!completedStages.contains(stageId) && stageExecution.getState().isDone()) {
                        stageExecutionInfo.getStageLinkage().processScheduleResults(stageExecution.getState(), ImmutableSet.of());
                        completedStages.add(stageId);
                        stageFinishedExecution = true;
                    }
                }
                // if any stage has just finished execution try to pull more sections for scheduling
                if (stageFinishedExecution) {
                    break;
                }
                // wait for a state change and then schedule again
                if (!blockedStages.isEmpty()) {
                    try (TimeStat.BlockTimer timer = schedulerStats.getSleepTime().time()) {
                        tryGetFutureValue(whenAnyComplete(blockedStages), 1, SECONDS);
                    }
                    for (ListenableFuture<?> blockedStage : blockedStages) {
                        blockedStage.cancel(true);
                    }
                }
            }
        }
        // after the loop every stage execution must be scheduled, running or done
        for (StageExecutionAndScheduler stageExecutionInfo : scheduledStageExecutions) {
            StageExecutionState state = stageExecutionInfo.getStageExecution().getState();
            if (state != SCHEDULED && state != RUNNING && !state.isDone()) {
                throw new PrestoException(GENERIC_INTERNAL_ERROR, format("Scheduling is complete, but stage execution %s is in state %s", stageExecutionInfo.getStageExecution().getStageExecutionId(), state));
            }
        }
        scheduling.set(false);
        // Inform the tracker that task scheduling has completed
        partialResultQueryTaskTracker.completeTaskScheduling();
        // sections may have become ready after the loop exited; start another scheduling round for them
        if (!getSectionsReadyForExecution().isEmpty()) {
            startScheduling();
        }
    } catch (Throwable t) {
        scheduling.set(false);
        queryStateMachine.transitionToFailed(t);
        throw t;
    } finally {
        // close every scheduler, collecting failures so that one failing close does not skip the others
        RuntimeException closeError = new RuntimeException();
        for (StageExecutionAndScheduler stageExecutionInfo : scheduledStageExecutions) {
            try {
                stageExecutionInfo.getStageScheduler().close();
            } catch (Throwable t) {
                queryStateMachine.transitionToFailed(t);
                // Self-suppression not permitted
                if (closeError != t) {
                    closeError.addSuppressed(t);
                }
            }
        }
        if (closeError.getSuppressed().length > 0) {
            throw closeError;
        }
    }
}
Use of com.facebook.presto.execution.StageId in project presto by prestodb.
In class LegacySqlQueryScheduler, method updateStageExecutions:
/**
 * Rebuilds {@code section} with its fragments rewritten according to {@code oldToNewFragment},
 * creates a new section execution (stage executions and their schedulers) for the rebuilt
 * section, registers state change listeners on it, and stores the new entries in the
 * {@code stageExecutions} map keyed by stage id.
 *
 * @param section section whose plan is rewritten; its children are reused unchanged
 * @param oldToNewFragment mapping passed to the sub plan rewrite
 */
private void updateStageExecutions(StreamingPlanSection section, Map<PlanFragment, PlanFragment> oldToNewFragment) {
    StreamingPlanSection rewrittenSection = new StreamingPlanSection(
            rewriteStreamingSubPlan(section.getPlan(), oldToNewFragment),
            section.getChildren());
    PlanFragment rootFragment = rewrittenSection.getPlan().getFragment();

    final Optional<int[]> bucketToPartition;
    final OutputBuffers outputBuffers;
    final ExchangeLocationsConsumer locationsConsumer;
    if (!isRootFragment(rootFragment)) {
        // Not the root fragment: discarding output buffers and a consumer that ignores exchange locations.
        bucketToPartition = Optional.empty();
        outputBuffers = createDiscardingOutputBuffers();
        locationsConsumer = (fragmentId, tasks, noMoreExchangeLocations) -> {
        };
    } else {
        // Root fragment: a single output buffer whose locations are forwarded as the query output locations.
        bucketToPartition = Optional.of(new int[1]);
        outputBuffers = createInitialEmptyOutputBuffers(rootFragment.getPartitioningScheme().getPartitioning().getHandle())
                .withBuffer(new OutputBufferId(0), BROADCAST_PARTITION_ID)
                .withNoMoreBufferIds();
        OutputBufferId onlyBufferId = getOnlyElement(outputBuffers.getBuffers().keySet());
        locationsConsumer = (fragmentId, tasks, noMoreExchangeLocations) ->
                updateQueryOutputLocations(queryStateMachine, onlyBufferId, tasks, noMoreExchangeLocations);
    }

    SectionExecution rebuiltExecution = sectionExecutionFactory.createSectionExecutions(
            session,
            rewrittenSection,
            locationsConsumer,
            bucketToPartition,
            outputBuffers,
            summarizeTaskInfo,
            remoteTaskFactory,
            splitSourceFactory,
            0);
    addStateChangeListeners(rebuiltExecution);

    // Build the replacement entries before taking the lock, so only the map update happens under it.
    Map<StageId, StageExecutionAndScheduler> replacements = rebuiltExecution.getSectionStages().stream()
            .collect(toImmutableMap(
                    stage -> stage.getStageExecution().getStageExecutionId().getStageId(),
                    identity()));
    synchronized (this) {
        stageExecutions.putAll(replacements);
    }
}
Use of com.facebook.presto.execution.StageId in project presto by prestodb.
In class SqlQueryScheduler, method schedule:
/**
 * Runs the scheduling loop for the query.
 *
 * Only one invocation does work at a time: the scheduling flag is claimed with
 * compareAndSet and any caller that fails to claim it returns immediately.
 *
 * The outer loop pulls sections that are ready for execution, passes each through
 * tryCostBasedOptimize, and creates an execution schedule per section. The inner loop
 * repeatedly asks every stage scheduler for more work until one of the schedules
 * finishes or a stage completes. The loop ends when there are neither ready sections
 * nor unfinished schedules, when the query is already done, or when the thread is
 * interrupted.
 *
 * Any throwable raised while scheduling transitions the query to failed and is
 * rethrown. All stage schedulers touched by this invocation are closed in the finally
 * block; failures while closing also fail the query and are rethrown as suppressed
 * exceptions of a RuntimeException.
 */
private void schedule() {
if (!scheduling.compareAndSet(false, true)) {
// still scheduling the previous batch of stages
return;
}
// every stage execution handled by this invocation; their schedulers are closed in the finally block
List<StageExecutionAndScheduler> scheduledStageExecutions = new ArrayList<>();
try (SetThreadName ignored = new SetThreadName("Query-%s", queryStateMachine.getQueryId())) {
// stages whose completion has already been reported to their stage linkage
Set<StageId> completedStages = new HashSet<>();
List<ExecutionSchedule> executionSchedules = new LinkedList<>();
while (!Thread.currentThread().isInterrupted()) {
// remove finished section
executionSchedules.removeIf(ExecutionSchedule::isFinished);
// try to pull more section that are ready to be run
List<StreamingPlanSection> sectionsReadyForExecution = getSectionsReadyForExecution();
// all finished
if (sectionsReadyForExecution.isEmpty() && executionSchedules.isEmpty()) {
break;
}
// Apply runtime CBO on the ready sections before creating SectionExecutions.
List<SectionExecution> sectionExecutions = createStageExecutions(sectionsReadyForExecution.stream().map(this::tryCostBasedOptimize).collect(toImmutableList()));
// the query finished while the executions were being created: abort them and stop scheduling
if (queryStateMachine.isDone()) {
sectionExecutions.forEach(SectionExecution::abort);
break;
}
sectionExecutions.forEach(sectionExecution -> scheduledStageExecutions.addAll(sectionExecution.getSectionStages()));
sectionExecutions.stream().map(SectionExecution::getSectionStages).map(executionPolicy::createExecutionSchedule).forEach(executionSchedules::add);
// the isEmpty check matters because noneMatch is vacuously true on an empty list
while (!executionSchedules.isEmpty() && executionSchedules.stream().noneMatch(ExecutionSchedule::isFinished)) {
List<ListenableFuture<?>> blockedStages = new ArrayList<>();
List<StageExecutionAndScheduler> executionsToSchedule = executionSchedules.stream().flatMap(schedule -> schedule.getStagesToSchedule().stream()).collect(toImmutableList());
for (StageExecutionAndScheduler executionAndScheduler : executionsToSchedule) {
executionAndScheduler.getStageExecution().beginScheduling();
// perform some scheduling work
ScheduleResult result = executionAndScheduler.getStageScheduler().schedule();
// Track leaf tasks if partial results are enabled
if (isPartialResultsEnabled(session) && executionAndScheduler.getStageExecution().getFragment().isLeaf()) {
for (RemoteTask task : result.getNewTasks()) {
partialResultQueryTaskTracker.trackTask(task);
task.addFinalTaskInfoListener(partialResultQueryTaskTracker::recordTaskFinish);
}
}
// modify parent and children based on the results of the scheduling
if (result.isFinished()) {
executionAndScheduler.getStageExecution().schedulingComplete();
} else if (!result.getBlocked().isDone()) {
blockedStages.add(result.getBlocked());
}
executionAndScheduler.getStageLinkage().processScheduleResults(executionAndScheduler.getStageExecution().getState(), result.getNewTasks());
schedulerStats.getSplitsScheduledPerIteration().add(result.getSplitsScheduled());
// record why the stage could not make progress, if it reported a reason
if (result.getBlockedReason().isPresent()) {
switch(result.getBlockedReason().get()) {
case WRITER_SCALING:
// no-op
break;
case WAITING_FOR_SOURCE:
schedulerStats.getWaitingForSource().update(1);
break;
case SPLIT_QUEUES_FULL:
schedulerStats.getSplitQueuesFull().update(1);
break;
case MIXED_SPLIT_QUEUES_FULL_AND_WAITING_FOR_SOURCE:
schedulerStats.getMixedSplitQueuesFullAndWaitingForSource().update(1);
break;
case NO_ACTIVE_DRIVER_GROUP:
schedulerStats.getNoActiveDriverGroup().update(1);
break;
default:
throw new UnsupportedOperationException("Unknown blocked reason: " + result.getBlockedReason().get());
}
}
}
// make sure to update stage linkage at least once per loop to catch async state changes (e.g., partial cancel)
boolean stageFinishedExecution = false;
for (StageExecutionAndScheduler stageExecutionAndScheduler : scheduledStageExecutions) {
SqlStageExecution stageExecution = stageExecutionAndScheduler.getStageExecution();
StageId stageId = stageExecution.getStageExecutionId().getStageId();
// completedStages ensures the linkage is notified of a finished stage only once
if (!completedStages.contains(stageId) && stageExecution.getState().isDone()) {
stageExecutionAndScheduler.getStageLinkage().processScheduleResults(stageExecution.getState(), ImmutableSet.of());
completedStages.add(stageId);
stageFinishedExecution = true;
}
}
// if any stage has just finished execution try to pull more sections for scheduling
if (stageFinishedExecution) {
break;
}
// wait for a state change and then schedule again
if (!blockedStages.isEmpty()) {
// the time spent waiting here is recorded in the sleep-time stat
try (TimeStat.BlockTimer timer = schedulerStats.getSleepTime().time()) {
tryGetFutureValue(whenAnyComplete(blockedStages), 1, SECONDS);
}
for (ListenableFuture<?> blockedStage : blockedStages) {
blockedStage.cancel(true);
}
}
}
}
// after the loop every stage execution must be scheduled, running or done
for (StageExecutionAndScheduler stageExecutionAndScheduler : scheduledStageExecutions) {
StageExecutionState state = stageExecutionAndScheduler.getStageExecution().getState();
if (state != SCHEDULED && state != RUNNING && !state.isDone()) {
throw new PrestoException(GENERIC_INTERNAL_ERROR, format("Scheduling is complete, but stage execution %s is in state %s", stageExecutionAndScheduler.getStageExecution().getStageExecutionId(), state));
}
}
scheduling.set(false);
// Inform the tracker that task scheduling has completed
partialResultQueryTaskTracker.completeTaskScheduling();
// sections may have become ready after the loop exited; start another scheduling round for them
if (!getSectionsReadyForExecution().isEmpty()) {
startScheduling();
}
} catch (Throwable t) {
// release the flag before failing the query so a later schedule() call is not locked out
scheduling.set(false);
queryStateMachine.transitionToFailed(t);
throw t;
} finally {
// close every scheduler, collecting failures so that one failing close does not skip the others
RuntimeException closeError = new RuntimeException();
for (StageExecutionAndScheduler stageExecutionAndScheduler : scheduledStageExecutions) {
try {
stageExecutionAndScheduler.getStageScheduler().close();
} catch (Throwable t) {
queryStateMachine.transitionToFailed(t);
// Self-suppression not permitted
if (closeError != t) {
closeError.addSuppressed(t);
}
}
}
if (closeError.getSuppressed().length > 0) {
throw closeError;
}
}
}
Aggregations