Use of com.facebook.presto.execution.StageExecutionState in project presto by prestodb.
From the class LegacySqlQueryScheduler, method schedule.
/**
 * Runs the scheduling loop for this query on the calling thread.
 *
 * <p>Only one invocation runs at a time: the {@code scheduling} flag is claimed with a
 * compare-and-set, and a call that loses the race returns immediately.
 *
 * <p>The outer loop runs until the thread is interrupted or there are neither sections ready
 * for execution nor unfinished execution schedules. On each pass it creates stage executions
 * for the ready sections, asks the execution policy for their schedules, and then repeatedly
 * schedules the stages those schedules hand out until one schedule finishes or a stage
 * execution reaches a done state.
 *
 * <p>Once the loop ends, every scheduled stage execution must be {@code SCHEDULED},
 * {@code RUNNING}, or done; otherwise a {@link PrestoException} with
 * {@code GENERIC_INTERNAL_ERROR} is thrown. Any {@link Throwable} raised by the loop resets the
 * {@code scheduling} flag, transitions the query state machine to failed, and is rethrown.
 * In all cases the stage schedulers created here are closed; close failures are attached as
 * suppressed exceptions to a {@link RuntimeException} that is thrown from the {@code finally}
 * block.
 */
private void schedule() {
    if (!scheduling.compareAndSet(false, true)) {
        // still scheduling the previous batch of stages
        return;
    }
    List<StageExecutionAndScheduler> scheduledStageExecutions = new ArrayList<>();
    try (SetThreadName ignored = new SetThreadName("Query-%s", queryStateMachine.getQueryId())) {
        Set<StageId> completedStages = new HashSet<>();
        List<ExecutionSchedule> sectionExecutionSchedules = new LinkedList<>();
        while (!Thread.currentThread().isInterrupted()) {
            // remove finished sections
            sectionExecutionSchedules.removeIf(ExecutionSchedule::isFinished);
            // try to pull more sections that are ready to be run
            List<StreamingPlanSection> sectionsReadyForExecution = getSectionsReadyForExecution();
            // all finished
            if (sectionsReadyForExecution.isEmpty() && sectionExecutionSchedules.isEmpty()) {
                break;
            }
            List<List<StageExecutionAndScheduler>> sectionStageExecutions = getStageExecutions(sectionsReadyForExecution);
            // remember every stage up front so the finally block can close its scheduler
            sectionStageExecutions.forEach(scheduledStageExecutions::addAll);
            sectionStageExecutions.stream().map(executionInfos -> executionInfos.stream().collect(toImmutableList())).map(executionPolicy::createExecutionSchedule).forEach(sectionExecutionSchedules::add);
            // noneMatch is vacuously true for an empty list; the emptiness guard keeps this inner
            // loop (which has no interrupt check) from spinning when there is nothing to schedule
            while (!sectionExecutionSchedules.isEmpty() && sectionExecutionSchedules.stream().noneMatch(ExecutionSchedule::isFinished)) {
                List<ListenableFuture<?>> blockedStages = new ArrayList<>();
                List<StageExecutionAndScheduler> executionsToSchedule = sectionExecutionSchedules.stream().flatMap(schedule -> schedule.getStagesToSchedule().stream()).collect(toImmutableList());
                for (StageExecutionAndScheduler stageExecutionAndScheduler : executionsToSchedule) {
                    SqlStageExecution stageExecution = stageExecutionAndScheduler.getStageExecution();
                    stageExecution.beginScheduling();
                    // perform some scheduling work
                    ScheduleResult result = stageExecutionAndScheduler.getStageScheduler().schedule();
                    // Track leaf tasks if partial results are enabled
                    if (isPartialResultsEnabled(session) && stageExecution.getFragment().isLeaf()) {
                        for (RemoteTask task : result.getNewTasks()) {
                            partialResultQueryTaskTracker.trackTask(task);
                            task.addFinalTaskInfoListener(partialResultQueryTaskTracker::recordTaskFinish);
                        }
                    }
                    // modify parent and children based on the results of the scheduling
                    if (result.isFinished()) {
                        stageExecution.schedulingComplete();
                    } else if (!result.getBlocked().isDone()) {
                        blockedStages.add(result.getBlocked());
                    }
                    stageExecutionAndScheduler.getStageLinkage().processScheduleResults(stageExecution.getState(), result.getNewTasks());
                    schedulerStats.getSplitsScheduledPerIteration().add(result.getSplitsScheduled());
                    if (result.getBlockedReason().isPresent()) {
                        switch (result.getBlockedReason().get()) {
                            case WRITER_SCALING:
                                // no-op
                                break;
                            case WAITING_FOR_SOURCE:
                                schedulerStats.getWaitingForSource().update(1);
                                break;
                            case SPLIT_QUEUES_FULL:
                                schedulerStats.getSplitQueuesFull().update(1);
                                break;
                            case MIXED_SPLIT_QUEUES_FULL_AND_WAITING_FOR_SOURCE:
                                schedulerStats.getMixedSplitQueuesFullAndWaitingForSource().update(1);
                                break;
                            case NO_ACTIVE_DRIVER_GROUP:
                                schedulerStats.getNoActiveDriverGroup().update(1);
                                break;
                            default:
                                throw new UnsupportedOperationException("Unknown blocked reason: " + result.getBlockedReason().get());
                        }
                    }
                }
                // make sure to update stage linkage at least once per loop to catch async state changes (e.g., partial cancel)
                boolean stageFinishedExecution = false;
                for (StageExecutionAndScheduler stageExecutionInfo : scheduledStageExecutions) {
                    SqlStageExecution stageExecution = stageExecutionInfo.getStageExecution();
                    StageId stageId = stageExecution.getStageExecutionId().getStageId();
                    if (!completedStages.contains(stageId) && stageExecution.getState().isDone()) {
                        stageExecutionInfo.getStageLinkage().processScheduleResults(stageExecution.getState(), ImmutableSet.of());
                        completedStages.add(stageId);
                        stageFinishedExecution = true;
                    }
                }
                // if any stage has just finished execution try to pull more sections for scheduling
                if (stageFinishedExecution) {
                    break;
                }
                // wait for a state change and then schedule again
                if (!blockedStages.isEmpty()) {
                    // the timer resource records how long this wait takes in the sleep-time stat
                    try (TimeStat.BlockTimer timer = schedulerStats.getSleepTime().time()) {
                        tryGetFutureValue(whenAnyComplete(blockedStages), 1, SECONDS);
                    }
                    for (ListenableFuture<?> blockedStage : blockedStages) {
                        blockedStage.cancel(true);
                    }
                }
            }
        }
        // sanity check: no stage may be left in a state earlier than SCHEDULED
        for (StageExecutionAndScheduler stageExecutionInfo : scheduledStageExecutions) {
            StageExecutionState state = stageExecutionInfo.getStageExecution().getState();
            if (state != SCHEDULED && state != RUNNING && !state.isDone()) {
                throw new PrestoException(GENERIC_INTERNAL_ERROR, format("Scheduling is complete, but stage execution %s is in state %s", stageExecutionInfo.getStageExecution().getStageExecutionId(), state));
            }
        }
        scheduling.set(false);
        // Inform the tracker that task scheduling has completed
        partialResultQueryTaskTracker.completeTaskScheduling();
        // sections may have become ready after the loop exited; start another round for them
        if (!getSectionsReadyForExecution().isEmpty()) {
            startScheduling();
        }
    } catch (Throwable t) {
        scheduling.set(false);
        queryStateMachine.transitionToFailed(t);
        throw t;
    } finally {
        // close every stage scheduler, collecting failures instead of stopping at the first one
        RuntimeException closeError = new RuntimeException();
        for (StageExecutionAndScheduler stageExecutionInfo : scheduledStageExecutions) {
            try {
                stageExecutionInfo.getStageScheduler().close();
            } catch (Throwable t) {
                queryStateMachine.transitionToFailed(t);
                // Self-suppression not permitted
                if (closeError != t) {
                    closeError.addSuppressed(t);
                }
            }
        }
        if (closeError.getSuppressed().length > 0) {
            throw closeError;
        }
    }
}
Use of com.facebook.presto.execution.StageExecutionState in project presto by prestodb.
From the class SqlQueryScheduler, method schedule.
/**
 * Drives stage scheduling for the query on the calling thread.
 *
 * <p>The {@code scheduling} flag is claimed with a compare-and-set; if it is already held the
 * call returns without doing anything. The outer loop runs until the thread is interrupted,
 * the query state machine reports done, or there are no ready sections and no unfinished
 * execution schedules left. Ready sections are passed through {@code tryCostBasedOptimize}
 * before their section executions are created.
 *
 * <p>After the loop, a stage execution that is not {@code SCHEDULED}, {@code RUNNING}, or done
 * causes a {@link PrestoException} with {@code GENERIC_INTERNAL_ERROR}. Any {@link Throwable}
 * resets the flag, fails the query state machine, and is rethrown. The {@code finally} block
 * closes every stage scheduler registered during this call and throws a
 * {@link RuntimeException} carrying the close failures as suppressed exceptions, if any.
 */
private void schedule() {
    boolean acquired = scheduling.compareAndSet(false, true);
    if (!acquired) {
        // a previous batch of stages is still being scheduled
        return;
    }

    List<StageExecutionAndScheduler> allScheduledStages = new ArrayList<>();
    try (SetThreadName ignored = new SetThreadName("Query-%s", queryStateMachine.getQueryId())) {
        Set<StageId> finishedStageIds = new HashSet<>();
        List<ExecutionSchedule> activeSchedules = new LinkedList<>();

        while (!Thread.currentThread().isInterrupted()) {
            // drop the schedules of sections that have finished
            activeSchedules.removeIf(ExecutionSchedule::isFinished);

            // look for further sections that have become runnable
            List<StreamingPlanSection> readySections = getSectionsReadyForExecution();

            // nothing new to start and nothing in flight
            if (readySections.isEmpty() && activeSchedules.isEmpty()) {
                break;
            }

            // Apply runtime CBO on the ready sections before creating SectionExecutions.
            List<SectionExecution> newSectionExecutions = createStageExecutions(
                    readySections.stream()
                            .map(this::tryCostBasedOptimize)
                            .collect(toImmutableList()));

            // the query ended in the meantime: abort what was just created and stop
            if (queryStateMachine.isDone()) {
                for (SectionExecution sectionExecution : newSectionExecutions) {
                    sectionExecution.abort();
                }
                break;
            }

            // first register every stage (so it gets closed later), then build the schedules
            for (SectionExecution sectionExecution : newSectionExecutions) {
                allScheduledStages.addAll(sectionExecution.getSectionStages());
            }
            for (SectionExecution sectionExecution : newSectionExecutions) {
                activeSchedules.add(executionPolicy.createExecutionSchedule(sectionExecution.getSectionStages()));
            }

            while (!activeSchedules.isEmpty() && activeSchedules.stream().noneMatch(ExecutionSchedule::isFinished)) {
                List<ListenableFuture<?>> pendingFutures = new ArrayList<>();
                List<StageExecutionAndScheduler> stagesToSchedule = activeSchedules.stream()
                        .flatMap(schedule -> schedule.getStagesToSchedule().stream())
                        .collect(toImmutableList());

                for (StageExecutionAndScheduler candidate : stagesToSchedule) {
                    SqlStageExecution stageExecution = candidate.getStageExecution();
                    stageExecution.beginScheduling();

                    // perform some scheduling work
                    ScheduleResult result = candidate.getStageScheduler().schedule();

                    // Track leaf tasks if partial results are enabled
                    if (isPartialResultsEnabled(session) && stageExecution.getFragment().isLeaf()) {
                        for (RemoteTask newTask : result.getNewTasks()) {
                            partialResultQueryTaskTracker.trackTask(newTask);
                            newTask.addFinalTaskInfoListener(partialResultQueryTaskTracker::recordTaskFinish);
                        }
                    }

                    // modify parent and children based on the results of the scheduling
                    if (result.isFinished()) {
                        stageExecution.schedulingComplete();
                    } else {
                        ListenableFuture<?> blocked = result.getBlocked();
                        if (!blocked.isDone()) {
                            pendingFutures.add(blocked);
                        }
                    }
                    candidate.getStageLinkage().processScheduleResults(stageExecution.getState(), result.getNewTasks());
                    schedulerStats.getSplitsScheduledPerIteration().add(result.getSplitsScheduled());

                    // the remainder of the iteration only records why the stage is blocked
                    if (!result.getBlockedReason().isPresent()) {
                        continue;
                    }
                    switch (result.getBlockedReason().get()) {
                        case WRITER_SCALING:
                            // no-op
                            break;
                        case WAITING_FOR_SOURCE:
                            schedulerStats.getWaitingForSource().update(1);
                            break;
                        case SPLIT_QUEUES_FULL:
                            schedulerStats.getSplitQueuesFull().update(1);
                            break;
                        case MIXED_SPLIT_QUEUES_FULL_AND_WAITING_FOR_SOURCE:
                            schedulerStats.getMixedSplitQueuesFullAndWaitingForSource().update(1);
                            break;
                        case NO_ACTIVE_DRIVER_GROUP:
                            schedulerStats.getNoActiveDriverGroup().update(1);
                            break;
                        default:
                            throw new UnsupportedOperationException("Unknown blocked reason: " + result.getBlockedReason().get());
                    }
                }

                // make sure to update stage linkage at least once per loop to catch async state changes (e.g., partial cancel)
                boolean anyStageJustFinished = false;
                for (StageExecutionAndScheduler scheduled : allScheduledStages) {
                    SqlStageExecution stageExecution = scheduled.getStageExecution();
                    StageId stageId = stageExecution.getStageExecutionId().getStageId();
                    if (finishedStageIds.contains(stageId) || !stageExecution.getState().isDone()) {
                        continue;
                    }
                    scheduled.getStageLinkage().processScheduleResults(stageExecution.getState(), ImmutableSet.of());
                    finishedStageIds.add(stageId);
                    anyStageJustFinished = true;
                }

                // if any stage has just finished execution try to pull more sections for scheduling
                if (anyStageJustFinished) {
                    break;
                }

                // wait for a state change and then schedule again
                if (!pendingFutures.isEmpty()) {
                    try (TimeStat.BlockTimer timer = schedulerStats.getSleepTime().time()) {
                        tryGetFutureValue(whenAnyComplete(pendingFutures), 1, SECONDS);
                    }
                    for (ListenableFuture<?> pending : pendingFutures) {
                        pending.cancel(true);
                    }
                }
            }
        }

        // every stage handled here must have reached at least SCHEDULED by now
        for (StageExecutionAndScheduler scheduled : allScheduledStages) {
            SqlStageExecution stageExecution = scheduled.getStageExecution();
            StageExecutionState state = stageExecution.getState();
            boolean acceptable = state == SCHEDULED || state == RUNNING || state.isDone();
            if (!acceptable) {
                throw new PrestoException(GENERIC_INTERNAL_ERROR, format("Scheduling is complete, but stage execution %s is in state %s", stageExecution.getStageExecutionId(), state));
            }
        }

        scheduling.set(false);

        // Inform the tracker that task scheduling has completed
        partialResultQueryTaskTracker.completeTaskScheduling();

        // pick up sections that became ready after the loop exited
        boolean moreSectionsReady = !getSectionsReadyForExecution().isEmpty();
        if (moreSectionsReady) {
            startScheduling();
        }
    } catch (Throwable failure) {
        scheduling.set(false);
        queryStateMachine.transitionToFailed(failure);
        throw failure;
    } finally {
        // close all stage schedulers; failures are gathered rather than aborting the cleanup
        RuntimeException closeError = new RuntimeException();
        for (StageExecutionAndScheduler scheduled : allScheduledStages) {
            try {
                scheduled.getStageScheduler().close();
            } catch (Throwable closeFailure) {
                queryStateMachine.transitionToFailed(closeFailure);
                // Self-suppression not permitted
                if (closeError != closeFailure) {
                    closeError.addSuppressed(closeFailure);
                }
            }
        }
        if (closeError.getSuppressed().length > 0) {
            throw closeError;
        }
    }
}
Use of com.facebook.presto.execution.StageExecutionState in project presto by prestodb.
From the class SectionExecutionFactory, method createStageScheduler.
/**
 * Builds the {@link StageScheduler} for one stage, selected by the stage's partitioning handle.
 *
 * <ul>
 * <li>{@code SOURCE_DISTRIBUTION}: a source-partitioned scheduler over the fragment's single
 * split source, using a dynamic split placement policy.</li>
 * <li>{@code SCALED_WRITER_DISTRIBUTION}: a {@code ScaledWriterScheduler} that is told to finish
 * once all child stage executions are done.</li>
 * <li>Any other handle with at least one split source: a {@code FixedSourcePartitionedScheduler};
 * for recoverable grouped execution a task recovery callback is registered on the stage.</li>
 * <li>Any other handle with no split sources: a {@code FixedCountScheduler} over the nodes of
 * the cached node partition map.</li>
 * </ul>
 *
 * @param splitSourceFactory creates the split sources for the plan fragment
 * @param session session used for property lookups and node selection
 * @param plan sub plan whose fragment is being scheduled
 * @param partitioningCache resolves a partitioning handle to its node partition map
 * @param parentStageExecution parent stage execution; required by the recovery callback
 * @param stageId id of the stage; the recovery callback checks task ids against it
 * @param stageExecution the stage execution the returned scheduler drives
 * @param partitioningHandle partitioning handle that selects the scheduler kind
 * @param tableWriteInfo passed through to split source creation
 * @param childStageExecutions child stage executions, used by the scaled-writer scheduler
 * @return the scheduler for the stage
 */
private StageScheduler createStageScheduler(SplitSourceFactory splitSourceFactory, Session session, StreamingSubPlan plan, Function<PartitioningHandle, NodePartitionMap> partitioningCache, Optional<SqlStageExecution> parentStageExecution, StageId stageId, SqlStageExecution stageExecution, PartitioningHandle partitioningHandle, TableWriteInfo tableWriteInfo, Set<SqlStageExecution> childStageExecutions) {
    Map<PlanNodeId, SplitSource> sourcesByPlanNode = splitSourceFactory.createSplitSources(plan.getFragment(), session, tableWriteInfo);
    int taskLimit = getMaxTasksPerStage(session);

    if (partitioningHandle.equals(SOURCE_DISTRIBUTION)) {
        // nodes are selected dynamically based on the constraints of the splits and the system load
        Map.Entry<PlanNodeId, SplitSource> onlySource = getOnlyElement(sourcesByPlanNode.entrySet());
        PlanNodeId planNodeId = onlySource.getKey();
        SplitSource splitSource = onlySource.getValue();
        ConnectorId sourceConnectorId = splitSource.getConnectorId();
        // internal system connectors are passed to the node selector as null
        ConnectorId selectorConnectorId = isInternalSystemConnector(sourceConnectorId) ? null : sourceConnectorId;
        NodeSelector nodeSelector = nodeScheduler.createNodeSelector(session, selectorConnectorId, taskLimit);
        SplitPlacementPolicy placementPolicy = new DynamicSplitPlacementPolicy(nodeSelector, stageExecution::getAllTasks);
        checkArgument(!plan.getFragment().getStageExecutionDescriptor().isStageGroupedExecution());
        return newSourcePartitionedSchedulerAsStageScheduler(stageExecution, planNodeId, splitSource, placementPolicy, splitBatchSize);
    }

    if (partitioningHandle.equals(SCALED_WRITER_DISTRIBUTION)) {
        // task statuses of all child (source) stages and of this writer stage, evaluated lazily
        Supplier<Collection<TaskStatus>> sourceTasksProvider = () -> childStageExecutions.stream()
                .flatMap(childStage -> childStage.getAllTasks().stream())
                .map(RemoteTask::getTaskStatus)
                .collect(toList());
        Supplier<Collection<TaskStatus>> writerTasksProvider = () -> stageExecution.getAllTasks().stream()
                .map(RemoteTask::getTaskStatus)
                .collect(toList());
        ScaledWriterScheduler scaledWriterScheduler = new ScaledWriterScheduler(stageExecution, sourceTasksProvider, writerTasksProvider, nodeScheduler.createNodeSelector(session, null), scheduledExecutor, getWriterMinSize(session), isOptimizedScaleWriterProducerBuffer(session));
        // finish the writer scheduler once every child stage is done
        whenAllStages(childStageExecutions, StageExecutionState::isDone).addListener(scaledWriterScheduler::finish, directExecutor());
        return scaledWriterScheduler;
    }

    if (sourcesByPlanNode.isEmpty()) {
        // all sources are remote
        NodePartitionMap remotePartitionMap = partitioningCache.apply(plan.getFragment().getPartitioning());
        List<InternalNode> partitionToNode = remotePartitionMap.getPartitionToNode();
        // todo this should asynchronously wait a standard timeout period before failing
        checkCondition(!partitionToNode.isEmpty(), NO_NODES_AVAILABLE, "No worker nodes available");
        return new FixedCountScheduler(stageExecution, partitionToNode);
    }

    // contains local source
    List<PlanNodeId> schedulingOrder = plan.getFragment().getTableScanSchedulingOrder();
    ConnectorId connectorId = partitioningHandle.getConnectorId().orElseThrow(IllegalStateException::new);
    boolean groupedExecution = plan.getFragment().getStageExecutionDescriptor().isStageGroupedExecution();

    List<ConnectorPartitionHandle> partitionHandles;
    if (!groupedExecution) {
        partitionHandles = ImmutableList.of(NOT_PARTITIONED);
    } else {
        partitionHandles = nodePartitioningManager.listPartitionHandles(session, partitioningHandle);
        checkState(!ImmutableList.of(NOT_PARTITIONED).equals(partitionHandles));
    }

    BucketNodeMap bucketNodeMap;
    List<InternalNode> stageNodes;
    boolean onlyReplicatedRemoteSources = plan.getFragment().getRemoteSourceNodes().stream()
            .allMatch(remoteSource -> remoteSource.getExchangeType() == REPLICATE);
    if (onlyReplicatedRemoteSources) {
        // no non-replicated remote source
        boolean dynamicLifespanSchedule = plan.getFragment().getStageExecutionDescriptor().isDynamicLifespanSchedule();
        bucketNodeMap = nodePartitioningManager.getBucketNodeMap(session, partitioningHandle, dynamicLifespanSchedule);
        // verify execution is consistent with planner's decision on dynamic lifespan schedule
        verify(bucketNodeMap.isDynamic() == dynamicLifespanSchedule);
        if (!bucketNodeMap.hasInitialMap()) {
            stageNodes = new ArrayList<>(nodeScheduler.createNodeSelector(session, connectorId).selectRandomNodes(taskLimit));
        } else {
            stageNodes = bucketNodeMap.getBucketToNode().get().stream().distinct().collect(toImmutableList());
        }
    } else {
        // cannot use dynamic lifespan schedule
        verify(!plan.getFragment().getStageExecutionDescriptor().isDynamicLifespanSchedule());
        // remote source requires nodePartitionMap
        NodePartitionMap nodePartitionMap = partitioningCache.apply(plan.getFragment().getPartitioning());
        if (groupedExecution) {
            checkState(partitionHandles.size() == nodePartitionMap.getBucketToPartition().length);
        }
        stageNodes = nodePartitionMap.getPartitionToNode();
        bucketNodeMap = nodePartitionMap.asBucketNodeMap();
    }

    FixedSourcePartitionedScheduler fixedScheduler = new FixedSourcePartitionedScheduler(stageExecution, sourcesByPlanNode, plan.getFragment().getStageExecutionDescriptor(), schedulingOrder, stageNodes, bucketNodeMap, splitBatchSize, getConcurrentLifespansPerNode(session), nodeScheduler.createNodeSelector(session, connectorId), partitionHandles);

    if (plan.getFragment().getStageExecutionDescriptor().isRecoverableGroupedExecution()) {
        // on task recovery: validate the task and the parent stage, detach the task from the
        // parent's remote sources, then let the scheduler recover it
        stageExecution.registerStageTaskRecoveryCallback(recoveredTaskId -> {
            checkArgument(recoveredTaskId.getStageExecutionId().getStageId().equals(stageId), "The task did not execute this stage");
            checkArgument(parentStageExecution.isPresent(), "Parent stage execution must exist");
            SqlStageExecution parentStage = parentStageExecution.get();
            checkArgument(parentStage.getAllTasks().size() == 1, "Parent stage should only have one task for recoverable grouped execution");
            parentStage.removeRemoteSourceIfSingleTaskStage(recoveredTaskId);
            fixedScheduler.recover(recoveredTaskId);
        });
    }
    return fixedScheduler;
}
Aggregations