Use of com.facebook.presto.execution.StageId in project presto by prestodb.
The class SqlQueryScheduler, method buildStageInfo:
private StageInfo buildStageInfo(SubPlan subPlan, ListMultimap<StageId, SqlStageExecution> stageExecutions)
{
    StageId stageId = getStageId(subPlan.getFragment().getId());
    List<SqlStageExecution> attempts = stageExecutions.get(stageId);
    StageExecutionInfo latestAttemptInfo = attempts.isEmpty() ?
            unscheduledExecutionInfo(stageId.getId(), queryStateMachine.isDone()) :
            getLast(attempts).getStageExecutionInfo();
    List<StageExecutionInfo> previousAttemptInfos = attempts.size() < 2 ?
            ImmutableList.of() :
            attempts.subList(0, attempts.size() - 1).stream()
                    .map(SqlStageExecution::getStageExecutionInfo)
                    .collect(toImmutableList());
    return new StageInfo(
            stageId,
            locationFactory.createStageLocation(stageId),
            Optional.of(subPlan.getFragment()),
            latestAttemptInfo,
            previousAttemptInfos,
            subPlan.getChildren().stream()
                    .map(plan -> buildStageInfo(plan, stageExecutions))
                    .collect(toImmutableList()),
            runtimeOptimizedStages.contains(stageId));
}
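Because the method recurses into subPlan.getChildren(), a single call at the root fragment yields the whole StageInfo tree. A minimal usage sketch (rootSubPlan and stageExecutions are hypothetical names for the scheduler's own state):

StageInfo outputStageInfo = buildStageInfo(rootSubPlan, stageExecutions);
// the children of outputStageInfo mirror the SubPlan tree, one StageInfo per fragment,
// with earlier retries preserved in each stage's previousAttemptInfos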
Use of com.facebook.presto.execution.StageId in project presto by prestodb.
The class SectionExecutionFactory, method createStageScheduler:
private StageScheduler createStageScheduler(
        SplitSourceFactory splitSourceFactory,
        Session session,
        StreamingSubPlan plan,
        Function<PartitioningHandle, NodePartitionMap> partitioningCache,
        Optional<SqlStageExecution> parentStageExecution,
        StageId stageId,
        SqlStageExecution stageExecution,
        PartitioningHandle partitioningHandle,
        TableWriteInfo tableWriteInfo,
        Set<SqlStageExecution> childStageExecutions)
{
    Map<PlanNodeId, SplitSource> splitSources = splitSourceFactory.createSplitSources(plan.getFragment(), session, tableWriteInfo);
    int maxTasksPerStage = getMaxTasksPerStage(session);
    if (partitioningHandle.equals(SOURCE_DISTRIBUTION)) {
        // nodes are selected dynamically based on the constraints of the splits and the system load
        Map.Entry<PlanNodeId, SplitSource> entry = getOnlyElement(splitSources.entrySet());
        PlanNodeId planNodeId = entry.getKey();
        SplitSource splitSource = entry.getValue();
        ConnectorId connectorId = splitSource.getConnectorId();
        if (isInternalSystemConnector(connectorId)) {
            connectorId = null;
        }
        NodeSelector nodeSelector = nodeScheduler.createNodeSelector(session, connectorId, maxTasksPerStage);
        SplitPlacementPolicy placementPolicy = new DynamicSplitPlacementPolicy(nodeSelector, stageExecution::getAllTasks);
        checkArgument(!plan.getFragment().getStageExecutionDescriptor().isStageGroupedExecution());
        return newSourcePartitionedSchedulerAsStageScheduler(stageExecution, planNodeId, splitSource, placementPolicy, splitBatchSize);
    }
    else if (partitioningHandle.equals(SCALED_WRITER_DISTRIBUTION)) {
        Supplier<Collection<TaskStatus>> sourceTasksProvider = () -> childStageExecutions.stream()
                .map(SqlStageExecution::getAllTasks)
                .flatMap(Collection::stream)
                .map(RemoteTask::getTaskStatus)
                .collect(toList());
        Supplier<Collection<TaskStatus>> writerTasksProvider = () -> stageExecution.getAllTasks().stream()
                .map(RemoteTask::getTaskStatus)
                .collect(toList());
        ScaledWriterScheduler scheduler = new ScaledWriterScheduler(
                stageExecution,
                sourceTasksProvider,
                writerTasksProvider,
                nodeScheduler.createNodeSelector(session, null),
                scheduledExecutor,
                getWriterMinSize(session),
                isOptimizedScaleWriterProducerBuffer(session));
        whenAllStages(childStageExecutions, StageExecutionState::isDone).addListener(scheduler::finish, directExecutor());
        return scheduler;
    }
    else {
        if (!splitSources.isEmpty()) {
            // contains local source
            List<PlanNodeId> schedulingOrder = plan.getFragment().getTableScanSchedulingOrder();
            ConnectorId connectorId = partitioningHandle.getConnectorId().orElseThrow(IllegalStateException::new);
            List<ConnectorPartitionHandle> connectorPartitionHandles;
            boolean groupedExecutionForStage = plan.getFragment().getStageExecutionDescriptor().isStageGroupedExecution();
            if (groupedExecutionForStage) {
                connectorPartitionHandles = nodePartitioningManager.listPartitionHandles(session, partitioningHandle);
                checkState(!ImmutableList.of(NOT_PARTITIONED).equals(connectorPartitionHandles));
            }
            else {
                connectorPartitionHandles = ImmutableList.of(NOT_PARTITIONED);
            }
            BucketNodeMap bucketNodeMap;
            List<InternalNode> stageNodeList;
            if (plan.getFragment().getRemoteSourceNodes().stream().allMatch(node -> node.getExchangeType() == REPLICATE)) {
                // no non-replicated remote source
                boolean dynamicLifespanSchedule = plan.getFragment().getStageExecutionDescriptor().isDynamicLifespanSchedule();
                bucketNodeMap = nodePartitioningManager.getBucketNodeMap(session, partitioningHandle, dynamicLifespanSchedule);
                // verify execution is consistent with planner's decision on dynamic lifespan schedule
                verify(bucketNodeMap.isDynamic() == dynamicLifespanSchedule);
                if (bucketNodeMap.hasInitialMap()) {
                    stageNodeList = bucketNodeMap.getBucketToNode().get().stream()
                            .distinct()
                            .collect(toImmutableList());
                }
                else {
                    stageNodeList = new ArrayList<>(nodeScheduler.createNodeSelector(session, connectorId).selectRandomNodes(maxTasksPerStage));
                }
            }
            else {
                // cannot use dynamic lifespan schedule
                verify(!plan.getFragment().getStageExecutionDescriptor().isDynamicLifespanSchedule());
                // remote source requires nodePartitionMap
                NodePartitionMap nodePartitionMap = partitioningCache.apply(plan.getFragment().getPartitioning());
                if (groupedExecutionForStage) {
                    checkState(connectorPartitionHandles.size() == nodePartitionMap.getBucketToPartition().length);
                }
                stageNodeList = nodePartitionMap.getPartitionToNode();
                bucketNodeMap = nodePartitionMap.asBucketNodeMap();
            }
            FixedSourcePartitionedScheduler fixedSourcePartitionedScheduler = new FixedSourcePartitionedScheduler(
                    stageExecution,
                    splitSources,
                    plan.getFragment().getStageExecutionDescriptor(),
                    schedulingOrder,
                    stageNodeList,
                    bucketNodeMap,
                    splitBatchSize,
                    getConcurrentLifespansPerNode(session),
                    nodeScheduler.createNodeSelector(session, connectorId),
                    connectorPartitionHandles);
            if (plan.getFragment().getStageExecutionDescriptor().isRecoverableGroupedExecution()) {
                stageExecution.registerStageTaskRecoveryCallback(taskId -> {
                    checkArgument(taskId.getStageExecutionId().getStageId().equals(stageId), "The task did not execute this stage");
                    checkArgument(parentStageExecution.isPresent(), "Parent stage execution must exist");
                    checkArgument(parentStageExecution.get().getAllTasks().size() == 1, "Parent stage should only have one task for recoverable grouped execution");
                    parentStageExecution.get().removeRemoteSourceIfSingleTaskStage(taskId);
                    fixedSourcePartitionedScheduler.recover(taskId);
                });
            }
            return fixedSourcePartitionedScheduler;
        }
        else {
            // all sources are remote
            NodePartitionMap nodePartitionMap = partitioningCache.apply(plan.getFragment().getPartitioning());
            List<InternalNode> partitionToNode = nodePartitionMap.getPartitionToNode();
            // todo this should asynchronously wait a standard timeout period before failing
            checkCondition(!partitionToNode.isEmpty(), NO_NODES_AVAILABLE, "No worker nodes available");
            return new FixedCountScheduler(stageExecution, partitionToNode);
        }
    }
}
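The whenAllStages helper used in the SCALED_WRITER_DISTRIBUTION branch is not shown in this snippet. One plausible shape, assuming Guava's SettableFuture and Sets.newConcurrentHashSet together with the state-change listener API used above (treat this as a sketch, not the verbatim source):

private static ListenableFuture<?> whenAllStages(Collection<SqlStageExecution> stages, Predicate<StageExecutionState> predicate)
{
    checkArgument(!stages.isEmpty(), "stages is empty");
    // track the stages that have not yet satisfied the predicate
    Set<StageId> stageIds = newConcurrentHashSet(stages.stream()
            .map(stage -> stage.getStageExecutionId().getStageId())
            .collect(toSet()));
    SettableFuture<?> future = SettableFuture.create();
    for (SqlStageExecution stage : stages) {
        stage.addStateChangeListener(state -> {
            // complete the future exactly once, when the last stage passes the predicate
            if (predicate.test(state) && stageIds.remove(stage.getStageExecutionId().getStageId()) && stageIds.isEmpty()) {
                future.set(null);
            }
        });
    }
    return future;
}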
Use of com.facebook.presto.execution.StageId in project presto by prestodb.
The class SqlQueryScheduler, method buildStageInfo (overload keyed by StageId):
private StageInfo buildStageInfo(StageId stageId, Map<StageId, StageInfo> stageInfos)
{
    StageInfo parent = stageInfos.get(stageId);
    checkArgument(parent != null, "No stageInfo for %s", stageId);
    List<StageInfo> childStages = stageLinkages.get(stageId).getChildStageIds().stream()
            .map(childStageId -> buildStageInfo(childStageId, stageInfos))
            .collect(toImmutableList());
    if (childStages.isEmpty()) {
        return parent;
    }
    return new StageInfo(
            parent.getStageId(),
            parent.getState(),
            parent.getSelf(),
            parent.getPlan(),
            parent.getTypes(),
            parent.getStageStats(),
            parent.getTasks(),
            childStages,
            parent.getFailureCause());
}
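This overload rebuilds the parent/child tree from a flat map of per-stage infos plus the recorded stageLinkages. A hedged sketch of the call site (stages and rootStageId are assumed to be fields of the scheduler in this version):

Map<StageId, StageInfo> stageInfos = stages.values().stream()
        .map(SqlStageExecution::getStageInfo)
        .collect(toImmutableMap(StageInfo::getStageId, identity()));
StageInfo outputStageInfo = buildStageInfo(rootStageId, stageInfos);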
Use of com.facebook.presto.execution.StageId in project presto by prestodb.
The class SqlQueryScheduler, method createStages:
private List<SqlStageExecution> createStages(
        Optional<SqlStageExecution> parent,
        AtomicInteger nextStageId,
        LocationFactory locationFactory,
        StageExecutionPlan plan,
        NodeScheduler nodeScheduler,
        RemoteTaskFactory remoteTaskFactory,
        Session session,
        int splitBatchSize,
        Function<PartitioningHandle, NodePartitionMap> partitioningCache,
        ExecutorService executor,
        NodeTaskMap nodeTaskMap,
        ImmutableMap.Builder<StageId, StageScheduler> stageSchedulers,
        ImmutableMap.Builder<StageId, StageLinkage> stageLinkages)
{
    ImmutableList.Builder<SqlStageExecution> stages = ImmutableList.builder();
    StageId stageId = new StageId(queryStateMachine.getQueryId(), nextStageId.getAndIncrement());
    SqlStageExecution stage = new SqlStageExecution(
            stageId,
            locationFactory.createStageLocation(stageId),
            plan.getFragment(),
            remoteTaskFactory,
            session,
            summarizeTaskInfo,
            nodeTaskMap,
            executor,
            schedulerStats);
    stages.add(stage);
    Optional<int[]> bucketToPartition;
    PartitioningHandle partitioningHandle = plan.getFragment().getPartitioning();
    if (partitioningHandle.equals(SOURCE_DISTRIBUTION)) {
        // nodes are selected dynamically based on the constraints of the splits and the system load
        Entry<PlanNodeId, SplitSource> entry = Iterables.getOnlyElement(plan.getSplitSources().entrySet());
        ConnectorId connectorId = entry.getValue().getConnectorId();
        if (isInternalSystemConnector(connectorId)) {
            connectorId = null;
        }
        NodeSelector nodeSelector = nodeScheduler.createNodeSelector(connectorId);
        SplitPlacementPolicy placementPolicy = new DynamicSplitPlacementPolicy(nodeSelector, stage::getAllTasks);
        stageSchedulers.put(stageId, new SourcePartitionedScheduler(stage, entry.getKey(), entry.getValue(), placementPolicy, splitBatchSize));
        bucketToPartition = Optional.of(new int[1]);
    }
    else {
        // nodes are predetermined by the nodePartitionMap
        NodePartitionMap nodePartitionMap = partitioningCache.apply(plan.getFragment().getPartitioning());
        Map<PlanNodeId, SplitSource> splitSources = plan.getSplitSources();
        if (!splitSources.isEmpty()) {
            stageSchedulers.put(stageId, new FixedSourcePartitionedScheduler(
                    stage,
                    splitSources,
                    plan.getFragment().getPartitionedSources(),
                    nodePartitionMap,
                    splitBatchSize,
                    nodeScheduler.createNodeSelector(null)));
            bucketToPartition = Optional.of(nodePartitionMap.getBucketToPartition());
        }
        else {
            Map<Integer, Node> partitionToNode = nodePartitionMap.getPartitionToNode();
            // todo this should asynchronously wait a standard timeout period before failing
            checkCondition(!partitionToNode.isEmpty(), NO_NODES_AVAILABLE, "No worker nodes available");
            stageSchedulers.put(stageId, new FixedCountScheduler(stage, partitionToNode));
            bucketToPartition = Optional.of(nodePartitionMap.getBucketToPartition());
        }
    }
    ImmutableSet.Builder<SqlStageExecution> childStagesBuilder = ImmutableSet.builder();
    for (StageExecutionPlan subStagePlan : plan.getSubStages()) {
        List<SqlStageExecution> subTree = createStages(
                Optional.of(stage),
                nextStageId,
                locationFactory,
                subStagePlan.withBucketToPartition(bucketToPartition),
                nodeScheduler,
                remoteTaskFactory,
                session,
                splitBatchSize,
                partitioningCache,
                executor,
                nodeTaskMap,
                stageSchedulers,
                stageLinkages);
        stages.addAll(subTree);
        SqlStageExecution childStage = subTree.get(0);
        childStagesBuilder.add(childStage);
    }
    Set<SqlStageExecution> childStages = childStagesBuilder.build();
    stage.addStateChangeListener(newState -> {
        if (newState.isDone()) {
            childStages.forEach(SqlStageExecution::cancel);
        }
    });
    stageLinkages.put(stageId, new StageLinkage(plan.getFragment().getId(), parent, childStages));
    return stages.build();
}
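The recursion is seeded once per query from the root plan, and since the current stage is always added to the builder first, the caller can treat the first element of the returned list as the root stage. A hedged sketch of the call site (rootStagePlan is a hypothetical name for the root StageExecutionPlan):

ImmutableMap.Builder<StageId, StageScheduler> stageSchedulers = ImmutableMap.builder();
ImmutableMap.Builder<StageId, StageLinkage> stageLinkages = ImmutableMap.builder();
List<SqlStageExecution> stages = createStages(
        Optional.empty(),     // the root stage has no parent
        new AtomicInteger(0), // stage ids are dense within the query: query.0, query.1, ...
        locationFactory,
        rootStagePlan,
        nodeScheduler,
        remoteTaskFactory,
        session,
        splitBatchSize,
        partitioningCache,
        executor,
        nodeTaskMap,
        stageSchedulers,
        stageLinkages);
SqlStageExecution rootStage = stages.get(0);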
Use of com.facebook.presto.execution.StageId in project presto by prestodb.
The class TestSourcePartitionedScheduler, method createSqlStageExecution:
private SqlStageExecution createSqlStageExecution(StageExecutionPlan tableScanPlan, NodeTaskMap nodeTaskMap)
{
    StageId stageId = new StageId(new QueryId("query"), 0);
    SqlStageExecution stage = new SqlStageExecution(
            stageId,
            locationFactory.createStageLocation(stageId),
            tableScanPlan.getFragment(),
            new MockRemoteTaskFactory(executor),
            TEST_SESSION,
            true,
            nodeTaskMap,
            executor,
            new SplitSchedulerStats());
    stage.setOutputBuffers(createInitialEmptyOutputBuffers(PARTITIONED)
            .withBuffer(OUT, 0)
            .withNoMoreBufferIds());
    return stage;
}
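A test would typically wire the created stage to a scheduler and drive it; a hedged sketch (createPlan and newScheduler stand in for the test class's own fixtures):

NodeTaskMap nodeTaskMap = new NodeTaskMap(new FinalizerService());
StageExecutionPlan plan = createPlan();               // hypothetical test fixture
SqlStageExecution stage = createSqlStageExecution(plan, nodeTaskMap);
StageScheduler scheduler = newScheduler(plan, stage); // hypothetical wiring
ScheduleResult result = scheduler.schedule();         // place splits on nodes
stage.abort();                                        // clean up the stage when done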