Use of io.prestosql.split.SplitSource in project hetu-core by openlookeng.
The class DistributedExecutionPlanner, method plan.
public StageExecutionPlan plan(SubPlan root, Session session, Mode mode, Long resumeSnapshotId, long nextSnapshotId) {
    ImmutableList.Builder<SplitSource> allSplitSources = ImmutableList.builder();
    try {
        if (mode != Mode.SNAPSHOT) {
            return doPlan(mode, root, session, resumeSnapshotId, nextSnapshotId, allSplitSources, null, null, null);
        }
        // Capture dependencies among table scan sources. Only need to do this for the initial planning.
        // The leftmost source of each fragment. Key is fragment id; value is SplitSource or ValuesNode or RemoteSourceNode
        Map<PlanFragmentId, Object> leftmostSources = new HashMap<>();
        // Source dependency. Key is SplitSource or ValuesNode or RemoteSourceNode; value is SplitSource or ValuesNode or RemoteSourceNode
        Multimap<Object, Object> sourceDependencies = HashMultimap.create();
        // List of sources from the same union. Values are SplitSource or ValuesNode or RemoteSourceNode.
        List<List<Object>> unionSources = new ArrayList<>();
        StageExecutionPlan ret = doPlan(mode, root, session, resumeSnapshotId, nextSnapshotId, allSplitSources, leftmostSources, sourceDependencies, unionSources);
        for (Map.Entry<Object, Object> entry : sourceDependencies.entries()) {
            List<MarkerSplitSource> right = collectSources(leftmostSources, entry.getValue());
            for (MarkerSplitSource source : collectSources(leftmostSources, entry.getKey())) {
                for (SplitSource dependency : right) {
                    source.addDependency((MarkerSplitSource) dependency);
                }
            }
        }
        List<MarkerSplitSource> sources = new ArrayList<>();
        for (List<Object> union : unionSources) {
            sources.clear();
            for (Object unionSource : union) {
                sources.addAll(collectSources(leftmostSources, unionSource));
            }
            // Adding all these sources as "union dependencies" for each other, to make sure they produce the same set of markers.
            for (MarkerSplitSource source : sources) {
                source.addUnionSources(sources);
            }
        }
        return ret;
    }
    catch (Throwable t) {
        allSplitSources.build().forEach(DistributedExecutionPlanner::closeSplitSource);
        throw t;
    }
}
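The SplitSource detail worth noting here is the cleanup path: every source created during planning is registered in allSplitSources, and if planning throws, each registered source is closed before the error propagates. A minimal sketch of that pattern, assuming only that SplitSource.close() takes no arguments and throws no checked exception (as the other snippets on this page show); planWithCleanup and the planVisitor Runnable are hypothetical stand-ins, not project APIs:

// Minimal sketch (not the project's API): close every registered SplitSource if planning fails.
// Needs java.util.List, com.google.common.collect.ImmutableList and io.prestosql.split.SplitSource.
public static List<SplitSource> planWithCleanup(Runnable planVisitor, ImmutableList.Builder<SplitSource> allSplitSources) {
    try {
        planVisitor.run(); // assumed to add every SplitSource it creates to allSplitSources
        return allSplitSources.build();
    }
    catch (Throwable t) {
        // close everything created so far, then let the original failure propagate
        allSplitSources.build().forEach(SplitSource::close);
        throw t;
    }
}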
Use of io.prestosql.split.SplitSource in project hetu-core by openlookeng.
The class TestSourcePartitionedScheduler, method getSourcePartitionedScheduler.
private static StageScheduler getSourcePartitionedScheduler(StageExecutionPlan plan, SqlStageExecution stage, InternalNodeManager nodeManager, NodeTaskMap nodeTaskMap, int splitBatchSize) {
    NodeSchedulerConfig nodeSchedulerConfig = new NodeSchedulerConfig()
            .setIncludeCoordinator(false)
            .setMaxSplitsPerNode(20)
            .setMaxPendingSplitsPerTask(0);
    NodeScheduler nodeScheduler = new NodeScheduler(new LegacyNetworkTopology(), nodeManager, nodeSchedulerConfig, nodeTaskMap);
    PlanNodeId sourceNode = Iterables.getOnlyElement(plan.getSplitSources().keySet());
    SplitSource splitSource = Iterables.getOnlyElement(plan.getSplitSources().values());
    SplitPlacementPolicy placementPolicy = new DynamicSplitPlacementPolicy(nodeScheduler.createNodeSelector(splitSource.getCatalogName(), false, null), stage::getAllTasks);
    return newSourcePartitionedSchedulerAsStageScheduler(stage, sourceNode, splitSource, placementPolicy, splitBatchSize, session, new HeuristicIndexerManager(new FileSystemClientManager(), new HetuMetaStoreManager()));
}
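A hypothetical test invocation (createPlanWithSingleSource and createStage are assumed fixtures, not APIs shown on this page) would wire the helper up as follows; the last argument caps how many splits one SplitSource.getNextBatch call may return:

// Hypothetical test wiring; the two create* fixtures are assumptions, not real hetu-core APIs.
StageExecutionPlan plan = createPlanWithSingleSource();
SqlStageExecution stage = createStage(plan);
StageScheduler scheduler = getSourcePartitionedScheduler(plan, stage, nodeManager, nodeTaskMap, 100);
// The scheduler now pulls splits from the plan's only SplitSource, at most 100 per batch.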
Use of io.prestosql.split.SplitSource in project hetu-core by openlookeng.
The class ExtractSpatialJoins, method loadKdbTree.
private static KdbTree loadKdbTree(String tableName, Session session, Metadata metadata, SplitManager splitManager, PageSourceManager pageSourceManager, PlanNodeId nodeId) {
    QualifiedObjectName name = toQualifiedObjectName(tableName, session.getCatalog().get(), session.getSchema().get());
    TableHandle tableHandle = metadata.getTableHandle(session, name).orElseThrow(() -> new PrestoException(INVALID_SPATIAL_PARTITIONING, format("Table not found: %s", name)));
    Map<String, ColumnHandle> columnHandles = metadata.getColumnHandles(session, tableHandle);
    List<ColumnHandle> visibleColumnHandles = columnHandles.values().stream()
            .filter(handle -> !metadata.getColumnMetadata(session, tableHandle, handle).isHidden())
            .collect(toImmutableList());
    checkSpatialPartitioningTable(visibleColumnHandles.size() == 1, "Expected single column for table %s, but found %s columns", name, columnHandles.size());
    ColumnHandle kdbTreeColumn = Iterables.getOnlyElement(visibleColumnHandles);
    Optional<KdbTree> kdbTree = Optional.empty();
    try (SplitSource splitSource = splitManager.getSplits(session, tableHandle, UNGROUPED_SCHEDULING, null, Optional.empty(), Collections.emptyMap(), ImmutableSet.of(), false, nodeId)) {
        while (!Thread.currentThread().isInterrupted()) {
            SplitBatch splitBatch = getFutureValue(splitSource.getNextBatch(NOT_PARTITIONED, Lifespan.taskWide(), 1000));
            List<Split> splits = splitBatch.getSplits();
            for (Split split : splits) {
                try (ConnectorPageSource pageSource = pageSourceManager.createPageSource(session, split, tableHandle, ImmutableList.of(kdbTreeColumn), Optional.empty())) {
                    do {
                        getFutureValue(pageSource.isBlocked());
                        Page page = pageSource.getNextPage();
                        if (page != null && page.getPositionCount() > 0) {
                            checkSpatialPartitioningTable(!kdbTree.isPresent(), "Expected exactly one row for table %s, but found more", name);
                            checkSpatialPartitioningTable(page.getPositionCount() == 1, "Expected exactly one row for table %s, but found %s rows", name, page.getPositionCount());
                            String kdbTreeJson = VARCHAR.getSlice(page.getBlock(0), 0).toStringUtf8();
                            try {
                                kdbTree = Optional.of(KdbTreeUtils.fromJson(kdbTreeJson));
                            }
                            catch (IllegalArgumentException e) {
                                checkSpatialPartitioningTable(false, "Invalid JSON string for KDB tree: %s", e.getMessage());
                            }
                        }
                    } while (!pageSource.isFinished());
                }
                catch (IOException e) {
                    throw new UncheckedIOException(e);
                }
            }
            if (splitBatch.isLastBatch()) {
                break;
            }
        }
    }
    checkSpatialPartitioningTable(kdbTree.isPresent(), "Expected exactly one row for table %s, but got none", name);
    return kdbTree.get();
}
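The loop above is the general pattern for draining a SplitSource outside the scheduler: request batches with getNextBatch until one reports isLastBatch(), then close the source (here via try-with-resources). A stripped-down sketch of just that loop, assuming the same getNextBatch(partitionHandle, lifespan, maxSize) signature and static imports used above:

// Minimal sketch: collect every split from a SplitSource, at most 1000 per batch.
private static List<Split> drainSplits(SplitSource splitSource) {
    ImmutableList.Builder<Split> splits = ImmutableList.builder();
    try (SplitSource source = splitSource) {
        while (true) {
            SplitBatch batch = getFutureValue(source.getNextBatch(NOT_PARTITIONED, Lifespan.taskWide(), 1000));
            splits.addAll(batch.getSplits());
            if (batch.isLastBatch()) {
                break;
            }
        }
    }
    return splits.build();
}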
Use of io.prestosql.split.SplitSource in project hetu-core by openlookeng.
The class SqlQueryScheduler, method createStages.
private List<SqlStageExecution> createStages(ExchangeLocationsConsumer parent, AtomicInteger nextStageId, LocationFactory locationFactory, StageExecutionPlan plan, NodeScheduler nodeScheduler, RemoteTaskFactory remoteTaskFactory, Session session, int splitBatchSize, BiFunction<PartitioningHandle, Integer, NodePartitionMap> partitioningCache, NodePartitioningManager nodePartitioningManager, ExecutorService queryExecutor, ScheduledExecutorService schedulerExecutor, FailureDetector failureDetector, NodeTaskMap nodeTaskMap, ImmutableMap.Builder<StageId, StageScheduler> stageSchedulers, ImmutableMap.Builder<StageId, StageLinkage> stageLinkages, boolean isSnapshotEnabled, QuerySnapshotManager snapshotManager, Map<StageId, Integer> stageTaskCounts) {
    ImmutableList.Builder<SqlStageExecution> localStages = ImmutableList.builder();
    StageId stageId = new StageId(queryStateMachine.getQueryId(), nextStageId.getAndIncrement());
    SqlStageExecution stageExecution = createSqlStageExecution(stageId, locationFactory.createStageLocation(stageId), plan.getFragment(), plan.getTables(), remoteTaskFactory, session, summarizeTaskInfo, nodeTaskMap, queryExecutor, failureDetector, schedulerStats, dynamicFilterService, snapshotManager);
    localStages.add(stageExecution);
    Optional<int[]> bucketToPartition;
    PartitioningHandle partitioningHandle = plan.getFragment().getPartitioning();
    boolean keepConsumerOnFeederNodes = !plan.getFragment().getFeederCTEId().isPresent() && plan.getFragment().getFeederCTEParentId().isPresent();
    if (partitioningHandle.equals(SOURCE_DISTRIBUTION)) {
        // nodes are selected dynamically based on the constraints of the splits and the system load
        Entry<PlanNodeId, SplitSource> entry = Iterables.getOnlyElement(plan.getSplitSources().entrySet());
        PlanNodeId planNodeId = entry.getKey();
        SplitSource splitSource = entry.getValue();
        CatalogName catalogName = splitSource.getCatalogName();
        if (isInternalSystemConnector(catalogName)) {
            catalogName = null;
        }
        NodeSelector nodeSelector = nodeScheduler.createNodeSelector(catalogName, keepConsumerOnFeederNodes, feederScheduledNodes);
        if (isSnapshotEnabled) {
            // When snapshot is enabled, no task can be added after the query has started running,
            // otherwise assumptions about how many "input channels" exist may be broken.
            nodeSelector.lockDownNodes();
        }
        SplitPlacementPolicy placementPolicy = new DynamicSplitPlacementPolicy(nodeSelector, stageExecution::getAllTasks);
        checkArgument(!plan.getFragment().getStageExecutionDescriptor().isStageGroupedExecution());
        stageSchedulers.put(stageId, newSourcePartitionedSchedulerAsStageScheduler(stageExecution, planNodeId, splitSource, placementPolicy, splitBatchSize, session, heuristicIndexerManager));
        bucketToPartition = Optional.of(new int[1]);
    }
    else if (partitioningHandle.equals(SCALED_WRITER_DISTRIBUTION)) {
        bucketToPartition = Optional.of(new int[1]);
    }
    else {
        Map<PlanNodeId, SplitSource> splitSources = plan.getSplitSources();
        if (!splitSources.isEmpty()) {
            // contains local source
            List<PlanNodeId> schedulingOrder = plan.getFragment().getPartitionedSources();
            CatalogName catalogName = partitioningHandle.getConnectorId().orElseThrow(IllegalStateException::new);
            List<ConnectorPartitionHandle> connectorPartitionHandles;
            boolean groupedExecutionForStage = plan.getFragment().getStageExecutionDescriptor().isStageGroupedExecution();
            if (groupedExecutionForStage) {
                connectorPartitionHandles = nodePartitioningManager.listPartitionHandles(session, partitioningHandle);
                checkState(!ImmutableList.of(NOT_PARTITIONED).equals(connectorPartitionHandles));
            }
            else {
                connectorPartitionHandles = ImmutableList.of(NOT_PARTITIONED);
            }
            BucketNodeMap bucketNodeMap;
            List<InternalNode> stageNodeList;
            if (plan.getFragment().getRemoteSourceNodes().stream().allMatch(node -> node.getExchangeType() == REPLICATE)) {
                // no remote source
                boolean dynamicLifespanSchedule = plan.getFragment().getStageExecutionDescriptor().isDynamicLifespanSchedule();
                if (isSnapshotEnabled) {
                    NodeSelector nodeSelector = nodeScheduler.createNodeSelector(catalogName, keepConsumerOnFeederNodes, feederScheduledNodes);
                    int nodeCount;
                    if (stageTaskCounts != null) {
                        // Resuming: need to create the same number of tasks as the old stage.
                        nodeCount = stageTaskCounts.get(stageId);
                    }
                    else {
                        // Scheduling: reserve some nodes for resuming
                        nodeCount = calculateTaskCount(nodeSelector.selectableNodeCount());
                    }
                    stageNodeList = new ArrayList<>(nodeSelector.selectRandomNodes(nodeCount));
                    checkCondition(stageNodeList.size() == nodeCount, NO_NODES_AVAILABLE, "Snapshot: not enough worker nodes to resume expected number of tasks: " + nodeCount);
                    // Make sure bucketNodeMap uses the same node list
                    bucketNodeMap = nodePartitioningManager.getBucketNodeMap(session, partitioningHandle, dynamicLifespanSchedule, stageNodeList);
                }
                else {
                    bucketNodeMap = nodePartitioningManager.getBucketNodeMap(session, partitioningHandle, dynamicLifespanSchedule);
                    stageNodeList = new ArrayList<>(nodeScheduler.createNodeSelector(catalogName, keepConsumerOnFeederNodes, feederScheduledNodes).allNodes());
                }
                // verify execution is consistent with planner's decision on dynamic lifespan schedule
                verify(bucketNodeMap.isDynamic() == dynamicLifespanSchedule);
                Collections.shuffle(stageNodeList);
                bucketToPartition = Optional.empty();
            }
            else {
                // cannot use dynamic lifespan schedule
                verify(!plan.getFragment().getStageExecutionDescriptor().isDynamicLifespanSchedule());
                // remote source requires nodePartitionMap
                NodePartitionMap nodePartitionMap = partitioningCache.apply(plan.getFragment().getPartitioning(), stageTaskCounts == null ? null : stageTaskCounts.get(stageId));
                if (groupedExecutionForStage) {
                    checkState(connectorPartitionHandles.size() == nodePartitionMap.getBucketToPartition().length);
                }
                stageNodeList = nodePartitionMap.getPartitionToNode();
                bucketNodeMap = nodePartitionMap.asBucketNodeMap();
                bucketToPartition = Optional.of(nodePartitionMap.getBucketToPartition());
            }
            stageSchedulers.put(stageId, new FixedSourcePartitionedScheduler(stageExecution, splitSources, plan.getFragment().getStageExecutionDescriptor(), schedulingOrder, stageNodeList, bucketNodeMap, splitBatchSize, getConcurrentLifespansPerNode(session), nodeScheduler.createNodeSelector(catalogName, keepConsumerOnFeederNodes, feederScheduledNodes), connectorPartitionHandles, session, heuristicIndexerManager));
        }
        else {
            // all sources are remote
            NodePartitionMap nodePartitionMap = partitioningCache.apply(plan.getFragment().getPartitioning(), stageTaskCounts == null ? null : stageTaskCounts.get(stageId));
            List<InternalNode> partitionToNode = nodePartitionMap.getPartitionToNode();
            // todo this should asynchronously wait a standard timeout period before failing
            checkCondition(!partitionToNode.isEmpty(), NO_NODES_AVAILABLE, "No worker nodes available");
            stageSchedulers.put(stageId, new FixedCountScheduler(stageExecution, partitionToNode));
            bucketToPartition = Optional.of(nodePartitionMap.getBucketToPartition());
        }
    }
    ImmutableSet.Builder<SqlStageExecution> childStagesBuilder = ImmutableSet.builder();
    for (StageExecutionPlan subStagePlan : plan.getSubStages()) {
        if (visitedPlanFrags.contains(subStagePlan.getFragment().getId())) {
            continue;
        }
        visitedPlanFrags.add(subStagePlan.getFragment().getId());
        List<SqlStageExecution> subTree = createStages(stageExecution::addExchangeLocations, nextStageId, locationFactory, subStagePlan.withBucketToPartition(bucketToPartition), nodeScheduler, remoteTaskFactory, session, splitBatchSize, partitioningCache, nodePartitioningManager, queryExecutor, schedulerExecutor, failureDetector, nodeTaskMap, stageSchedulers, stageLinkages, isSnapshotEnabled, snapshotManager, stageTaskCounts);
        localStages.addAll(subTree);
        SqlStageExecution childStage = subTree.get(0);
        childStagesBuilder.add(childStage);
        Optional<RemoteSourceNode> parentNode = plan.getFragment().getRemoteSourceNodes().stream().filter(x -> x.getSourceFragmentIds().contains(childStage.getFragment().getId())).findAny();
        checkArgument(parentNode.isPresent(), "Couldn't find parent of a CTE node");
        childStage.setParentId(parentNode.get().getId());
    }
    Set<SqlStageExecution> childStages = childStagesBuilder.build();
    stageExecution.addStateChangeListener(newState -> {
        if (newState.isDone() && newState != StageState.RESCHEDULING) {
            // Snapshot: For "rescheduling", tasks are already cancelled (for resume)
            childStages.forEach(SqlStageExecution::cancel);
        }
    });
    stageLinkages.put(stageId, new StageLinkage(plan.getFragment().getId(), parent, childStages));
    if (partitioningHandle.equals(SCALED_WRITER_DISTRIBUTION)) {
        Supplier<Collection<TaskStatus>> sourceTasksProvider = () -> childStages.stream().map(SqlStageExecution::getAllTasks).flatMap(Collection::stream).map(RemoteTask::getTaskStatus).collect(toList());
        Supplier<Collection<TaskStatus>> writerTasksProvider = () -> stageExecution.getAllTasks().stream().map(RemoteTask::getTaskStatus).collect(toList());
        ScaledWriterScheduler scheduler = new ScaledWriterScheduler(stageExecution, sourceTasksProvider, writerTasksProvider, nodeScheduler.createNodeSelector(null, keepConsumerOnFeederNodes, feederScheduledNodes), schedulerExecutor, getWriterMinSize(session), isSnapshotEnabled, stageTaskCounts != null ? stageTaskCounts.get(stageId) : null);
        whenAllStages(childStages, StageState::isDone).addListener(scheduler::finish, directExecutor());
        stageSchedulers.put(stageId, scheduler);
    }
    return localStages.build();
}
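The SplitSource-specific work in this long method is concentrated in the SOURCE_DISTRIBUTION branch: the plan carries exactly one (PlanNodeId, SplitSource) entry, and the source's catalog name (nulled out for the internal system connector) drives node selection. A condensed sketch of that branch, reusing the locals of the method above rather than introducing a standalone example:

// Condensed sketch of the SOURCE_DISTRIBUTION branch; variable names follow the method above.
Entry<PlanNodeId, SplitSource> entry = Iterables.getOnlyElement(plan.getSplitSources().entrySet());
SplitSource splitSource = entry.getValue();
CatalogName catalogName = isInternalSystemConnector(splitSource.getCatalogName()) ? null : splitSource.getCatalogName();
NodeSelector nodeSelector = nodeScheduler.createNodeSelector(catalogName, keepConsumerOnFeederNodes, feederScheduledNodes);
SplitPlacementPolicy placementPolicy = new DynamicSplitPlacementPolicy(nodeSelector, stageExecution::getAllTasks);
stageSchedulers.put(stageId, newSourcePartitionedSchedulerAsStageScheduler(stageExecution, entry.getKey(), splitSource, placementPolicy, splitBatchSize, session, heuristicIndexerManager));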
Use of io.prestosql.split.SplitSource in project hetu-core by openlookeng.
The class SourcePartitionedScheduler, method schedule.
@Override
public synchronized ScheduleResult schedule(int maxSplitGroup) {
    dropListenersFromWhenFinishedOrNewLifespansAdded();
    int overallSplitAssignmentCount = 0;
    ImmutableSet.Builder<RemoteTask> overallNewTasks = ImmutableSet.builder();
    List<ListenableFuture<?>> overallBlockedFutures = new ArrayList<>();
    boolean anyBlockedOnPlacements = false;
    boolean anyBlockedOnNextSplitBatch = false;
    boolean anyNotBlocked = false;
    boolean applyFilter = isHeuristicIndexFilterEnabled(session) && SplitFiltering.isSplitFilterApplicable(stage);
    boolean initialMarker = false;
    for (Entry<Lifespan, ScheduleGroup> entry : scheduleGroups.entrySet()) {
        Lifespan lifespan = entry.getKey();
        ScheduleGroup scheduleGroup = entry.getValue();
        Set<Split> pendingSplits = scheduleGroup.pendingSplits;
        if (scheduleGroup.state == ScheduleGroupState.NO_MORE_SPLITS || scheduleGroup.state == ScheduleGroupState.DONE) {
            verify(scheduleGroup.nextSplitBatchFuture == null);
        }
        else if (pendingSplits.isEmpty()) {
            // try to get the next batch
            if (scheduleGroup.nextSplitBatchFuture == null) {
                scheduleGroup.nextSplitBatchFuture = splitSource.getNextBatch(scheduleGroup.partitionHandle, lifespan, splitBatchSize - pendingSplits.size());
                long start = System.nanoTime();
                addSuccessCallback(scheduleGroup.nextSplitBatchFuture, () -> stage.recordGetSplitTime(start));
            }
            if (scheduleGroup.nextSplitBatchFuture.isDone()) {
                SplitBatch nextSplits = getFutureValue(scheduleGroup.nextSplitBatchFuture);
                scheduleGroup.nextSplitBatchFuture = null;
                // add split filter to filter out splits that have no valid rows
                Pair<Optional<RowExpression>, Map<Symbol, ColumnHandle>> pair = SplitFiltering.getExpression(stage);
                if (SystemSessionProperties.isSnapshotEnabled(session)) {
                    List<Split> batchSplits = nextSplits.getSplits();
                    // Don't apply filter to MarkerSplit
                    if (batchSplits.size() == 1 && batchSplits.get(0).getConnectorSplit() instanceof MarkerSplit) {
                        applyFilter = false;
                    }
                }
                List<Split> filteredSplit = applyFilter ? SplitFiltering.getFilteredSplit(pair.getFirst(), SplitFiltering.getFullyQualifiedName(stage), pair.getSecond(), nextSplits, heuristicIndexerManager) : nextSplits.getSplits();
                // In case of ORC, small files/splits are grouped
                List<Split> groupedSmallFilesList = splitSource.groupSmallSplits(filteredSplit, lifespan, maxSplitGroup);
                filteredSplit = groupedSmallFilesList;
                pendingSplits.addAll(filteredSplit);
                if (nextSplits.isLastBatch()) {
                    if (scheduleGroup.state == ScheduleGroupState.INITIALIZED && pendingSplits.isEmpty()) {
                        // Add an empty split in case no splits have been produced for the source.
                        // For source operators, they never take input, but they may produce output.
                        // This is well handled by the Presto execution engine.
                        // However, there are certain non-source operators that may produce output without any input,
                        // for example, 1) an AggregationOperator, 2) a HashAggregationOperator where one of the grouping sets is ().
                        // Scheduling an empty split kicks off necessary driver instantiation to make this work.
                        pendingSplits.add(new Split(splitSource.getCatalogName(), new EmptySplit(splitSource.getCatalogName()), lifespan));
                    }
                    scheduleGroup.state = ScheduleGroupState.NO_MORE_SPLITS;
                }
            }
            else {
                overallBlockedFutures.add(scheduleGroup.nextSplitBatchFuture);
                anyBlockedOnNextSplitBatch = true;
                continue;
            }
        }
        Multimap<InternalNode, Split> splitAssignment = ImmutableMultimap.of();
        if (!pendingSplits.isEmpty()) {
            if (!scheduleGroup.placementFuture.isDone()) {
                anyBlockedOnPlacements = true;
                continue;
            }
            if (scheduleGroup.state == ScheduleGroupState.INITIALIZED) {
                scheduleGroup.state = ScheduleGroupState.SPLITS_ADDED;
            }
            if (state == State.INITIALIZED) {
                state = State.SPLITS_ADDED;
            }
            // calculate placements for splits
            SplitPlacementResult splitPlacementResult;
            if (stage.isThrottledSchedule()) {
                // If asked for a partial schedule because of limited resources, schedule only 10% of the splits.
                // The 10% is calculated from the initial number of splits and reused on subsequent schedules;
                // if 10% of the currently pending splits later exceeds that earlier value, the larger of the two is scheduled.
                // If throttledSplitsCount is more than the number of pending splits, all of them are scheduled.
                throttledSplitsCount = Math.max((int) Math.ceil(pendingSplits.size() * ALLOWED_PERCENT_LIMIT), throttledSplitsCount);
                splitPlacementResult = splitPlacementPolicy.computeAssignments(ImmutableSet.copyOf(Iterables.limit(pendingSplits, throttledSplitsCount)), this.stage);
            }
            else {
                splitPlacementResult = splitPlacementPolicy.computeAssignments(new HashSet<>(pendingSplits), this.stage);
            }
            splitAssignment = splitPlacementResult.getAssignments();
            if (SystemSessionProperties.isSnapshotEnabled(session)) {
                Split firstSplit = pendingSplits.iterator().next();
                if (pendingSplits.size() == 1 && firstSplit.getConnectorSplit() instanceof MarkerSplit) {
                    // We'll create a new assignment, but still need to call computeAssignments above, and cannot modify the returned assignment map directly
                    splitAssignment = HashMultimap.create(splitAssignment);
                    splitAssignment.values().remove(firstSplit);
                    // Get all internal nodes and assign the marker split to all of them.
                    List<InternalNode> allNodes = splitPlacementPolicy.allNodes();
                    for (InternalNode node : allNodes) {
                        splitAssignment.put(node, firstSplit);
                    }
                    MarkerSplit markerSplit = (MarkerSplit) firstSplit.getConnectorSplit();
                    // For a resuming marker or the very first marker (snapshot id 0),
                    // set the flag below to true, so stages enter SCHEDULING_SPLITS state.
                    if (markerSplit.isResuming() || markerSplit.getSnapshotId() == 0) {
                        initialMarker = true;
                    }
                }
                else {
                    // MarkerSplit should be in its own batch.
                    verify(pendingSplits.stream().noneMatch(split -> split.getConnectorSplit() instanceof MarkerSplit));
                }
            }
            // remove splits with successful placements
            // AbstractSet.removeAll performs terribly here.
            splitAssignment.values().forEach(pendingSplits::remove);
            overallSplitAssignmentCount += splitAssignment.size();
            // if not completely placed, mark scheduleGroup as blocked on placement
            if (!pendingSplits.isEmpty()) {
                scheduleGroup.placementFuture = splitPlacementResult.getBlocked();
                overallBlockedFutures.add(scheduleGroup.placementFuture);
                anyBlockedOnPlacements = true;
            }
        }
        // if no new splits will be assigned, update state and attach completion event
        Multimap<InternalNode, Lifespan> noMoreSplitsNotification = ImmutableMultimap.of();
        if (pendingSplits.isEmpty() && scheduleGroup.state == ScheduleGroupState.NO_MORE_SPLITS) {
            scheduleGroup.state = ScheduleGroupState.DONE;
            if (!lifespan.isTaskWide()) {
                InternalNode node = ((BucketedSplitPlacementPolicy) splitPlacementPolicy).getNodeForBucket(lifespan.getId());
                noMoreSplitsNotification = ImmutableMultimap.of(node, lifespan);
            }
        }
        // assign the splits with successful placements
        overallNewTasks.addAll(assignSplits(splitAssignment, noMoreSplitsNotification));
        // As a result, to avoid busy loops caused by 1, we check pendingSplits.isEmpty() instead of placementFuture.isDone() here.
        if (scheduleGroup.nextSplitBatchFuture == null && scheduleGroup.pendingSplits.isEmpty() && scheduleGroup.state != ScheduleGroupState.DONE) {
            anyNotBlocked = true;
        }
    }
    // Next time it invokes getNextBatch, it will realize that. However, the invocation will fail if we tear down splitSource now.
    if ((state == State.NO_MORE_SPLITS || state == State.FINISHED) || (noMoreScheduleGroups && scheduleGroups.isEmpty() && splitSource.isFinished())) {
        switch (state) {
            case INITIALIZED:
                // But this shouldn't be possible. See usage of EmptySplit in this method.
                throw new IllegalStateException("At least 1 split should have been scheduled for this plan node");
            case SPLITS_ADDED:
                state = State.NO_MORE_SPLITS;
                splitSource.close();
                // fall through
            case NO_MORE_SPLITS:
                state = State.FINISHED;
                whenFinishedOrNewLifespanAdded.set(null);
                // fall through
            case FINISHED:
                return new ScheduleResult(true, overallNewTasks.build(), overallSplitAssignmentCount);
            default:
                throw new IllegalStateException("Unknown state");
        }
    }
    if (anyNotBlocked) {
        if (initialMarker) {
            stage.transitionToSchedulingSplits();
        }
        return new ScheduleResult(false, overallNewTasks.build(), overallSplitAssignmentCount);
    }
    if (anyBlockedOnPlacements || groupedExecution) {
        // In a broadcast join, output buffers of the tasks in the build source stage have to
        // hold onto all data produced before probe side task scheduling finishes,
        // even if the data is acknowledged by all known consumers. This is because
        // new consumers may be added until the probe side task scheduling finishes.
        //
        // As a result, the following line is necessary to prevent deadlock
        // where neither the build nor the probe side can make any progress.
        // The build side blocks due to a full output buffer.
        // In the meantime the probe side split cannot be consumed since
        // the build side hash table construction has not finished.
        overallNewTasks.addAll(finalizeTaskCreationIfNecessary());
    }
    ScheduleResult.BlockedReason blockedReason;
    if (anyBlockedOnNextSplitBatch) {
        blockedReason = anyBlockedOnPlacements ? MIXED_SPLIT_QUEUES_FULL_AND_WAITING_FOR_SOURCE : WAITING_FOR_SOURCE;
    }
    else {
        blockedReason = anyBlockedOnPlacements ? SPLIT_QUEUES_FULL : NO_ACTIVE_DRIVER_GROUP;
    }
    overallBlockedFutures.add(whenFinishedOrNewLifespanAdded);
    return new ScheduleResult(false, overallNewTasks.build(), nonCancellationPropagating(whenAnyComplete(overallBlockedFutures)), blockedReason, overallSplitAssignmentCount);
}
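The scheduler drives the SplitSource asynchronously: getNextBatch returns a future, a success callback records how long the fetch took, and the source is only closed after the last batch has been observed and the stage leaves SPLITS_ADDED. A reduced sketch of that request path, using the same fields and helpers as the method above:

// Reduced sketch of the asynchronous batch request; names follow the method above.
if (scheduleGroup.nextSplitBatchFuture == null) {
    scheduleGroup.nextSplitBatchFuture = splitSource.getNextBatch(scheduleGroup.partitionHandle, lifespan, splitBatchSize);
    long start = System.nanoTime();
    addSuccessCallback(scheduleGroup.nextSplitBatchFuture, () -> stage.recordGetSplitTime(start));
}
if (scheduleGroup.nextSplitBatchFuture.isDone()) {
    SplitBatch batch = getFutureValue(scheduleGroup.nextSplitBatchFuture);
    scheduleGroup.nextSplitBatchFuture = null;
    pendingSplits.addAll(batch.getSplits());
    if (batch.isLastBatch()) {
        scheduleGroup.state = ScheduleGroupState.NO_MORE_SPLITS;
    }
}
// splitSource.close() is called exactly once, in the SPLITS_ADDED -> NO_MORE_SPLITS transition shown above.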