Use of com.facebook.presto.sql.planner.SystemPartitioningHandle.SCALED_WRITER_DISTRIBUTION in project presto by prestodb.
The class SectionExecutionFactory, method createStageScheduler:
private StageScheduler createStageScheduler(
        SplitSourceFactory splitSourceFactory,
        Session session,
        StreamingSubPlan plan,
        Function<PartitioningHandle, NodePartitionMap> partitioningCache,
        Optional<SqlStageExecution> parentStageExecution,
        StageId stageId,
        SqlStageExecution stageExecution,
        PartitioningHandle partitioningHandle,
        TableWriteInfo tableWriteInfo,
        Set<SqlStageExecution> childStageExecutions)
{
    Map<PlanNodeId, SplitSource> splitSources = splitSourceFactory.createSplitSources(plan.getFragment(), session, tableWriteInfo);
    int maxTasksPerStage = getMaxTasksPerStage(session);
    if (partitioningHandle.equals(SOURCE_DISTRIBUTION)) {
        // nodes are selected dynamically based on the constraints of the splits and the system load
        Map.Entry<PlanNodeId, SplitSource> entry = getOnlyElement(splitSources.entrySet());
        PlanNodeId planNodeId = entry.getKey();
        SplitSource splitSource = entry.getValue();
        ConnectorId connectorId = splitSource.getConnectorId();
        if (isInternalSystemConnector(connectorId)) {
            connectorId = null;
        }
        NodeSelector nodeSelector = nodeScheduler.createNodeSelector(session, connectorId, maxTasksPerStage);
        SplitPlacementPolicy placementPolicy = new DynamicSplitPlacementPolicy(nodeSelector, stageExecution::getAllTasks);
        checkArgument(!plan.getFragment().getStageExecutionDescriptor().isStageGroupedExecution());
        return newSourcePartitionedSchedulerAsStageScheduler(stageExecution, planNodeId, splitSource, placementPolicy, splitBatchSize);
    } else if (partitioningHandle.equals(SCALED_WRITER_DISTRIBUTION)) {
        // statuses of the upstream (producing) tasks and of this stage's own writer
        // tasks; the scheduler polls both to decide when to add more writers
        Supplier<Collection<TaskStatus>> sourceTasksProvider = () -> childStageExecutions.stream()
                .map(SqlStageExecution::getAllTasks)
                .flatMap(Collection::stream)
                .map(RemoteTask::getTaskStatus)
                .collect(toList());
        Supplier<Collection<TaskStatus>> writerTasksProvider = () -> stageExecution.getAllTasks().stream()
                .map(RemoteTask::getTaskStatus)
                .collect(toList());
        ScaledWriterScheduler scheduler = new ScaledWriterScheduler(
                stageExecution,
                sourceTasksProvider,
                writerTasksProvider,
                nodeScheduler.createNodeSelector(session, null),
                scheduledExecutor,
                getWriterMinSize(session),
                isOptimizedScaleWriterProducerBuffer(session));
        // stop scaling once every child stage has finished producing
        whenAllStages(childStageExecutions, StageExecutionState::isDone).addListener(scheduler::finish, directExecutor());
        return scheduler;
    } else {
        if (!splitSources.isEmpty()) {
            // contains local source
            List<PlanNodeId> schedulingOrder = plan.getFragment().getTableScanSchedulingOrder();
            ConnectorId connectorId = partitioningHandle.getConnectorId().orElseThrow(IllegalStateException::new);
            List<ConnectorPartitionHandle> connectorPartitionHandles;
            boolean groupedExecutionForStage = plan.getFragment().getStageExecutionDescriptor().isStageGroupedExecution();
            if (groupedExecutionForStage) {
                connectorPartitionHandles = nodePartitioningManager.listPartitionHandles(session, partitioningHandle);
                checkState(!ImmutableList.of(NOT_PARTITIONED).equals(connectorPartitionHandles));
            } else {
                connectorPartitionHandles = ImmutableList.of(NOT_PARTITIONED);
            }
            BucketNodeMap bucketNodeMap;
            List<InternalNode> stageNodeList;
            if (plan.getFragment().getRemoteSourceNodes().stream().allMatch(node -> node.getExchangeType() == REPLICATE)) {
                // no non-replicated remote source
                boolean dynamicLifespanSchedule = plan.getFragment().getStageExecutionDescriptor().isDynamicLifespanSchedule();
                bucketNodeMap = nodePartitioningManager.getBucketNodeMap(session, partitioningHandle, dynamicLifespanSchedule);
                // verify execution is consistent with planner's decision on dynamic lifespan schedule
                verify(bucketNodeMap.isDynamic() == dynamicLifespanSchedule);
                if (bucketNodeMap.hasInitialMap()) {
                    stageNodeList = bucketNodeMap.getBucketToNode().get().stream()
                            .distinct()
                            .collect(toImmutableList());
                } else {
                    stageNodeList = new ArrayList<>(nodeScheduler.createNodeSelector(session, connectorId).selectRandomNodes(maxTasksPerStage));
                }
            } else {
                // cannot use dynamic lifespan schedule
                verify(!plan.getFragment().getStageExecutionDescriptor().isDynamicLifespanSchedule());
                // remote source requires nodePartitionMap
                NodePartitionMap nodePartitionMap = partitioningCache.apply(plan.getFragment().getPartitioning());
                if (groupedExecutionForStage) {
                    checkState(connectorPartitionHandles.size() == nodePartitionMap.getBucketToPartition().length);
                }
                stageNodeList = nodePartitionMap.getPartitionToNode();
                bucketNodeMap = nodePartitionMap.asBucketNodeMap();
            }
            FixedSourcePartitionedScheduler fixedSourcePartitionedScheduler = new FixedSourcePartitionedScheduler(
                    stageExecution,
                    splitSources,
                    plan.getFragment().getStageExecutionDescriptor(),
                    schedulingOrder,
                    stageNodeList,
                    bucketNodeMap,
                    splitBatchSize,
                    getConcurrentLifespansPerNode(session),
                    nodeScheduler.createNodeSelector(session, connectorId),
                    connectorPartitionHandles);
            if (plan.getFragment().getStageExecutionDescriptor().isRecoverableGroupedExecution()) {
                stageExecution.registerStageTaskRecoveryCallback(taskId -> {
                    checkArgument(taskId.getStageExecutionId().getStageId().equals(stageId), "The task did not execute this stage");
                    checkArgument(parentStageExecution.isPresent(), "Parent stage execution must exist");
                    checkArgument(parentStageExecution.get().getAllTasks().size() == 1, "Parent stage should only have one task for recoverable grouped execution");
                    parentStageExecution.get().removeRemoteSourceIfSingleTaskStage(taskId);
                    fixedSourcePartitionedScheduler.recover(taskId);
                });
            }
            return fixedSourcePartitionedScheduler;
        } else {
            // all sources are remote
            NodePartitionMap nodePartitionMap = partitioningCache.apply(plan.getFragment().getPartitioning());
            List<InternalNode> partitionToNode = nodePartitionMap.getPartitionToNode();
            // todo this should asynchronously wait a standard timeout period before failing
            checkCondition(!partitionToNode.isEmpty(), NO_NODES_AVAILABLE, "No worker nodes available");
            return new FixedCountScheduler(stageExecution, partitionToNode);
        }
    }
}
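
The SCALED_WRITER_DISTRIBUTION branch above wires two status feeds into ScaledWriterScheduler: one for the producing (child-stage) tasks and one for this stage's writer tasks. The scheduler starts with a single writer task and adds more only while the producers are backed up and the existing writers have each written enough data. Below is a minimal sketch of that feedback loop, not Presto's implementation: TaskStatusView, outputBufferUtilization(), and physicalWrittenBytes() are hypothetical stand-ins for the fields the real TaskStatus exposes, and writerMinSizeBytes plays the role of getWriterMinSize(session).

import java.util.Collection;
import java.util.function.Supplier;

final class ScaleUpHeuristic
{
    // Hypothetical read-only view of the two task-status fields the heuristic needs.
    interface TaskStatusView
    {
        double outputBufferUtilization();
        long physicalWrittenBytes();
    }

    private final Supplier<Collection<TaskStatusView>> sourceTasks;
    private final Supplier<Collection<TaskStatusView>> writerTasks;
    private final long writerMinSizeBytes;

    ScaleUpHeuristic(Supplier<Collection<TaskStatusView>> sourceTasks, Supplier<Collection<TaskStatusView>> writerTasks, long writerMinSizeBytes)
    {
        this.sourceTasks = sourceTasks;
        this.writerTasks = writerTasks;
        this.writerMinSizeBytes = writerMinSizeBytes;
    }

    // Add a writer only when (a) some producer is blocked on a mostly full
    // output buffer and (b) the current writers have on average absorbed at
    // least writerMinSizeBytes, so a trickle of data never scales up.
    boolean shouldAddWriter()
    {
        Collection<TaskStatusView> writers = writerTasks.get();
        boolean producersBackedUp = sourceTasks.get().stream()
                .anyMatch(task -> task.outputBufferUtilization() > 0.5);
        long averageWrittenPerWriter = writers.stream()
                .mapToLong(TaskStatusView::physicalWrittenBytes)
                .sum() / Math.max(1, writers.size());
        return producersBackedUp && averageWrittenPerWriter >= writerMinSizeBytes;
    }
}

Gating on both signals is the important design point: buffer pressure alone would add writers to a query that is merely slow upstream, while the written-bytes floor keeps small result sets from being fanned out into many tiny files.
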
Use of com.facebook.presto.sql.planner.SystemPartitioningHandle.SCALED_WRITER_DISTRIBUTION in project presto by prestodb.
The class LocalExecutionPlanner, method createOutputPartitioning:
private Optional<OutputPartitioning> createOutputPartitioning(TaskContext taskContext, PartitioningScheme partitioningScheme)
{
    if (partitioningScheme.getPartitioning().getHandle().equals(FIXED_BROADCAST_DISTRIBUTION) ||
            partitioningScheme.getPartitioning().getHandle().equals(FIXED_ARBITRARY_DISTRIBUTION) ||
            partitioningScheme.getPartitioning().getHandle().equals(SCALED_WRITER_DISTRIBUTION) ||
            partitioningScheme.getPartitioning().getHandle().equals(SINGLE_DISTRIBUTION) ||
            partitioningScheme.getPartitioning().getHandle().equals(COORDINATOR_DISTRIBUTION)) {
        // these system distributions are routed by the output buffers, so no partition function is needed
        return Optional.empty();
    }
    List<VariableReferenceExpression> outputLayout = partitioningScheme.getOutputLayout();
    // We can convert the variables directly into channels, because the root must be a sink and therefore the layout is fixed
    List<Integer> partitionChannels;
    List<Optional<ConstantExpression>> partitionConstants;
    List<Type> partitionChannelTypes;
    if (partitioningScheme.getHashColumn().isPresent()) {
        partitionChannels = ImmutableList.of(outputLayout.indexOf(partitioningScheme.getHashColumn().get()));
        partitionConstants = ImmutableList.of(Optional.empty());
        partitionChannelTypes = ImmutableList.of(BIGINT);
    } else {
        checkArgument(
                partitioningScheme.getPartitioning().getArguments().stream()
                        .allMatch(argument -> argument instanceof ConstantExpression || argument instanceof VariableReferenceExpression),
                format("Expect all partitioning arguments to be either ConstantExpression or VariableReferenceExpression, but get %s", partitioningScheme.getPartitioning().getArguments()));
        partitionChannels = partitioningScheme.getPartitioning().getArguments().stream()
                .map(argument -> {
                    if (argument instanceof ConstantExpression) {
                        return -1;
                    }
                    return outputLayout.indexOf(argument);
                })
                .collect(toImmutableList());
        partitionConstants = partitioningScheme.getPartitioning().getArguments().stream()
                .map(argument -> {
                    if (argument instanceof ConstantExpression) {
                        return Optional.of((ConstantExpression) argument);
                    }
                    return Optional.<ConstantExpression>empty();
                })
                .collect(toImmutableList());
        partitionChannelTypes = partitioningScheme.getPartitioning().getArguments().stream()
                .map(RowExpression::getType)
                .collect(toImmutableList());
    }
    PartitionFunction partitionFunction = nodePartitioningManager.getPartitionFunction(taskContext.getSession(), partitioningScheme, partitionChannelTypes);
    OptionalInt nullChannel = OptionalInt.empty();
    Set<VariableReferenceExpression> partitioningColumns = partitioningScheme.getPartitioning().getVariableReferences();
    // partitioningColumns expected to have one column in the normal case, and zero columns when partitioning on a constant
    checkArgument(!partitioningScheme.isReplicateNullsAndAny() || partitioningColumns.size() <= 1);
    if (partitioningScheme.isReplicateNullsAndAny() && partitioningColumns.size() == 1) {
        nullChannel = OptionalInt.of(outputLayout.indexOf(getOnlyElement(partitioningColumns)));
    }
    return Optional.of(new OutputPartitioning(partitionFunction, partitionChannels, partitionConstants, partitioningScheme.isReplicateNullsAndAny(), nullChannel));
}
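
In the non-hash-column case, each partitioning argument becomes either a real channel (the variable's index in the output layout) or the sentinel channel -1 paired with the constant it carries, kept in two parallel lists. Below is a minimal, self-contained sketch of that mapping, assuming plain strings and objects in place of VariableReferenceExpression and ConstantExpression; the Argument record and mapArguments helper are illustrative, not Presto APIs.

import java.util.ArrayList;
import java.util.List;
import java.util.Optional;

final class PartitionChannelMapping
{
    // Stand-in for a partitioning argument: exactly one of variable/constant is set.
    record Argument(String variable, Object constant)
    {
        boolean isConstant()
        {
            return constant != null;
        }
    }

    // Mirrors the derivation above: variables resolve to their index in the
    // output layout; constants get channel -1 and a filled constant slot.
    static void mapArguments(List<String> outputLayout, List<Argument> arguments, List<Integer> channels, List<Optional<Object>> constants)
    {
        for (Argument argument : arguments) {
            if (argument.isConstant()) {
                channels.add(-1);
                constants.add(Optional.of(argument.constant()));
            } else {
                channels.add(outputLayout.indexOf(argument.variable()));
                constants.add(Optional.empty());
            }
        }
    }

    public static void main(String[] args)
    {
        List<Integer> channels = new ArrayList<>();
        List<Optional<Object>> constants = new ArrayList<>();
        mapArguments(
                List.of("orderkey", "custkey"),
                List.of(new Argument("custkey", null), new Argument(null, 42)),
                channels,
                constants);
        System.out.println(channels); // [1, -1]
        System.out.println(constants); // [Optional[42], Optional.empty]
    }
}

The parallel lists keep the downstream PartitionFunction purely positional: at run time the partitioner reads the value for channel i from the page, except where the constant slot supplies the value instead.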