Search in sources :

Example 1 with REPLICATE

use of io.prestosql.sql.planner.plan.ExchangeNode.Type.REPLICATE in project hetu-core by openlookeng.

the class TestLogicalPlanner method testBroadcastCorrelatedSubqueryAvoidsRemoteExchangeBeforeAggregation.

/**
 * Plans two correlated scalar {@code count(*)} subqueries with the join distribution type set to
 * BROADCAST and FORCE_SINGLE_NODE_OUTPUT set to false, then checks each plan for:
 * a join whose build side contains a REMOTE REPLICATE exchange, exactly one REMOTE exchange in
 * the whole plan, and exactly one streamable aggregation grouped on the {@code unique} symbol.
 */
@Test
public void testBroadcastCorrelatedSubqueryAvoidsRemoteExchangeBeforeAggregation() {
    Session broadcastSession = Session.builder(this.getQueryRunner().getDefaultSession())
            .setSystemProperty(JOIN_DISTRIBUTION_TYPE, JoinDistributionType.BROADCAST.name())
            .setSystemProperty(FORCE_SINGLE_NODE_OUTPUT, Boolean.toString(false))
            .build();

    // Expected shape: a join with a table scan somewhere under the first source and a
    // remote replicating exchange (above a table scan) somewhere under the second source.
    PlanMatchPattern firstJoinSource = anyTree(node(TableScanNode.class));
    PlanMatchPattern replicatedJoinSource = anyTree(exchange(REMOTE, REPLICATE, anyTree(node(TableScanNode.class))));
    PlanMatchPattern expectedJoinShape = anyTree(node(JoinNode.class, firstJoinSource, replicatedJoinSource));

    // The plan must contain exactly one exchange whose scope is REMOTE.
    Consumer<Plan> exactlyOneRemoteExchange = plan -> assertEquals(
            countOfMatchingNodes(plan, candidate -> {
                if (!(candidate instanceof ExchangeNode)) {
                    return false;
                }
                return ((ExchangeNode) candidate).getScope() == REMOTE;
            }),
            1);

    // The plan must contain exactly one streamable aggregation that groups on "unique".
    Consumer<Plan> exactlyOneStreamingAggregation = plan -> assertEquals(
            countOfMatchingNodes(plan, candidate -> {
                if (!(candidate instanceof AggregationNode)) {
                    return false;
                }
                AggregationNode aggregation = (AggregationNode) candidate;
                return aggregation.getGroupingKeys().contains(new Symbol("unique")) && aggregation.isStreamable();
            }),
            1);

    // Remote-exchange check runs first, then the aggregation check, for every plan.
    Consumer<Plan> planValidator = exactlyOneRemoteExchange.andThen(exactlyOneStreamingAggregation);

    // region is unpartitioned, AssignUniqueId should provide satisfying partitioning for count(*) after LEFT JOIN
    assertPlanWithSession(
            "SELECT (SELECT count(*) FROM region r2 WHERE r2.regionkey > r1.regionkey) FROM region r1",
            broadcastSession,
            false,
            expectedJoinShape,
            planValidator);

    // orders is naturally partitioned, AssignUniqueId should not overwrite its natural partitioning
    assertPlanWithSession(
            "SELECT count(count) " + "FROM (SELECT o1.orderkey orderkey, (SELECT count(*) FROM orders o2 WHERE o2.orderkey > o1.orderkey) count FROM orders o1) " + "GROUP BY orderkey",
            broadcastSession,
            false,
            expectedJoinShape,
            planValidator);
}
Also used : REPLICATED(io.prestosql.spi.plan.JoinNode.DistributionType.REPLICATED) SortNode(io.prestosql.sql.planner.plan.SortNode) JoinDistributionType(io.prestosql.sql.analyzer.FeaturesConfig.JoinDistributionType) OPTIMIZE_HASH_GENERATION(io.prestosql.SystemSessionProperties.OPTIMIZE_HASH_GENERATION) PlanMatchPattern.markDistinct(io.prestosql.sql.planner.assertions.PlanMatchPattern.markDistinct) ValueSet(io.prestosql.spi.predicate.ValueSet) Test(org.testng.annotations.Test) PlanMatchPattern.singleGroupingSet(io.prestosql.sql.planner.assertions.PlanMatchPattern.singleGroupingSet) AggregationNode(io.prestosql.spi.plan.AggregationNode) JOIN_REORDERING_STRATEGY(io.prestosql.SystemSessionProperties.JOIN_REORDERING_STRATEGY) PlanMatchPattern.values(io.prestosql.sql.planner.assertions.PlanMatchPattern.values) Slices(io.airlift.slice.Slices) Map(java.util.Map) Domain.singleValue(io.prestosql.spi.predicate.Domain.singleValue) PlanMatchPattern.node(io.prestosql.sql.planner.assertions.PlanMatchPattern.node) Slices.utf8Slice(io.airlift.slice.Slices.utf8Slice) PlanMatchPattern.strictTableScan(io.prestosql.sql.planner.assertions.PlanMatchPattern.strictTableScan) PlanMatchPattern.expression(io.prestosql.sql.planner.assertions.PlanMatchPattern.expression) Assert.assertFalse(org.testng.Assert.assertFalse) PlanMatchPattern.join(io.prestosql.sql.planner.assertions.PlanMatchPattern.join) PlanMatchPattern.strictProject(io.prestosql.sql.planner.assertions.PlanMatchPattern.strictProject) PlanMatchPattern(io.prestosql.sql.planner.assertions.PlanMatchPattern) ImmutableList.toImmutableList(com.google.common.collect.ImmutableList.toImmutableList) TableScanNode(io.prestosql.spi.plan.TableScanNode) PlanNode(io.prestosql.spi.plan.PlanNode) ProjectNode(io.prestosql.spi.plan.ProjectNode) VarcharType.createVarcharType(io.prestosql.spi.type.VarcharType.createVarcharType) MorePredicates(io.prestosql.util.MorePredicates) 
CheckSubqueryNodesAreRewritten(io.prestosql.sql.planner.optimizations.CheckSubqueryNodesAreRewritten) LongLiteral(io.prestosql.sql.tree.LongLiteral) QueryTemplate(io.prestosql.tests.QueryTemplate) LOCAL(io.prestosql.sql.planner.plan.ExchangeNode.Scope.LOCAL) Domain(io.prestosql.spi.predicate.Domain) INNER(io.prestosql.spi.plan.JoinNode.Type.INNER) StatisticsWriterNode(io.prestosql.sql.planner.plan.StatisticsWriterNode) DistinctLimitNode(io.prestosql.sql.planner.plan.DistinctLimitNode) OPTIMIZED(io.prestosql.sql.planner.LogicalPlanner.Stage.OPTIMIZED) JOIN_DISTRIBUTION_TYPE(io.prestosql.SystemSessionProperties.JOIN_DISTRIBUTION_TYPE) QueryTemplate.queryTemplate(io.prestosql.tests.QueryTemplate.queryTemplate) PlanMatchPattern.equiJoinClause(io.prestosql.sql.planner.assertions.PlanMatchPattern.equiJoinClause) PlanMatchPattern.assignUniqueId(io.prestosql.sql.planner.assertions.PlanMatchPattern.assignUniqueId) REMOTE(io.prestosql.sql.planner.plan.ExchangeNode.Scope.REMOTE) GATHER(io.prestosql.sql.planner.plan.ExchangeNode.Type.GATHER) SINGLE(io.prestosql.spi.plan.AggregationNode.Step.SINGLE) REPARTITION(io.prestosql.sql.planner.plan.ExchangeNode.Type.REPARTITION) Session(io.prestosql.Session) PlanMatchPattern.anyTree(io.prestosql.sql.planner.assertions.PlanMatchPattern.anyTree) MoreCollectors.toOptional(com.google.common.collect.MoreCollectors.toOptional) PlanMatchPattern.output(io.prestosql.sql.planner.assertions.PlanMatchPattern.output) ExpressionMatcher(io.prestosql.sql.planner.assertions.ExpressionMatcher) PlanMatchPattern.anyNot(io.prestosql.sql.planner.assertions.PlanMatchPattern.anyNot) ValuesNode(io.prestosql.spi.plan.ValuesNode) DESCENDING(io.prestosql.sql.tree.SortItem.Ordering.DESCENDING) ColumnHandle(io.prestosql.spi.connector.ColumnHandle) LimitNode(io.prestosql.spi.plan.LimitNode) PlanOptimizer(io.prestosql.sql.planner.optimizations.PlanOptimizer) PlanMatchPattern.any(io.prestosql.sql.planner.assertions.PlanMatchPattern.any) 
PlanMatchPattern.aggregation(io.prestosql.sql.planner.assertions.PlanMatchPattern.aggregation) PlanMatchPattern.project(io.prestosql.sql.planner.assertions.PlanMatchPattern.project) PlanMatchPattern.tableScan(io.prestosql.sql.planner.assertions.PlanMatchPattern.tableScan) TpchColumnHandle(io.prestosql.plugin.tpch.TpchColumnHandle) PlanMatchPattern.semiJoin(io.prestosql.sql.planner.assertions.PlanMatchPattern.semiJoin) RowNumberSymbolMatcher(io.prestosql.sql.planner.assertions.RowNumberSymbolMatcher) ExchangeNode(io.prestosql.sql.planner.plan.ExchangeNode) FilterNode(io.prestosql.spi.plan.FilterNode) JoinReorderingStrategy(io.prestosql.sql.analyzer.FeaturesConfig.JoinReorderingStrategy) FORCE_SINGLE_NODE_OUTPUT(io.prestosql.SystemSessionProperties.FORCE_SINGLE_NODE_OUTPUT) ASC_NULLS_LAST(io.prestosql.spi.block.SortOrder.ASC_NULLS_LAST) ApplyNode(io.prestosql.sql.planner.plan.ApplyNode) ImmutableMap(com.google.common.collect.ImmutableMap) Predicate(java.util.function.Predicate) FINAL(io.prestosql.spi.plan.AggregationNode.Step.FINAL) IndexJoinNode(io.prestosql.sql.planner.plan.IndexJoinNode) FILTERING_SEMI_JOIN_TO_INNER(io.prestosql.SystemSessionProperties.FILTERING_SEMI_JOIN_TO_INNER) String.format(java.lang.String.format) PlanMatchPattern.sort(io.prestosql.sql.planner.assertions.PlanMatchPattern.sort) List(java.util.List) EnforceSingleRowNode(io.prestosql.sql.planner.plan.EnforceSingleRowNode) MorePredicates.isInstanceOfAny(io.prestosql.util.MorePredicates.isInstanceOfAny) TopNNode(io.prestosql.spi.plan.TopNNode) Entry(java.util.Map.Entry) Optional(java.util.Optional) PlanMatchPattern.topNRankingNumber(io.prestosql.sql.planner.assertions.PlanMatchPattern.topNRankingNumber) PlanMatchPattern.topN(io.prestosql.sql.planner.assertions.PlanMatchPattern.topN) PlanMatchPattern.constrainedTableScan(io.prestosql.sql.planner.assertions.PlanMatchPattern.constrainedTableScan) LAST(io.prestosql.sql.tree.SortItem.NullOrdering.LAST) 
TpchTableHandle(io.prestosql.plugin.tpch.TpchTableHandle) LateralJoinNode(io.prestosql.sql.planner.plan.LateralJoinNode) PlanMatchPattern.apply(io.prestosql.sql.planner.assertions.PlanMatchPattern.apply) Assert.assertEquals(org.testng.Assert.assertEquals) PARTITIONED(io.prestosql.spi.plan.JoinNode.DistributionType.PARTITIONED) PARTIAL(io.prestosql.spi.plan.AggregationNode.Step.PARTIAL) SemiJoinNode(io.prestosql.sql.planner.plan.SemiJoinNode) SUBQUERY_MULTIPLE_ROWS(io.prestosql.spi.StandardErrorCode.SUBQUERY_MULTIPLE_ROWS) ImmutableList(com.google.common.collect.ImmutableList) Range(io.prestosql.spi.predicate.Range) PlanMatchPattern.functionCall(io.prestosql.sql.planner.assertions.PlanMatchPattern.functionCall) PlanMatchPattern.filter(io.prestosql.sql.planner.assertions.PlanMatchPattern.filter) PlanMatchPattern.exchange(io.prestosql.sql.planner.assertions.PlanMatchPattern.exchange) REPLICATE(io.prestosql.sql.planner.plan.ExchangeNode.Type.REPLICATE) JoinNode(io.prestosql.spi.plan.JoinNode) Symbol(io.prestosql.spi.plan.Symbol) BasePlanTest(io.prestosql.sql.planner.assertions.BasePlanTest) ASCENDING(io.prestosql.sql.tree.SortItem.Ordering.ASCENDING) DISTRIBUTED_SORT(io.prestosql.SystemSessionProperties.DISTRIBUTED_SORT) TupleDomain(io.prestosql.spi.predicate.TupleDomain) PlanMatchPattern.limit(io.prestosql.sql.planner.assertions.PlanMatchPattern.limit) Consumer(java.util.function.Consumer) PlanNodeSearcher.searchFrom(io.prestosql.sql.planner.optimizations.PlanNodeSearcher.searchFrom) PlanMatchPattern.rowNumber(io.prestosql.sql.planner.assertions.PlanMatchPattern.rowNumber) AddLocalExchanges(io.prestosql.sql.planner.optimizations.AddLocalExchanges) LEFT(io.prestosql.spi.plan.JoinNode.Type.LEFT) PlanMatchPattern.constrainedTableScanWithTableLayout(io.prestosql.sql.planner.assertions.PlanMatchPattern.constrainedTableScanWithTableLayout) TableScanNode(io.prestosql.spi.plan.TableScanNode) ExchangeNode(io.prestosql.sql.planner.plan.ExchangeNode) 
IndexJoinNode(io.prestosql.sql.planner.plan.IndexJoinNode) LateralJoinNode(io.prestosql.sql.planner.plan.LateralJoinNode) SemiJoinNode(io.prestosql.sql.planner.plan.SemiJoinNode) JoinNode(io.prestosql.spi.plan.JoinNode) Symbol(io.prestosql.spi.plan.Symbol) PlanMatchPattern(io.prestosql.sql.planner.assertions.PlanMatchPattern) AggregationNode(io.prestosql.spi.plan.AggregationNode) Session(io.prestosql.Session) Test(org.testng.annotations.Test) BasePlanTest(io.prestosql.sql.planner.assertions.BasePlanTest)

Example 2 with REPLICATE

use of io.prestosql.sql.planner.plan.ExchangeNode.Type.REPLICATE in project hetu-core by openlookeng.

the class PlanFragmenter method analyzeGroupedExecution.

/**
 * Recursively rebuilds {@code subPlan}, switching each fragment to grouped execution when the
 * {@code GroupedExecutionTagger} visit of the fragment root reports the subtree as useful.
 *
 * <p>A dynamic bucket-to-node map is requested only when every remote source of the fragment
 * has exchange type REPLICATE (which also holds when the fragment has no remote sources) and
 * the session enables dynamic scheduling for grouped execution. The lifespan schedule that is
 * finally applied follows what the returned bucket node map reports, not the request.
 *
 * @param session session used by the tagger and for the session property lookup
 * @param subPlan plan subtree to analyze
 * @return a new {@code SubPlan} holding the (possibly rewritten) fragment and the analyzed children
 */
private SubPlan analyzeGroupedExecution(Session session, SubPlan subPlan) {
    PlanFragment analyzedFragment = subPlan.getFragment();
    GroupedExecutionProperties groupedProperties = analyzedFragment.getRoot()
            .accept(new GroupedExecutionTagger(session, metadata, nodePartitioningManager), null);

    if (groupedProperties.isSubTreeUseful()) {
        boolean onlyReplicatedRemoteSources = analyzedFragment.getRemoteSourceNodes().stream()
                .allMatch(remoteSource -> remoteSource.getExchangeType() == REPLICATE);
        // The session property is consulted only when the remote-source check passes.
        boolean preferDynamic = onlyReplicatedRemoteSources && isDynamicSchduleForGroupedExecution(session);
        BucketNodeMap bucketNodeMap = nodePartitioningManager.getBucketNodeMap(session, analyzedFragment.getPartitioning(), preferDynamic);
        analyzedFragment = bucketNodeMap.isDynamic()
                ? analyzedFragment.withDynamicLifespanScheduleGroupedExecution(groupedProperties.getCapableTableScanNodes())
                : analyzedFragment.withFixedLifespanScheduleGroupedExecution(groupedProperties.getCapableTableScanNodes());
    }

    // Children are analyzed in their original order.
    ImmutableList.Builder<SubPlan> analyzedChildren = ImmutableList.builder();
    for (SubPlan childPlan : subPlan.getChildren()) {
        analyzedChildren.add(analyzeGroupedExecution(session, childPlan));
    }
    return new SubPlan(analyzedFragment, analyzedChildren.build());
}
Also used : TableDeleteNode(io.prestosql.sql.planner.plan.TableDeleteNode) Lookup(io.prestosql.sql.planner.iterative.Lookup) PrestoWarning(io.prestosql.spi.PrestoWarning) SystemSessionProperties(io.prestosql.SystemSessionProperties) PlanFragmentId(io.prestosql.sql.planner.plan.PlanFragmentId) QueryManagerConfig(io.prestosql.execution.QueryManagerConfig) CTEScanNode(io.prestosql.spi.plan.CTEScanNode) AggregationNode(io.prestosql.spi.plan.AggregationNode) TableFinishNode(io.prestosql.sql.planner.plan.TableFinishNode) ExchangeNode(io.prestosql.sql.planner.plan.ExchangeNode) Preconditions.checkArgument(com.google.common.base.Preconditions.checkArgument) Predicates.in(com.google.common.base.Predicates.in) WarningCollector(io.prestosql.execution.warnings.WarningCollector) SystemSessionProperties.isForceSingleNodeOutput(io.prestosql.SystemSessionProperties.isForceSingleNodeOutput) TableUpdateNode(io.prestosql.sql.planner.plan.TableUpdateNode) Map(java.util.Map) OutputNode(io.prestosql.sql.planner.plan.OutputNode) PlanPrinter.jsonFragmentPlan(io.prestosql.sql.planner.planprinter.PlanPrinter.jsonFragmentPlan) Type(io.prestosql.spi.type.Type) TopNRankingNumberNode(io.prestosql.sql.planner.plan.TopNRankingNumberNode) PlanNodeId(io.prestosql.spi.plan.PlanNodeId) PrestoException(io.prestosql.spi.PrestoException) ImmutableSet(com.google.common.collect.ImmutableSet) ImmutableList.toImmutableList(com.google.common.collect.ImmutableList.toImmutableList) TableScanNode(io.prestosql.spi.plan.TableScanNode) Set(java.util.Set) PlanNode(io.prestosql.spi.plan.PlanNode) TOO_MANY_STAGES(io.prestosql.spi.connector.StandardWarningCode.TOO_MANY_STAGES) ProjectNode(io.prestosql.spi.plan.ProjectNode) CubeFinishNode(io.prestosql.sql.planner.plan.CubeFinishNode) BucketNodeMap(io.prestosql.execution.scheduler.BucketNodeMap) Metadata(io.prestosql.metadata.Metadata) RowNumberNode(io.prestosql.sql.planner.plan.RowNumberNode) String.format(java.lang.String.format) 
Preconditions.checkState(com.google.common.base.Preconditions.checkState) List(java.util.List) SOURCE_DISTRIBUTION(io.prestosql.sql.planner.SystemPartitioningHandle.SOURCE_DISTRIBUTION) ConnectorPartitionHandle(io.prestosql.spi.connector.ConnectorPartitionHandle) StatisticsWriterNode(io.prestosql.sql.planner.plan.StatisticsWriterNode) VacuumTableNode(io.prestosql.sql.planner.plan.VacuumTableNode) Optional(java.util.Optional) NOT_PARTITIONED(io.prestosql.spi.connector.NotPartitionedPartitionHandle.NOT_PARTITIONED) InternalPlanVisitor(io.prestosql.sql.planner.plan.InternalPlanVisitor) SINGLE_DISTRIBUTION(io.prestosql.sql.planner.SystemPartitioningHandle.SINGLE_DISTRIBUTION) StatsAndCosts(io.prestosql.cost.StatsAndCosts) RemoteSourceNode(io.prestosql.sql.planner.plan.RemoteSourceNode) HashMap(java.util.HashMap) TableHandle(io.prestosql.spi.metadata.TableHandle) ArrayList(java.util.ArrayList) Inject(javax.inject.Inject) HashSet(java.util.HashSet) REMOTE(io.prestosql.sql.planner.plan.ExchangeNode.Scope.REMOTE) SystemSessionProperties.isDynamicSchduleForGroupedExecution(io.prestosql.SystemSessionProperties.isDynamicSchduleForGroupedExecution) QUERY_HAS_TOO_MANY_STAGES(io.prestosql.spi.StandardErrorCode.QUERY_HAS_TOO_MANY_STAGES) ImmutableList(com.google.common.collect.ImmutableList) Objects.requireNonNull(java.util.Objects.requireNonNull) Session(io.prestosql.Session) SimplePlanRewriter(io.prestosql.sql.planner.plan.SimplePlanRewriter) ConnectorPartitioningHandle(io.prestosql.spi.connector.ConnectorPartitioningHandle) COORDINATOR_DISTRIBUTION(io.prestosql.sql.planner.SystemPartitioningHandle.COORDINATOR_DISTRIBUTION) REPLICATE(io.prestosql.sql.planner.plan.ExchangeNode.Type.REPLICATE) JoinNode(io.prestosql.spi.plan.JoinNode) Symbol(io.prestosql.spi.plan.Symbol) TableWriterNode(io.prestosql.sql.planner.plan.TableWriterNode) StageExecutionDescriptor.ungroupedExecution(io.prestosql.operator.StageExecutionDescriptor.ungroupedExecution) 
SchedulingOrderVisitor.scheduleOrder(io.prestosql.sql.planner.SchedulingOrderVisitor.scheduleOrder) TablePartitioning(io.prestosql.metadata.TableProperties.TablePartitioning) Iterables.getOnlyElement(com.google.common.collect.Iterables.getOnlyElement) Maps(com.google.common.collect.Maps) ValuesNode(io.prestosql.spi.plan.ValuesNode) SystemSessionProperties.getQueryMaxStageCount(io.prestosql.SystemSessionProperties.getQueryMaxStageCount) WindowNode(io.prestosql.spi.plan.WindowNode) ExplainAnalyzeNode(io.prestosql.sql.planner.plan.ExplainAnalyzeNode) BucketNodeMap(io.prestosql.execution.scheduler.BucketNodeMap) ImmutableList.toImmutableList(com.google.common.collect.ImmutableList.toImmutableList) ImmutableList(com.google.common.collect.ImmutableList)

Example 3 with REPLICATE

use of io.prestosql.sql.planner.plan.ExchangeNode.Type.REPLICATE in project hetu-core by openlookeng.

the class SqlQueryScheduler method createStages.

/**
 * Creates the {@link SqlStageExecution} for {@code plan}, registers a scheduler for it, and then
 * recurses into the sub stages of {@code plan}.
 *
 * <p>The scheduler registered in {@code stageSchedulers} depends on the fragment's partitioning handle:
 * <ul>
 *   <li>{@code SOURCE_DISTRIBUTION}: a source-partitioned scheduler with a dynamic split placement policy;</li>
 *   <li>{@code SCALED_WRITER_DISTRIBUTION}: a {@code ScaledWriterScheduler}, created after the child stages exist;</li>
 *   <li>otherwise, with local split sources: a {@code FixedSourcePartitionedScheduler};</li>
 *   <li>otherwise (all sources remote): a {@code FixedCountScheduler}.</li>
 * </ul>
 *
 * <p>Sub stages whose fragment id is already in {@code visitedPlanFrags} are skipped. Each created
 * child stage gets its parent id set to the id of the remote source node that references it, and is
 * cancelled when this stage reaches a done state other than {@code RESCHEDULING}.
 *
 * @param parent consumer of exchange locations recorded in this stage's {@code StageLinkage}
 * @param nextStageId counter used to allocate the id of this stage (incremented here)
 * @param plan stage execution plan for the fragment to create, including its sub stages
 * @param partitioningCache supplies the {@code NodePartitionMap} for a partitioning handle and an
 *        optional task count
 * @param stageSchedulers builder receiving the scheduler of every created stage
 * @param stageLinkages builder receiving the linkage of every created stage
 * @param isSnapshotEnabled when true, node selection is fixed up front (see inline comments)
 * @param stageTaskCounts per-stage task counts used when resuming; {@code null} when scheduling for
 *        the first time (per the "Resuming"/"Scheduling" branches below)
 * @return the created stages: this stage first, followed by the stages of each processed sub tree
 */
private List<SqlStageExecution> createStages(ExchangeLocationsConsumer parent, AtomicInteger nextStageId, LocationFactory locationFactory, StageExecutionPlan plan, NodeScheduler nodeScheduler, RemoteTaskFactory remoteTaskFactory, Session session, int splitBatchSize, BiFunction<PartitioningHandle, Integer, NodePartitionMap> partitioningCache, NodePartitioningManager nodePartitioningManager, ExecutorService queryExecutor, ScheduledExecutorService schedulerExecutor, FailureDetector failureDetector, NodeTaskMap nodeTaskMap, ImmutableMap.Builder<StageId, StageScheduler> stageSchedulers, ImmutableMap.Builder<StageId, StageLinkage> stageLinkages, boolean isSnapshotEnabled, QuerySnapshotManager snapshotManager, Map<StageId, Integer> stageTaskCounts) {
    ImmutableList.Builder<SqlStageExecution> localStages = ImmutableList.builder();
    StageId stageId = new StageId(queryStateMachine.getQueryId(), nextStageId.getAndIncrement());
    SqlStageExecution stageExecution = createSqlStageExecution(stageId, locationFactory.createStageLocation(stageId), plan.getFragment(), plan.getTables(), remoteTaskFactory, session, summarizeTaskInfo, nodeTaskMap, queryExecutor, failureDetector, schedulerStats, dynamicFilterService, snapshotManager);
    // this stage is always the first element of the returned list; callers rely on that (see subTree.get(0) below)
    localStages.add(stageExecution);
    Optional<int[]> bucketToPartition;
    PartitioningHandle partitioningHandle = plan.getFragment().getPartitioning();
    // true only when the fragment has no feeder CTE id of its own but does have a feeder CTE parent id
    boolean keepConsumerOnFeederNodes = !plan.getFragment().getFeederCTEId().isPresent() && plan.getFragment().getFeederCTEParentId().isPresent();
    if (partitioningHandle.equals(SOURCE_DISTRIBUTION)) {
        // nodes are selected dynamically based on the constraints of the splits and the system load
        // getOnlyElement throws unless there is exactly one split source
        Entry<PlanNodeId, SplitSource> entry = Iterables.getOnlyElement(plan.getSplitSources().entrySet());
        PlanNodeId planNodeId = entry.getKey();
        SplitSource splitSource = entry.getValue();
        CatalogName catalogName = splitSource.getCatalogName();
        if (isInternalSystemConnector(catalogName)) {
            // internal system connectors are passed to the node selector as a null catalog
            catalogName = null;
        }
        NodeSelector nodeSelector = nodeScheduler.createNodeSelector(catalogName, keepConsumerOnFeederNodes, feederScheduledNodes);
        if (isSnapshotEnabled) {
            // When snapshot is enabled, then no task can be added after the query started running,
            // otherwise assumptions about how many "input channels" may be broken.
            nodeSelector.lockDownNodes();
        }
        SplitPlacementPolicy placementPolicy = new DynamicSplitPlacementPolicy(nodeSelector, stageExecution::getAllTasks);
        // grouped execution is not expected for source-distributed stages
        checkArgument(!plan.getFragment().getStageExecutionDescriptor().isStageGroupedExecution());
        stageSchedulers.put(stageId, newSourcePartitionedSchedulerAsStageScheduler(stageExecution, planNodeId, splitSource, placementPolicy, splitBatchSize, session, heuristicIndexerManager));
        bucketToPartition = Optional.of(new int[1]);
    } else if (partitioningHandle.equals(SCALED_WRITER_DISTRIBUTION)) {
        // the scheduler for this case is registered at the end of the method, once child stages are known
        bucketToPartition = Optional.of(new int[1]);
    } else {
        Map<PlanNodeId, SplitSource> splitSources = plan.getSplitSources();
        if (!splitSources.isEmpty()) {
            // contains local source
            List<PlanNodeId> schedulingOrder = plan.getFragment().getPartitionedSources();
            CatalogName catalogName = partitioningHandle.getConnectorId().orElseThrow(IllegalStateException::new);
            List<ConnectorPartitionHandle> connectorPartitionHandles;
            boolean groupedExecutionForStage = plan.getFragment().getStageExecutionDescriptor().isStageGroupedExecution();
            if (groupedExecutionForStage) {
                connectorPartitionHandles = nodePartitioningManager.listPartitionHandles(session, partitioningHandle);
                // grouped execution requires real partition handles, not the NOT_PARTITIONED placeholder
                checkState(!ImmutableList.of(NOT_PARTITIONED).equals(connectorPartitionHandles));
            } else {
                connectorPartitionHandles = ImmutableList.of(NOT_PARTITIONED);
            }
            BucketNodeMap bucketNodeMap;
            List<InternalNode> stageNodeList;
            if (plan.getFragment().getRemoteSourceNodes().stream().allMatch(node -> node.getExchangeType() == REPLICATE)) {
                // no non-replicated remote source (allMatch is also true when there are no remote sources at all)
                boolean dynamicLifespanSchedule = plan.getFragment().getStageExecutionDescriptor().isDynamicLifespanSchedule();
                if (isSnapshotEnabled) {
                    NodeSelector nodeSelector = nodeScheduler.createNodeSelector(catalogName, keepConsumerOnFeederNodes, feederScheduledNodes);
                    int nodeCount;
                    if (stageTaskCounts != null) {
                        // Resuming: need to create same number of tasks as old stage.
                        // NOTE(review): unboxing throws NullPointerException if stageTaskCounts has no entry for stageId - confirm callers always populate it
                        nodeCount = stageTaskCounts.get(stageId);
                    } else {
                        // Scheduling: reserve some nodes for resuming
                        nodeCount = calculateTaskCount(nodeSelector.selectableNodeCount());
                    }
                    stageNodeList = new ArrayList<>(nodeSelector.selectRandomNodes(nodeCount));
                    checkCondition(stageNodeList.size() == nodeCount, NO_NODES_AVAILABLE, "Snapshot: not enough worker nodes to resume expected number of tasks: " + nodeCount);
                    // Make sure bucketNodeMap uses the same node list
                    bucketNodeMap = nodePartitioningManager.getBucketNodeMap(session, partitioningHandle, dynamicLifespanSchedule, stageNodeList);
                } else {
                    bucketNodeMap = nodePartitioningManager.getBucketNodeMap(session, partitioningHandle, dynamicLifespanSchedule);
                    stageNodeList = new ArrayList<>(nodeScheduler.createNodeSelector(catalogName, keepConsumerOnFeederNodes, feederScheduledNodes).allNodes());
                }
                // verify execution is consistent with planner's decision on dynamic lifespan schedule
                verify(bucketNodeMap.isDynamic() == dynamicLifespanSchedule);
                // stageNodeList is a private ArrayList copy in both branches above, so shuffling in place is safe
                Collections.shuffle(stageNodeList);
                bucketToPartition = Optional.empty();
            } else {
                // cannot use dynamic lifespan schedule
                verify(!plan.getFragment().getStageExecutionDescriptor().isDynamicLifespanSchedule());
                // remote source requires nodePartitionMap
                NodePartitionMap nodePartitionMap = partitioningCache.apply(plan.getFragment().getPartitioning(), stageTaskCounts == null ? null : stageTaskCounts.get(stageId));
                if (groupedExecutionForStage) {
                    checkState(connectorPartitionHandles.size() == nodePartitionMap.getBucketToPartition().length);
                }
                stageNodeList = nodePartitionMap.getPartitionToNode();
                bucketNodeMap = nodePartitionMap.asBucketNodeMap();
                bucketToPartition = Optional.of(nodePartitionMap.getBucketToPartition());
            }
            stageSchedulers.put(stageId, new FixedSourcePartitionedScheduler(stageExecution, splitSources, plan.getFragment().getStageExecutionDescriptor(), schedulingOrder, stageNodeList, bucketNodeMap, splitBatchSize, getConcurrentLifespansPerNode(session), nodeScheduler.createNodeSelector(catalogName, keepConsumerOnFeederNodes, feederScheduledNodes), connectorPartitionHandles, session, heuristicIndexerManager));
        } else {
            // all sources are remote
            NodePartitionMap nodePartitionMap = partitioningCache.apply(plan.getFragment().getPartitioning(), stageTaskCounts == null ? null : stageTaskCounts.get(stageId));
            List<InternalNode> partitionToNode = nodePartitionMap.getPartitionToNode();
            // todo this should asynchronously wait a standard timeout period before failing
            checkCondition(!partitionToNode.isEmpty(), NO_NODES_AVAILABLE, "No worker nodes available");
            stageSchedulers.put(stageId, new FixedCountScheduler(stageExecution, partitionToNode));
            bucketToPartition = Optional.of(nodePartitionMap.getBucketToPartition());
        }
    }
    ImmutableSet.Builder<SqlStageExecution> childStagesBuilder = ImmutableSet.builder();
    for (StageExecutionPlan subStagePlan : plan.getSubStages()) {
        // each plan fragment is turned into a stage at most once; later references to it are skipped
        if (visitedPlanFrags.contains(subStagePlan.getFragment().getId())) {
            continue;
        }
        visitedPlanFrags.add(subStagePlan.getFragment().getId());
        List<SqlStageExecution> subTree = createStages(stageExecution::addExchangeLocations, nextStageId, locationFactory, subStagePlan.withBucketToPartition(bucketToPartition), nodeScheduler, remoteTaskFactory, session, splitBatchSize, partitioningCache, nodePartitioningManager, queryExecutor, schedulerExecutor, failureDetector, nodeTaskMap, stageSchedulers, stageLinkages, isSnapshotEnabled, snapshotManager, stageTaskCounts);
        localStages.addAll(subTree);
        // the first element of the recursive result is the stage created for subStagePlan itself
        SqlStageExecution childStage = subTree.get(0);
        childStagesBuilder.add(childStage);
        // find the remote source node of this fragment that reads from the child stage's fragment
        Optional<RemoteSourceNode> parentNode = plan.getFragment().getRemoteSourceNodes().stream().filter(x -> x.getSourceFragmentIds().contains(childStage.getFragment().getId())).findAny();
        // NOTE(review): this check runs for every sub stage, although the message only mentions CTE nodes
        checkArgument(parentNode.isPresent(), "Couldn't find parent of a CTE node");
        childStage.setParentId(parentNode.get().getId());
    }
    Set<SqlStageExecution> childStages = childStagesBuilder.build();
    stageExecution.addStateChangeListener(newState -> {
        if (newState.isDone() && newState != StageState.RESCHEDULING) {
            // Snapshot: For "rescheduling", tasks are already cancelled (for resume)
            childStages.forEach(SqlStageExecution::cancel);
        }
    });
    stageLinkages.put(stageId, new StageLinkage(plan.getFragment().getId(), parent, childStages));
    if (partitioningHandle.equals(SCALED_WRITER_DISTRIBUTION)) {
        // the scaled writer scheduler observes task statuses of the child (source) stages and of this (writer) stage
        Supplier<Collection<TaskStatus>> sourceTasksProvider = () -> childStages.stream().map(SqlStageExecution::getAllTasks).flatMap(Collection::stream).map(RemoteTask::getTaskStatus).collect(toList());
        Supplier<Collection<TaskStatus>> writerTasksProvider = () -> stageExecution.getAllTasks().stream().map(RemoteTask::getTaskStatus).collect(toList());
        ScaledWriterScheduler scheduler = new ScaledWriterScheduler(stageExecution, sourceTasksProvider, writerTasksProvider, nodeScheduler.createNodeSelector(null, keepConsumerOnFeederNodes, feederScheduledNodes), schedulerExecutor, getWriterMinSize(session), isSnapshotEnabled, stageTaskCounts != null ? stageTaskCounts.get(stageId) : null);
        // finish the scheduler once every child stage is done
        whenAllStages(childStages, StageState::isDone).addListener(scheduler::finish, directExecutor());
        stageSchedulers.put(stageId, scheduler);
    }
    return localStages.build();
}
Also used : CANCELED(io.prestosql.execution.StageState.CANCELED) SCHEDULED(io.prestosql.execution.StageState.SCHEDULED) PlanFragmentId(io.prestosql.sql.planner.plan.PlanFragmentId) NO_NODES_AVAILABLE(io.prestosql.spi.StandardErrorCode.NO_NODES_AVAILABLE) FIXED_BROADCAST_DISTRIBUTION(io.prestosql.sql.planner.SystemPartitioningHandle.FIXED_BROADCAST_DISTRIBUTION) StageExecutionPlan(io.prestosql.sql.planner.StageExecutionPlan) Map(java.util.Map) SystemSessionProperties.getWriterMinSize(io.prestosql.SystemSessionProperties.getWriterMinSize) HeuristicIndexerManager(io.prestosql.heuristicindex.HeuristicIndexerManager) PlanNodeId(io.prestosql.spi.plan.PlanNodeId) TaskStatus(io.prestosql.execution.TaskStatus) ImmutableList.toImmutableList(com.google.common.collect.ImmutableList.toImmutableList) Set(java.util.Set) LocationFactory(io.prestosql.execution.LocationFactory) MILLISECONDS(java.util.concurrent.TimeUnit.MILLISECONDS) MoreExecutors.directExecutor(com.google.common.util.concurrent.MoreExecutors.directExecutor) ImmutableMap.toImmutableMap(com.google.common.collect.ImmutableMap.toImmutableMap) StageState(io.prestosql.execution.StageState) ConnectorPartitionHandle(io.prestosql.spi.connector.ConnectorPartitionHandle) GENERIC_INTERNAL_ERROR(io.prestosql.spi.StandardErrorCode.GENERIC_INTERNAL_ERROR) SetThreadName(io.airlift.concurrent.SetThreadName) Iterables(com.google.common.collect.Iterables) RESUMABLE_FAILURE(io.prestosql.execution.StageState.RESUMABLE_FAILURE) Supplier(java.util.function.Supplier) SCALED_WRITER_DISTRIBUTION(io.prestosql.sql.planner.SystemPartitioningHandle.SCALED_WRITER_DISTRIBUTION) QueryStateMachine(io.prestosql.execution.QueryStateMachine) ArrayList(java.util.ArrayList) CatalogName.isInternalSystemConnector(io.prestosql.spi.connector.CatalogName.isInternalSystemConnector) Session(io.prestosql.Session) ScheduledExecutorService(java.util.concurrent.ScheduledExecutorService) 
ImmutableSet.toImmutableSet(com.google.common.collect.ImmutableSet.toImmutableSet) NodeTaskMap(io.prestosql.execution.NodeTaskMap) SplitSource(io.prestosql.split.SplitSource) FINISHED(io.prestosql.execution.StageState.FINISHED) StageId(io.prestosql.execution.StageId) InternalNode(io.prestosql.metadata.InternalNode) Sets.newConcurrentHashSet(com.google.common.collect.Sets.newConcurrentHashSet) QuerySnapshotManager(io.prestosql.snapshot.QuerySnapshotManager) ResourceGroupInfo(io.prestosql.server.ResourceGroupInfo) PartitioningHandle(io.prestosql.sql.planner.PartitioningHandle) QueryState(io.prestosql.execution.QueryState) NodePartitionMap(io.prestosql.sql.planner.NodePartitionMap) SqlStageExecution.createSqlStageExecution(io.prestosql.execution.SqlStageExecution.createSqlStageExecution) SqlStageExecution(io.prestosql.execution.SqlStageExecution) ABORTED(io.prestosql.execution.StageState.ABORTED) FailureDetector(io.prestosql.failuredetector.FailureDetector) RemoteTask(io.prestosql.execution.RemoteTask) FAILED(io.prestosql.execution.StageState.FAILED) SystemSessionProperties(io.prestosql.SystemSessionProperties) BiFunction(java.util.function.BiFunction) SettableFuture(com.google.common.util.concurrent.SettableFuture) SourcePartitionedScheduler.newSourcePartitionedSchedulerAsStageScheduler(io.prestosql.execution.scheduler.SourcePartitionedScheduler.newSourcePartitionedSchedulerAsStageScheduler) Duration(io.airlift.units.Duration) Preconditions.checkArgument(com.google.common.base.Preconditions.checkArgument) AtomicInteger(java.util.concurrent.atomic.AtomicInteger) URI(java.net.URI) Collectors.toSet(java.util.stream.Collectors.toSet) PrestoException(io.prestosql.spi.PrestoException) ImmutableSet(com.google.common.collect.ImmutableSet) ImmutableMap(com.google.common.collect.ImmutableMap) Predicate(java.util.function.Predicate) Collection(java.util.Collection) CatalogName(io.prestosql.spi.connector.CatalogName) ConcurrentHashMap(java.util.concurrent.ConcurrentHashMap) 
RemoteTaskFactory(io.prestosql.execution.RemoteTaskFactory) UUID(java.util.UUID) String.format(java.lang.String.format) Preconditions.checkState(com.google.common.base.Preconditions.checkState) List(java.util.List) SOURCE_DISTRIBUTION(io.prestosql.sql.planner.SystemPartitioningHandle.SOURCE_DISTRIBUTION) StageInfo(io.prestosql.execution.StageInfo) Entry(java.util.Map.Entry) HttpUriBuilder.uriBuilderFrom(io.airlift.http.client.HttpUriBuilder.uriBuilderFrom) Function.identity(java.util.function.Function.identity) Optional(java.util.Optional) MoreFutures.whenAnyComplete(io.airlift.concurrent.MoreFutures.whenAnyComplete) BasicStageStats(io.prestosql.execution.BasicStageStats) NodePartitioningManager(io.prestosql.sql.planner.NodePartitioningManager) NOT_PARTITIONED(io.prestosql.spi.connector.NotPartitionedPartitionHandle.NOT_PARTITIONED) RUNNING(io.prestosql.execution.StageState.RUNNING) TaskId(io.prestosql.execution.TaskId) ListenableFuture(com.google.common.util.concurrent.ListenableFuture) Logger(io.airlift.log.Logger) AtomicBoolean(java.util.concurrent.atomic.AtomicBoolean) RemoteSourceNode(io.prestosql.sql.planner.plan.RemoteSourceNode) HashMap(java.util.HashMap) OutputBuffers(io.prestosql.execution.buffer.OutputBuffers) TaskLocation(io.prestosql.operator.TaskLocation) HashSet(java.util.HashSet) SnapshotConfig.calculateTaskCount(io.prestosql.snapshot.SnapshotConfig.calculateTaskCount) ImmutableList(com.google.common.collect.ImmutableList) SystemSessionProperties.isReuseTableScanEnabled(io.prestosql.SystemSessionProperties.isReuseTableScanEnabled) OutputBufferId(io.prestosql.execution.buffer.OutputBuffers.OutputBufferId) Verify.verify(com.google.common.base.Verify.verify) Failures.checkCondition(io.prestosql.util.Failures.checkCondition) Objects.requireNonNull(java.util.Objects.requireNonNull) TimeStat(io.airlift.stats.TimeStat) REPLICATE(io.prestosql.sql.planner.plan.ExchangeNode.Type.REPLICATE) ExecutorService(java.util.concurrent.ExecutorService) 
Ints(com.google.common.primitives.Ints) DynamicFilterService(io.prestosql.dynamicfilter.DynamicFilterService) MoreFutures.tryGetFutureValue(io.airlift.concurrent.MoreFutures.tryGetFutureValue) SystemSessionProperties.getConcurrentLifespansPerNode(io.prestosql.SystemSessionProperties.getConcurrentLifespansPerNode) Collectors.toList(java.util.stream.Collectors.toList) Collections(java.util.Collections) SECONDS(java.util.concurrent.TimeUnit.SECONDS) BasicStageStats.aggregateBasicStageStats(io.prestosql.execution.BasicStageStats.aggregateBasicStageStats) ImmutableList.toImmutableList(com.google.common.collect.ImmutableList.toImmutableList) ImmutableList(com.google.common.collect.ImmutableList) StageExecutionPlan(io.prestosql.sql.planner.StageExecutionPlan) StageId(io.prestosql.execution.StageId) ArrayList(java.util.ArrayList) SqlStageExecution.createSqlStageExecution(io.prestosql.execution.SqlStageExecution.createSqlStageExecution) SqlStageExecution(io.prestosql.execution.SqlStageExecution) PlanNodeId(io.prestosql.spi.plan.PlanNodeId) RemoteSourceNode(io.prestosql.sql.planner.plan.RemoteSourceNode) ImmutableSet.toImmutableSet(com.google.common.collect.ImmutableSet.toImmutableSet) ImmutableSet(com.google.common.collect.ImmutableSet) ImmutableList.toImmutableList(com.google.common.collect.ImmutableList.toImmutableList) ArrayList(java.util.ArrayList) List(java.util.List) ImmutableList(com.google.common.collect.ImmutableList) Collectors.toList(java.util.stream.Collectors.toList) NodePartitionMap(io.prestosql.sql.planner.NodePartitionMap) Collection(java.util.Collection) CatalogName(io.prestosql.spi.connector.CatalogName) PartitioningHandle(io.prestosql.sql.planner.PartitioningHandle) SplitSource(io.prestosql.split.SplitSource) Map(java.util.Map) ImmutableMap.toImmutableMap(com.google.common.collect.ImmutableMap.toImmutableMap) NodeTaskMap(io.prestosql.execution.NodeTaskMap) NodePartitionMap(io.prestosql.sql.planner.NodePartitionMap) 
ImmutableMap(com.google.common.collect.ImmutableMap) ConcurrentHashMap(java.util.concurrent.ConcurrentHashMap) HashMap(java.util.HashMap)

Aggregations

ImmutableList (com.google.common.collect.ImmutableList)3 ImmutableList.toImmutableList (com.google.common.collect.ImmutableList.toImmutableList)3 Session (io.prestosql.Session)3 Preconditions.checkArgument (com.google.common.base.Preconditions.checkArgument)2 Preconditions.checkState (com.google.common.base.Preconditions.checkState)2 ImmutableMap (com.google.common.collect.ImmutableMap)2 ImmutableSet (com.google.common.collect.ImmutableSet)2 SystemSessionProperties (io.prestosql.SystemSessionProperties)2 AggregationNode (io.prestosql.spi.plan.AggregationNode)2 JoinNode (io.prestosql.spi.plan.JoinNode)2 PlanNode (io.prestosql.spi.plan.PlanNode)2 ProjectNode (io.prestosql.spi.plan.ProjectNode)2 Symbol (io.prestosql.spi.plan.Symbol)2 TableScanNode (io.prestosql.spi.plan.TableScanNode)2 ValuesNode (io.prestosql.spi.plan.ValuesNode)2 ExchangeNode (io.prestosql.sql.planner.plan.ExchangeNode)2 REMOTE (io.prestosql.sql.planner.plan.ExchangeNode.Scope.REMOTE)2 REPLICATE (io.prestosql.sql.planner.plan.ExchangeNode.Type.REPLICATE)2 StatisticsWriterNode (io.prestosql.sql.planner.plan.StatisticsWriterNode)2 String.format (java.lang.String.format)2