Use of org.apache.flink.optimizer.plan.PlanNode in the Apache Flink project.
Class Optimizer, method compile.
/**
 * Translates the given program to an OptimizedPlan. The optimized plan describes for each operator
 * which strategy to use (such as hash join versus sort-merge join), what data exchange method to use
 * (local pipe forward, shuffle, broadcast), what exchange mode to use (pipelined, batch),
 * where to cache intermediate results, etc.
 *
 * <p>The optimization happens in multiple phases:
 * <ol>
 * <li>Create the optimizer DAG representation of the program: <tt>OptimizerNode</tt> representations
 * of the operators, with assigned parallelism and computed size estimates.</li>
 * <li>Compute interesting properties and auxiliary structures.</li>
 * <li>Enumerate plan alternatives. This cannot be done in the same step as the interesting property
 * computation (as opposed to the Database approaches), because we support plans that are not trees.</li>
 * </ol>
 *
 * @param program The program to be translated.
 * @param postPasser The function to be used for post passing the optimizer's plan and setting the
 * data type specific serialization routines.
 * @return The optimized plan.
 *
 * @throws NullPointerException
 * Thrown, if the program or the post pass is null.
 * @throws CompilerException
 * Thrown, if the plan is invalid or the optimizer encountered an inconsistent
 * situation during the compilation process.
 */
private OptimizedPlan compile(Plan program, OptimizerPostPass postPasser) throws CompilerException {
    if (program == null) {
        throw new NullPointerException("The program to compile must not be null.");
    }
    if (postPasser == null) {
        throw new NullPointerException("The optimizer post pass must not be null.");
    }

    final ExecutionMode defaultDataExchangeMode = program.getExecutionConfig().getExecutionMode();
    final int defaultParallelism = program.getDefaultParallelism() > 0
            ? program.getDefaultParallelism()
            : this.defaultParallelism;

    // parameterized logging throughout: no string is built unless debug logging is enabled
    LOG.debug("Beginning compilation of program '{}'", program.getJobName());
    LOG.debug("Using a default parallelism of {}", defaultParallelism);
    LOG.debug("Using default data exchange mode {}", defaultDataExchangeMode);

    // the first step in the compilation is to create the optimizer plan representation
    // this step does the following:
    // 1) It creates an optimizer plan node for each operator
    // 2) It connects them via channels
    // 3) It looks for hints about local strategies and channel types and
    //    sets the types and strategies accordingly
    // 4) It makes estimates about the data volume of the data sources and
    //    propagates those estimates through the plan
    GraphCreatingVisitor graphCreator = new GraphCreatingVisitor(defaultParallelism, defaultDataExchangeMode);
    program.accept(graphCreator);

    // if we have a plan with multiple data sinks, add logical optimizer nodes that have two data-sinks
    // as children each until we have only a single root node. This allows to transparently deal with
    // the nodes with multiple outputs. A single sink simply becomes the root itself.
    final List<DataSinkNode> sinks = graphCreator.getSinks();
    if (sinks.isEmpty()) {
        throw new CompilerException("Bug: The optimizer plan representation has no sinks.");
    }
    final Iterator<DataSinkNode> sinkIterator = sinks.iterator();
    OptimizerNode rootNode = sinkIterator.next();
    while (sinkIterator.hasNext()) {
        rootNode = new SinkJoiner(rootNode, sinkIterator.next());
    }

    // now that all nodes are created, run the id-and-estimates pass over the DAG,
    // using the statistics this optimizer was configured with
    rootNode.accept(new IdAndEstimatesVisitor(this.statistics));

    // We are dealing with operator DAGs, rather than operator trees.
    // That requires us to deviate at some points from the classical DB optimizer algorithms.
    // This step builds auxiliary structures to help track branches and joins in the DAG
    BranchesVisitor branchingVisitor = new BranchesVisitor();
    rootNode.accept(branchingVisitor);

    // Propagate the interesting properties top-down through the graph
    InterestingPropertyVisitor propsVisitor = new InterestingPropertyVisitor(this.costEstimator);
    rootNode.accept(propsVisitor);

    // perform a sanity check: the root may not have any unclosed branches
    if (rootNode.getOpenBranches() != null && rootNode.getOpenBranches().size() > 0) {
        throw new CompilerException("Bug: Logic for branching plans (non-tree plans) has an error, and does not " + "track the re-joining of branches correctly.");
    }

    // the final step is now to generate the actual plan alternatives
    List<PlanNode> bestPlan = rootNode.getAlternativePlans(this.costEstimator);

    // exactly one plan must survive; report the empty case separately, so that the
    // message does not claim "more than one" when in fact nothing was produced
    if (bestPlan.isEmpty()) {
        throw new CompilerException("Error in compiler: no plan was created for the program!");
    }
    if (bestPlan.size() > 1) {
        throw new CompilerException("Error in compiler: more than one best plan was created!");
    }

    // check if the best plan's root is a data sink (single sink plan)
    // if so, directly take it. if it is a sink joiner node, get its contained sinks
    PlanNode bestPlanRoot = bestPlan.get(0);
    List<SinkPlanNode> bestPlanSinks = new ArrayList<SinkPlanNode>(4);
    if (bestPlanRoot instanceof SinkPlanNode) {
        bestPlanSinks.add((SinkPlanNode) bestPlanRoot);
    } else if (bestPlanRoot instanceof SinkJoinerPlanNode) {
        ((SinkJoinerPlanNode) bestPlanRoot).getDataSinks(bestPlanSinks);
    } else {
        // any other root type would silently yield a final plan without sinks
        throw new CompilerException("Bug: The root of the best plan is neither a data sink nor a sink joiner: " + bestPlanRoot);
    }

    // finalize the plan
    OptimizedPlan plan = new PlanFinalizer().createFinalPlan(bestPlanSinks, program.getJobName(), program);
    plan.accept(new BinaryUnionReplacer());
    plan.accept(new RangePartitionRewriter(plan));

    // post pass the plan. this is the phase where the serialization and comparator code is set
    postPasser.postPass(plan);
    return plan;
}
Use of org.apache.flink.optimizer.plan.PlanNode in the Apache Flink project.
Class BinaryUnionNode, method getAlternativePlans.
/**
 * Enumerates the candidate plans for this union node.
 *
 * <p>Every pair of candidate plans of the two predecessor nodes is combined with each requested
 * global property set in {@code channelProps}. For each combination, one input channel per side is
 * parameterized and passed to {@code instantiate(...)}, which collects the candidates. The collected
 * candidates are costed with the given estimator, pruned, stored in {@code cachedPlans} and returned.
 *
 * @param estimator The cost estimator that is handed to the predecessors and used to cost the candidates.
 * @return The pruned candidate plans. On repeated calls the cached list is returned.
 *
 * @throws CompilerException
 * Thrown, if the union has more than one outgoing connection, if it has broadcast connections,
 * if the two channels have unequal properties for a non-trivial request while neither uses the
 * FORWARD ship strategy, or if no candidate could be created at all.
 */
@Override
public List<PlanNode> getAlternativePlans(CostEstimator estimator) {
// check that union has only a single successor
// (this check runs on every call, also when a cached result exists)
if (this.getOutgoingConnections().size() > 1) {
throw new CompilerException("BinaryUnionNode has more than one successor.");
}
// remembers whether a child plan was skipped because of replicated input;
// only used to choose the error message if no candidate is produced
boolean childrenSkippedDueToReplicatedInput = false;
// check if we have a cached version
if (this.cachedPlans != null) {
return this.cachedPlans;
}
// step down to all producer nodes and calculate alternative plans
final List<? extends PlanNode> subPlans1 = getFirstPredecessorNode().getAlternativePlans(estimator);
final List<? extends PlanNode> subPlans2 = getSecondPredecessorNode().getAlternativePlans(estimator);
// a union with broadcast connections is rejected
List<DagConnection> broadcastConnections = getBroadcastConnections();
if (broadcastConnections != null && broadcastConnections.size() > 0) {
throw new CompilerException("Found BroadcastVariables on a Union operation");
}
final ArrayList<PlanNode> outputPlans = new ArrayList<PlanNode>();
// passed to instantiate(...) as the (empty) list of broadcast channels
final List<Set<? extends NamedChannel>> broadcastPlanChannels = Collections.emptyList();
final BinaryUnionOpDescriptor operator = new BinaryUnionOpDescriptor();
// the same empty local property request is used for both inputs
final RequestedLocalProperties noLocalProps = new RequestedLocalProperties();
final ExecutionMode input1Mode = this.input1.getDataExchangeMode();
final ExecutionMode input2Mode = this.input2.getDataExchangeMode();
final int parallelism = getParallelism();
final int inParallelism1 = getFirstPredecessorNode().getParallelism();
final int inParallelism2 = getSecondPredecessorNode().getParallelism();
// true if the parallelism differs between this node and the respective predecessor
final boolean dopChange1 = parallelism != inParallelism1;
final boolean dopChange2 = parallelism != inParallelism2;
final boolean input1breakPipeline = this.input1.isBreakingPipeline();
final boolean input2breakPipeline = this.input2.isBreakingPipeline();
// create all candidates: pairwise combinations of the children's plans,
// each combined with every requested global property set
for (PlanNode child1 : subPlans1) {
if (child1.getGlobalProperties().isFullyReplicated()) {
// fully replicated input is always locally forwarded if parallelism is not changed
if (dopChange1) {
// can not continue with this child
childrenSkippedDueToReplicatedInput = true;
continue;
} else {
// NOTE(review): this sets the strategy on the connection itself, not on a per-candidate
// channel; presumably the "fixed ship strategy" branch below is taken from here on
this.input1.setShipStrategy(ShipStrategyType.FORWARD);
}
}
for (PlanNode child2 : subPlans2) {
if (child2.getGlobalProperties().isFullyReplicated()) {
// fully replicated input is always locally forwarded if parallelism is not changed
if (dopChange2) {
// can not continue with this child
childrenSkippedDueToReplicatedInput = true;
continue;
} else {
// NOTE(review): set on the connection itself, see the note for the first input
this.input2.setShipStrategy(ShipStrategyType.FORWARD);
}
}
// check that the two child plans go together; pairs that are not
// branch compatible are skipped
if (!areBranchCompatible(child1, child2)) {
continue;
}
for (RequestedGlobalProperties igps : this.channelProps) {
// create a candidate channel for the first input. mark it cached, if the connection says so
Channel c1 = new Channel(child1, this.input1.getMaterializationMode());
if (this.input1.getShipStrategy() == null) {
// free to choose the ship strategy
igps.parameterizeChannel(c1, dopChange1, input1Mode, input1breakPipeline);
// if the parallelism changed and the chosen ship strategy is not a network strategy,
// reset the channel's global properties
if (dopChange1 && !c1.getShipStrategy().isNetworkStrategy()) {
c1.getGlobalProperties().reset();
}
} else {
// ship strategy fixed by compiler hint
ShipStrategyType shipStrategy = this.input1.getShipStrategy();
DataExchangeMode exMode = DataExchangeMode.select(input1Mode, shipStrategy, input1breakPipeline);
// pass the key fields along with the strategy, if keys are set for this input
if (this.keys1 != null) {
c1.setShipStrategy(this.input1.getShipStrategy(), this.keys1.toFieldList(), exMode);
} else {
c1.setShipStrategy(this.input1.getShipStrategy(), exMode);
}
if (dopChange1) {
c1.adjustGlobalPropertiesForFullParallelismChange();
}
}
// create a candidate channel for the second input. mark it cached, if the connection says so
Channel c2 = new Channel(child2, this.input2.getMaterializationMode());
if (this.input2.getShipStrategy() == null) {
// free to choose the ship strategy
igps.parameterizeChannel(c2, dopChange2, input2Mode, input2breakPipeline);
// if the parallelism changed and the chosen ship strategy is not a network strategy,
// reset the channel's global properties
if (dopChange2 && !c2.getShipStrategy().isNetworkStrategy()) {
c2.getGlobalProperties().reset();
}
} else {
// ship strategy fixed by compiler hint
ShipStrategyType shipStrategy = this.input2.getShipStrategy();
DataExchangeMode exMode = DataExchangeMode.select(input2Mode, shipStrategy, input2breakPipeline);
// pass the key fields along with the strategy, if keys are set for this input
if (this.keys2 != null) {
c2.setShipStrategy(this.input2.getShipStrategy(), this.keys2.toFieldList(), exMode);
} else {
c2.setShipStrategy(this.input2.getShipStrategy(), exMode);
}
if (dopChange2) {
c2.adjustGlobalPropertiesForFullParallelismChange();
}
}
// get the global properties and clear unique fields (not preserved anyways during the union)
GlobalProperties p1 = c1.getGlobalProperties();
GlobalProperties p2 = c2.getGlobalProperties();
p1.clearUniqueFieldCombinations();
p2.clearUniqueFieldCombinations();
// adjust the channels if the request is non-trivial and the two channels ended up with
// unequal global properties: one channel is cloned and re-parameterized from the
// properties of the other one
if (!igps.isTrivial() && !(p1.equals(p2))) {
if (c1.getShipStrategy() == ShipStrategyType.FORWARD && c2.getShipStrategy() != ShipStrategyType.FORWARD) {
// adjust c2 to c1
c2 = c2.clone();
p1.parameterizeChannel(c2, dopChange2, input2Mode, input2breakPipeline);
} else if (c2.getShipStrategy() == ShipStrategyType.FORWARD && c1.getShipStrategy() != ShipStrategyType.FORWARD) {
// adjust c1 to c2
c1 = c1.clone();
p2.parameterizeChannel(c1, dopChange1, input1Mode, input1breakPipeline);
} else if (c1.getShipStrategy() == ShipStrategyType.FORWARD && c2.getShipStrategy() == ShipStrategyType.FORWARD) {
// both forward: decide by the estimated output sizes which side is re-parameterized.
// NOTE(review): despite its name, a true 'adjustC1' leads to c2 being cloned and adjusted to p1
boolean adjustC1 = c1.getEstimatedOutputSize() <= 0 || c2.getEstimatedOutputSize() <= 0 || c1.getEstimatedOutputSize() <= c2.getEstimatedOutputSize();
if (adjustC1) {
c2 = c2.clone();
p1.parameterizeChannel(c2, dopChange2, input2Mode, input2breakPipeline);
} else {
c1 = c1.clone();
p2.parameterizeChannel(c1, dopChange1, input1Mode, input1breakPipeline);
}
} else {
// neither channel forwards while the properties differ; this is expected to be
// excluded by the check that the required strategies must match
throw new CompilerException("Bug in Plan Enumeration for Union Node.");
}
}
// hand the channel pair to instantiate(...), which collects the candidates in outputPlans
instantiate(operator, c1, c2, broadcastPlanChannels, outputPlans, estimator, igps, igps, noLocalProps, noLocalProps);
}
}
}
// no candidate at all is an error; the message depends on whether children were skipped
if (outputPlans.isEmpty()) {
if (childrenSkippedDueToReplicatedInput) {
throw new CompilerException("No plan meeting the requirements could be created @ " + this + ". Most likely reason: Invalid use of replicated input.");
} else {
throw new CompilerException("No plan meeting the requirements could be created @ " + this + ". Most likely reason: Too restrictive plan hints.");
}
}
// cost and prune the plans
for (PlanNode node : outputPlans) {
estimator.costOperator(node);
}
prunePlanAlternatives(outputPlans);
outputPlans.trimToSize();
// cache the result for subsequent calls
this.cachedPlans = outputPlans;
return outputPlans;
}
Use of org.apache.flink.optimizer.plan.PlanNode in the Apache Flink project.
Class DataSinkNode, method getAlternativePlans.
// --------------------------------------------------------------------------------------------
// Recursive Optimization
// --------------------------------------------------------------------------------------------
/**
 * Enumerates the candidate plans for this data sink.
 *
 * <p>For every candidate plan of the predecessor and every combination of requested global and
 * local properties from the input's interesting properties, one input channel is parameterized
 * and wrapped into a {@code SinkPlanNode}. The candidates are then costed, pruned and cached.
 *
 * @param estimator The cost estimator that is handed to the predecessor and used to cost the candidates.
 * @return The pruned candidate plans. On repeated calls the cached list is returned.
 */
@Override
public List<PlanNode> getAlternativePlans(CostEstimator estimator) {
    // reuse the result of an earlier enumeration, if there is one
    if (this.cachedPlans != null) {
        return this.cachedPlans;
    }

    // enumerate the candidates of the sink's input first
    final List<? extends PlanNode> predecessorPlans = getPredecessorNode().getAlternativePlans(estimator);
    final List<PlanNode> sinkCandidates = new ArrayList<PlanNode>();

    final int sinkParallelism = getParallelism();
    final int inputParallelism = getPredecessorNode().getParallelism();
    final ExecutionMode exchangeMode = this.input.getDataExchangeMode();
    final boolean parallelismChanges = sinkParallelism != inputParallelism;
    final boolean pipelineBreaker = this.input.isBreakingPipeline();
    final InterestingProperties interesting = this.input.getInterestingProperties();

    for (PlanNode inputPlan : predecessorPlans) {
        for (RequestedGlobalProperties requestedGlobal : interesting.getGlobalProperties()) {
            for (RequestedLocalProperties requestedLocal : interesting.getLocalProperties()) {
                final Channel inputChannel = new Channel(inputPlan);
                requestedGlobal.parameterizeChannel(inputChannel, parallelismChanges, exchangeMode, pipelineBreaker);
                requestedLocal.parameterizeChannel(inputChannel);
                inputChannel.setRequiredLocalProps(requestedLocal);
                inputChannel.setRequiredGlobalProps(requestedGlobal);

                // the created properties are not verified here: the only interesting
                // properties of a sink are the ones it requires itself
                final String sinkName = "DataSink (" + this.getOperator().getName() + ")";
                sinkCandidates.add(new SinkPlanNode(this, sinkName, inputChannel));
            }
        }
    }

    // attach costs to every candidate, then prune
    for (int i = 0; i < sinkCandidates.size(); i++) {
        estimator.costOperator(sinkCandidates.get(i));
    }
    prunePlanAlternatives(sinkCandidates);

    this.cachedPlans = sinkCandidates;
    return sinkCandidates;
}
Use of org.apache.flink.optimizer.plan.PlanNode in the Apache Flink project.
Class RangePartitionRewriter, method rewriteRangePartitionChannel.
/**
 * Rewrites the given channel by inserting a chain of additional plan nodes between the channel's
 * source node and its target node.
 *
 * <p>The inserted nodes are, in order: a per-partition sampler (SIP), a sampler with parallelism 1
 * (SIC), a range boundary builder with parallelism 1 (RB), a node that receives the boundaries as
 * broadcast input and emits (int, record) tuples (ARI), and a map that removes the int field again
 * (PR). All new plan nodes are added to {@code plan.getAllNodes()}. The given channel is modified in
 * place: its source becomes the PR node and its ship strategy becomes FORWARD / PIPELINED.
 *
 * @param channel The channel to rewrite; modified in place.
 * @return The two new channels that start at the original source node (to the SIP node and to the
 * ARI node). They are not registered as outgoing channels of the source node here, so
 * presumably the caller does that -- verify against the call site.
 */
private List<Channel> rewriteRangePartitionChannel(Channel channel) {
final List<Channel> sourceNewOutputChannels = new ArrayList<>();
final PlanNode sourceNode = channel.getSource();
final PlanNode targetNode = channel.getTarget();
final int sourceParallelism = sourceNode.getParallelism();
final int targetParallelism = targetNode.getParallelism();
// all inserted plan nodes get these zero costs
final Costs defaultZeroCosts = new Costs(0, 0, 0);
// comparator derived from the channel, using the execution config of the original plan
final TypeComparatorFactory<?> comparator = Utils.getShipComparator(channel, this.plan.getOriginalPlan().getExecutionConfig());
// 1. Fixed size sample in each partitions.
// the sample size scales with the parallelism of the target node
final int sampleSize = SAMPLES_PER_PARTITION * targetParallelism;
final SampleInPartition sampleInPartition = new SampleInPartition(false, sampleSize, SEED);
final TypeInformation<?> sourceOutputType = sourceNode.getOptimizerNode().getOperator().getOperatorInfo().getOutputType();
final TypeInformation<IntermediateSampleData> isdTypeInformation = TypeExtractor.getForClass(IntermediateSampleData.class);
final UnaryOperatorInformation sipOperatorInformation = new UnaryOperatorInformation(sourceOutputType, isdTypeInformation);
final MapPartitionOperatorBase sipOperatorBase = new MapPartitionOperatorBase(sampleInPartition, sipOperatorInformation, SIP_NAME);
final MapPartitionNode sipNode = new MapPartitionNode(sipOperatorBase);
// source -> SIP, forwarded; runs with the parallelism of the source
final Channel sipChannel = new Channel(sourceNode, TempMode.NONE);
sipChannel.setShipStrategy(ShipStrategyType.FORWARD, DataExchangeMode.PIPELINED);
final SingleInputPlanNode sipPlanNode = new SingleInputPlanNode(sipNode, SIP_NAME, sipChannel, DriverStrategy.MAP_PARTITION);
sipNode.setParallelism(sourceParallelism);
sipPlanNode.setParallelism(sourceParallelism);
sipPlanNode.initProperties(new GlobalProperties(), new LocalProperties());
sipPlanNode.setCosts(defaultZeroCosts);
sipChannel.setTarget(sipPlanNode);
this.plan.getAllNodes().add(sipPlanNode);
// first of the two new channels leaving the source node
sourceNewOutputChannels.add(sipChannel);
// 2. Fixed size sample in a single coordinator.
final SampleInCoordinator sampleInCoordinator = new SampleInCoordinator(false, sampleSize, SEED);
final UnaryOperatorInformation sicOperatorInformation = new UnaryOperatorInformation(isdTypeInformation, sourceOutputType);
final GroupReduceOperatorBase sicOperatorBase = new GroupReduceOperatorBase(sampleInCoordinator, sicOperatorInformation, SIC_NAME);
final GroupReduceNode sicNode = new GroupReduceNode(sicOperatorBase);
// SIP -> SIC; the SIC node runs with parallelism 1
final Channel sicChannel = new Channel(sipPlanNode, TempMode.NONE);
sicChannel.setShipStrategy(ShipStrategyType.FORWARD, DataExchangeMode.PIPELINED);
final SingleInputPlanNode sicPlanNode = new SingleInputPlanNode(sicNode, SIC_NAME, sicChannel, DriverStrategy.ALL_GROUP_REDUCE);
sicNode.setParallelism(1);
sicPlanNode.setParallelism(1);
sicPlanNode.initProperties(new GlobalProperties(), new LocalProperties());
sicPlanNode.setCosts(defaultZeroCosts);
sicChannel.setTarget(sicPlanNode);
sipPlanNode.addOutgoingChannel(sicChannel);
this.plan.getAllNodes().add(sicPlanNode);
// 3. Use sampled data to build range boundaries.
final RangeBoundaryBuilder rangeBoundaryBuilder = new RangeBoundaryBuilder(comparator, targetParallelism);
final TypeInformation<CommonRangeBoundaries> rbTypeInformation = TypeExtractor.getForClass(CommonRangeBoundaries.class);
final UnaryOperatorInformation rbOperatorInformation = new UnaryOperatorInformation(sourceOutputType, rbTypeInformation);
final MapPartitionOperatorBase rbOperatorBase = new MapPartitionOperatorBase(rangeBoundaryBuilder, rbOperatorInformation, RB_NAME);
final MapPartitionNode rbNode = new MapPartitionNode(rbOperatorBase);
// SIC -> RB; the RB node also runs with parallelism 1
final Channel rbChannel = new Channel(sicPlanNode, TempMode.NONE);
rbChannel.setShipStrategy(ShipStrategyType.FORWARD, DataExchangeMode.PIPELINED);
final SingleInputPlanNode rbPlanNode = new SingleInputPlanNode(rbNode, RB_NAME, rbChannel, DriverStrategy.MAP_PARTITION);
rbNode.setParallelism(1);
rbPlanNode.setParallelism(1);
rbPlanNode.initProperties(new GlobalProperties(), new LocalProperties());
rbPlanNode.setCosts(defaultZeroCosts);
rbChannel.setTarget(rbPlanNode);
sicPlanNode.addOutgoingChannel(rbChannel);
this.plan.getAllNodes().add(rbPlanNode);
// 4. Take range boundaries as broadcast input and take the tuple of partition id and record as output.
final AssignRangeIndex assignRangeIndex = new AssignRangeIndex(comparator);
// output type: tuple of (int, source output type)
final TypeInformation<Tuple2> ariOutputTypeInformation = new TupleTypeInfo<>(BasicTypeInfo.INT_TYPE_INFO, sourceOutputType);
final UnaryOperatorInformation ariOperatorInformation = new UnaryOperatorInformation(sourceOutputType, ariOutputTypeInformation);
final MapPartitionOperatorBase ariOperatorBase = new MapPartitionOperatorBase(assignRangeIndex, ariOperatorInformation, ARI_NAME);
final MapPartitionNode ariNode = new MapPartitionNode(ariOperatorBase);
// second channel leaving the source node: source -> ARI
final Channel ariChannel = new Channel(sourceNode, TempMode.NONE);
// To avoid deadlock, set the DataExchangeMode of channel between source node and this to Batch.
ariChannel.setShipStrategy(ShipStrategyType.FORWARD, DataExchangeMode.BATCH);
final SingleInputPlanNode ariPlanNode = new SingleInputPlanNode(ariNode, ARI_NAME, ariChannel, DriverStrategy.MAP_PARTITION);
ariNode.setParallelism(sourceParallelism);
ariPlanNode.setParallelism(sourceParallelism);
ariPlanNode.initProperties(new GlobalProperties(), new LocalProperties());
ariPlanNode.setCosts(defaultZeroCosts);
ariChannel.setTarget(ariPlanNode);
this.plan.getAllNodes().add(ariPlanNode);
sourceNewOutputChannels.add(ariChannel);
// RB -> ARI as a broadcast input named "RangeBoundaries"
final NamedChannel broadcastChannel = new NamedChannel("RangeBoundaries", rbPlanNode);
broadcastChannel.setShipStrategy(ShipStrategyType.BROADCAST, DataExchangeMode.PIPELINED);
broadcastChannel.setTarget(ariPlanNode);
List<NamedChannel> broadcastChannels = new ArrayList<>(1);
broadcastChannels.add(broadcastChannel);
ariPlanNode.setBroadcastInputs(broadcastChannels);
// 5. Remove the partition id.
// ARI -> PR, custom partitioned on field 0 (the int field of the tuple) using idPartitioner
// (idPartitioner is defined outside this method)
final Channel partChannel = new Channel(ariPlanNode, TempMode.NONE);
final FieldList keys = new FieldList(0);
partChannel.setShipStrategy(ShipStrategyType.PARTITION_CUSTOM, keys, idPartitioner, DataExchangeMode.PIPELINED);
ariPlanNode.addOutgoingChannel(partChannel);
final RemoveRangeIndex partitionIDRemoveWrapper = new RemoveRangeIndex();
final UnaryOperatorInformation prOperatorInformation = new UnaryOperatorInformation(ariOutputTypeInformation, sourceOutputType);
final MapOperatorBase prOperatorBase = new MapOperatorBase(partitionIDRemoveWrapper, prOperatorInformation, PR_NAME);
final MapNode prRemoverNode = new MapNode(prOperatorBase);
final SingleInputPlanNode prPlanNode = new SingleInputPlanNode(prRemoverNode, PR_NAME, partChannel, DriverStrategy.MAP);
partChannel.setTarget(prPlanNode);
// the PR node runs with the parallelism of the original target
prRemoverNode.setParallelism(targetParallelism);
prPlanNode.setParallelism(targetParallelism);
// the PR node's global properties are set to range partitioned on field 0, ascending
GlobalProperties globalProperties = new GlobalProperties();
globalProperties.setRangePartitioned(new Ordering(0, null, Order.ASCENDING));
prPlanNode.initProperties(globalProperties, new LocalProperties());
prPlanNode.setCosts(defaultZeroCosts);
this.plan.getAllNodes().add(prPlanNode);
// 6. Connect to target node.
// the original channel is reused: it now starts at the PR node and simply forwards
channel.setSource(prPlanNode);
channel.setShipStrategy(ShipStrategyType.FORWARD, DataExchangeMode.PIPELINED);
prPlanNode.addOutgoingChannel(channel);
return sourceNewOutputChannels;
}
Use of org.apache.flink.optimizer.plan.PlanNode in the Apache Flink project.
Class Utils, method getShipComparator.
/**
 * Creates the comparator factory for the ship strategy of the given channel.
 *
 * <p>The comparator is created for the output type of the program operator of the channel's
 * source node, using the channel's ship strategy keys and the sort orders derived from the
 * channel's ship strategy sort order.
 *
 * @param channel The channel whose source, ship strategy keys and sort order are used.
 * @param executionConfig The execution config that is passed on to the comparator creation.
 * @return The comparator factory created by {@code createComparator}.
 */
public static TypeComparatorFactory<?> getShipComparator(Channel channel, ExecutionConfig executionConfig) {
    // output type of the operator that produces the data shipped over this channel
    final TypeInformation<?> producedType = channel.getSource()
            .getProgramOperator()
            .getOperatorInfo()
            .getOutputType();

    return createComparator(
            producedType,
            channel.getShipStrategyKeys(),
            getSortOrders(channel.getShipStrategyKeys(), channel.getShipStrategySortOrder()),
            executionConfig);
}
Aggregations