use of org.apache.flink.optimizer.traversals.BinaryUnionReplacer in project flink by apache.
the class Optimizer method compile.
/**
* Translates the given program to an OptimizedPlan. The optimized plan describes for each operator
* which strategy to use (such as hash join versus sort-merge join), what data exchange method to use
* (local pipe forward, shuffle, broadcast), what exchange mode to use (pipelined, batch),
* where to cache intermediate results, etc,
*
* The optimization happens in multiple phases:
* <ol>
* <li>Create optimizer dag implementation of the program.
*
* <tt>OptimizerNode</tt> representations of the PACTs, assign parallelism and compute size estimates.</li>
* <li>Compute interesting properties and auxiliary structures.</li>
* <li>Enumerate plan alternatives. This cannot be done in the same step as the interesting property computation (as
* opposed to the Database approaches), because we support plans that are not trees.</li>
* </ol>
*
* @param program The program to be translated.
* @param postPasser The function to be used for post passing the optimizer's plan and setting the
* data type specific serialization routines.
* @return The optimized plan.
*
* @throws CompilerException
* Thrown, if the plan is invalid or the optimizer encountered an inconsistent
* situation during the compilation process.
*/
private OptimizedPlan compile(Plan program, OptimizerPostPass postPasser) throws CompilerException {
if (program == null || postPasser == null) {
throw new NullPointerException();
}
if (LOG.isDebugEnabled()) {
LOG.debug("Beginning compilation of program '" + program.getJobName() + '\'');
}
final ExecutionMode defaultDataExchangeMode = program.getExecutionConfig().getExecutionMode();
final int defaultParallelism = program.getDefaultParallelism() > 0 ? program.getDefaultParallelism() : this.defaultParallelism;
// log the default settings
LOG.debug("Using a default parallelism of {}", defaultParallelism);
LOG.debug("Using default data exchange mode {}", defaultDataExchangeMode);
// the first step in the compilation is to create the optimizer plan representation
// this step does the following:
// 1) It creates an optimizer plan node for each operator
// 2) It connects them via channels
// 3) It looks for hints about local strategies and channel types and
// sets the types and strategies accordingly
// 4) It makes estimates about the data volume of the data sources and
// propagates those estimates through the plan
GraphCreatingVisitor graphCreator = new GraphCreatingVisitor(defaultParallelism, defaultDataExchangeMode);
program.accept(graphCreator);
// if we have a plan with multiple data sinks, add logical optimizer nodes that have two data-sinks as children
// each until we have only a single root node. This allows to transparently deal with the nodes with
// multiple outputs
OptimizerNode rootNode;
if (graphCreator.getSinks().size() == 1) {
rootNode = graphCreator.getSinks().get(0);
} else if (graphCreator.getSinks().size() > 1) {
Iterator<DataSinkNode> iter = graphCreator.getSinks().iterator();
rootNode = iter.next();
while (iter.hasNext()) {
rootNode = new SinkJoiner(rootNode, iter.next());
}
} else {
throw new CompilerException("Bug: The optimizer plan representation has no sinks.");
}
// now that we have all nodes created and recorded which ones consume memory, tell the nodes their minimal
// guaranteed memory, for further cost estimations. We assume an equal distribution of memory among consumer tasks
rootNode.accept(new IdAndEstimatesVisitor(this.statistics));
// We are dealing with operator DAGs, rather than operator trees.
// That requires us to deviate at some points from the classical DB optimizer algorithms.
// This step builds auxiliary structures to help track branches and joins in the DAG
BranchesVisitor branchingVisitor = new BranchesVisitor();
rootNode.accept(branchingVisitor);
// Propagate the interesting properties top-down through the graph
InterestingPropertyVisitor propsVisitor = new InterestingPropertyVisitor(this.costEstimator);
rootNode.accept(propsVisitor);
// perform a sanity check: the root may not have any unclosed branches
if (rootNode.getOpenBranches() != null && rootNode.getOpenBranches().size() > 0) {
throw new CompilerException("Bug: Logic for branching plans (non-tree plans) has an error, and does not " + "track the re-joining of branches correctly.");
}
// the final step is now to generate the actual plan alternatives
List<PlanNode> bestPlan = rootNode.getAlternativePlans(this.costEstimator);
if (bestPlan.size() != 1) {
throw new CompilerException("Error in compiler: more than one best plan was created!");
}
// check if the best plan's root is a data sink (single sink plan)
// if so, directly take it. if it is a sink joiner node, get its contained sinks
PlanNode bestPlanRoot = bestPlan.get(0);
List<SinkPlanNode> bestPlanSinks = new ArrayList<SinkPlanNode>(4);
if (bestPlanRoot instanceof SinkPlanNode) {
bestPlanSinks.add((SinkPlanNode) bestPlanRoot);
} else if (bestPlanRoot instanceof SinkJoinerPlanNode) {
((SinkJoinerPlanNode) bestPlanRoot).getDataSinks(bestPlanSinks);
}
// finalize the plan
OptimizedPlan plan = new PlanFinalizer().createFinalPlan(bestPlanSinks, program.getJobName(), program);
plan.accept(new BinaryUnionReplacer());
plan.accept(new RangePartitionRewriter(plan));
// post pass the plan. this is the phase where the serialization and comparator code is set
postPasser.postPass(plan);
return plan;
}
Aggregations