Use of org.apache.hadoop.hive.ql.plan.SparkEdgeProperty in project hive by apache.
The class SplitSparkWorkResolver, method splitBaseWork.
// Split work into multiple branches, one for each childWork in childWorks.
// It also sets up the connection between each parent work and its child work.
private void splitBaseWork(SparkWork sparkWork, BaseWork parentWork, List<BaseWork> childWorks) {
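  // Nothing to split when the parent work feeds at most one ReduceSink.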
  if (getAllReduceSinks(parentWork).size() <= 1) {
    return;
  }
  // Grand-parent works - we need to set these to be the parents of the cloned works.
  List<BaseWork> grandParentWorks = sparkWork.getParents(parentWork);
  boolean isFirst = true;
  for (BaseWork childWork : childWorks) {
    BaseWork clonedParentWork = SerializationUtilities.cloneBaseWork(parentWork);
    // give the cloned work a different name
    clonedParentWork.setName(clonedParentWork.getName().replaceAll("^([a-zA-Z]+)(\\s+)(\\d+)",
        "$1$2" + GenSparkUtils.getUtils().getNextSeqNumber()));
    setStatistics(parentWork, clonedParentWork);
    String childReducerName = childWork.getName();
    SparkEdgeProperty clonedEdgeProperty = sparkWork.getEdgeProperty(parentWork, childWork);
    // In the cloned work, keep only the operator branch that ends in
    // the corresponding ReduceSinkOperator.
    for (Operator<?> op : clonedParentWork.getAllLeafOperators()) {
      if (op instanceof ReduceSinkOperator) {
        if (!((ReduceSinkOperator) op).getConf().getOutputName().equals(childReducerName)) {
          removeOpRecursive(op);
        }
      } else if (!isFirst) {
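        // Non-ReduceSink leaf operators (e.g. file sinks) are kept only in the
        // first clone, so they are not duplicated across the cloned works.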
        removeOpRecursive(op);
      }
    }
    isFirst = false;
    // Then, we need to set up the graph connections. Specifically:
    // 1. we need to connect this cloned parent work with all the grand-parent works.
    // 2. we need to connect this cloned parent work with the corresponding child work.
    sparkWork.add(clonedParentWork);
    for (BaseWork gpw : grandParentWorks) {
      sparkWork.connect(gpw, clonedParentWork, sparkWork.getEdgeProperty(gpw, parentWork));
    }
    sparkWork.connect(clonedParentWork, childWork, clonedEdgeProperty);
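    // Record which original work this clone was derived from.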
    sparkWork.getCloneToWork().put(clonedParentWork, parentWork);
  }
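  // Every child work is now connected to its own clone, so the original parent work can be removed.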
  sparkWork.remove(parentWork);
}
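For context, a SparkEdgeProperty describes how data moves along one edge of the SparkWork graph (the shuffle type and, for shuffle edges, the number of partitions). Below is a minimal, hypothetical sketch of creating such an edge between two works and reading it back. It is not part of SplitSparkWorkResolver: the SparkEdgeProperty.SHUFFLE_GROUP constant, the (shuffleType, numPartitions) constructor, and the String-name constructors of SparkWork, MapWork and ReduceWork are assumptions about the Hive API, and the names "query-1", "Map 1", "Reducer 2" and the partition count are illustrative only.

// Sketch only - assumed API; all classes are from org.apache.hadoop.hive.ql.plan.
SparkWork sparkWork = new SparkWork("query-1");        // illustrative name
MapWork mapWork = new MapWork("Map 1");                // illustrative name
ReduceWork reduceWork = new ReduceWork("Reducer 2");   // illustrative name
sparkWork.add(mapWork);
sparkWork.add(reduceWork);

// Assumed: a group-shuffle edge with 10 partitions (constant and constructor are assumptions).
SparkEdgeProperty edge = new SparkEdgeProperty(SparkEdgeProperty.SHUFFLE_GROUP, 10);
sparkWork.connect(mapWork, reduceWork, edge);

// splitBaseWork above reads edges back the same way before re-wiring clones.
SparkEdgeProperty copied = sparkWork.getEdgeProperty(mapWork, reduceWork);

Note that splitBaseWork reuses the original parent-to-child edge property unchanged when connecting each cloned parent to its child, so the shuffle semantics of the plan are preserved across the split.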