Example 11 with SparkEdgeProperty

Use of org.apache.hadoop.hive.ql.plan.SparkEdgeProperty in the apache/hive project.

From the class SplitSparkWorkResolver, method splitBaseWork:

// Split the work into multiple branches, one for each childWork in childWorks.
// It also sets up the connections between each parent work and child work.
private void splitBaseWork(SparkWork sparkWork, BaseWork parentWork, List<BaseWork> childWorks) {
    if (getAllReduceSinks(parentWork).size() <= 1) {
        return;
    }
    // Grand-parent works - we need to set these to be the parents of the cloned works.
    List<BaseWork> grandParentWorks = sparkWork.getParents(parentWork);
    boolean isFirst = true;
    for (BaseWork childWork : childWorks) {
        BaseWork clonedParentWork = SerializationUtilities.cloneBaseWork(parentWork);
        // give the cloned work a different name
        clonedParentWork.setName(clonedParentWork.getName().replaceAll(
            "^([a-zA-Z]+)(\\s+)(\\d+)", "$1$2" + GenSparkUtils.getUtils().getNextSeqNumber()));
        setStatistics(parentWork, clonedParentWork);
        String childReducerName = childWork.getName();
        SparkEdgeProperty clonedEdgeProperty = sparkWork.getEdgeProperty(parentWork, childWork);
        // Remove those branches that end in a ReduceSinkOperator whose output
        // name is not childReducerName; for every clone after the first, also
        // remove all non-ReduceSink leaves, so that each clone keeps only
        // the corresponding ReduceSinkOperator.
        for (Operator<?> op : clonedParentWork.getAllLeafOperators()) {
            if (op instanceof ReduceSinkOperator) {
                if (!((ReduceSinkOperator) op).getConf().getOutputName().equals(childReducerName)) {
                    removeOpRecursive(op);
                }
            } else if (!isFirst) {
                removeOpRecursive(op);
            }
        }
        isFirst = false;
        // Then, we need to set up the graph connections. Specifically:
        // 1. connect this cloned parent work with all the grand-parent works;
        // 2. connect this cloned parent work with the corresponding child work.
        sparkWork.add(clonedParentWork);
        for (BaseWork gpw : grandParentWorks) {
            sparkWork.connect(gpw, clonedParentWork, sparkWork.getEdgeProperty(gpw, parentWork));
        }
        sparkWork.connect(clonedParentWork, childWork, clonedEdgeProperty);
        sparkWork.getCloneToWork().put(clonedParentWork, parentWork);
    }
    sparkWork.remove(parentWork);
}
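
The rename on the cloned work relies on the regex "^([a-zA-Z]+)(\\s+)(\\d+)": groups $1 and $2 preserve the work type and whitespace (e.g. "Reducer "), while the trailing digits are swapped for a fresh sequence number. A minimal standalone sketch of that rename, with a hypothetical AtomicInteger standing in for GenSparkUtils.getUtils().getNextSeqNumber():

import java.util.concurrent.atomic.AtomicInteger;

public class WorkRenameDemo {

    // Hypothetical stand-in for GenSparkUtils.getUtils().getNextSeqNumber().
    private static final AtomicInteger SEQ = new AtomicInteger(10);

    public static void main(String[] args) {
        String name = "Reducer 2";
        // Same pattern as in splitBaseWork: $1 = work type, $2 = whitespace,
        // and the trailing digits become the next sequence number.
        String cloned = name.replaceAll("^([a-zA-Z]+)(\\s+)(\\d+)", "$1$2" + SEQ.incrementAndGet());
        System.out.println(cloned); // prints "Reducer 11"
    }
}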
Also used: SparkEdgeProperty (org.apache.hadoop.hive.ql.plan.SparkEdgeProperty), ReduceSinkOperator (org.apache.hadoop.hive.ql.exec.ReduceSinkOperator), BaseWork (org.apache.hadoop.hive.ql.plan.BaseWork)
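
The example above also calls removeOpRecursive, a private helper of SplitSparkWorkResolver that is not shown here. Its general idea (detach a leaf operator from its parents, then keep pruning any ancestor that has become childless) can be sketched on a toy node type; all names below are hypothetical, not Hive API:

import java.util.ArrayList;
import java.util.List;

class ToyOp {

    final String name;
    final List<ToyOp> parents = new ArrayList<>();
    final List<ToyOp> children = new ArrayList<>();

    ToyOp(String name) {
        this.name = name;
    }

    static void connect(ToyOp parent, ToyOp child) {
        parent.children.add(child);
        child.parents.add(parent);
    }

    // Detach a leaf from all its parents; any parent left with no children
    // has itself become a leaf, so prune it recursively as well.
    static void removeLeafRecursive(ToyOp leaf) {
        for (ToyOp p : new ArrayList<>(leaf.parents)) {
            p.children.remove(leaf);
            leaf.parents.remove(p);
            if (p.children.isEmpty()) {
                removeLeafRecursive(p);
            }
        }
    }
}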

Aggregations

SparkEdgeProperty (org.apache.hadoop.hive.ql.plan.SparkEdgeProperty) 11
BaseWork (org.apache.hadoop.hive.ql.plan.BaseWork) 8
JoinOperator (org.apache.hadoop.hive.ql.exec.JoinOperator) 5
MapJoinOperator (org.apache.hadoop.hive.ql.exec.MapJoinOperator) 5
Operator (org.apache.hadoop.hive.ql.exec.Operator) 5
ReduceSinkOperator (org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) 5
SparkWork (org.apache.hadoop.hive.ql.plan.SparkWork) 5
ArrayList (java.util.ArrayList) 4
List (java.util.List) 4
HashTableDummyOperator (org.apache.hadoop.hive.ql.exec.HashTableDummyOperator) 4
SMBMapJoinOperator (org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator) 4
MapWork (org.apache.hadoop.hive.ql.plan.MapWork) 4
OperatorDesc (org.apache.hadoop.hive.ql.plan.OperatorDesc) 4
ReduceWork (org.apache.hadoop.hive.ql.plan.ReduceWork) 4
TableDesc (org.apache.hadoop.hive.ql.plan.TableDesc) 4
Path (org.apache.hadoop.fs.Path) 3
HiveConf (org.apache.hadoop.hive.conf.HiveConf) 3
RowSchema (org.apache.hadoop.hive.ql.exec.RowSchema) 3
SparkHashTableSinkOperator (org.apache.hadoop.hive.ql.exec.SparkHashTableSinkOperator) 3
TableScanOperator (org.apache.hadoop.hive.ql.exec.TableScanOperator) 3