Search in sources :

Example 36 with JoinOperator

Uses of org.apache.hadoop.hive.ql.exec.JoinOperator in the Apache Hive project.

From the class GenSparkUtils, the method getEdgeProperty:

/**
 * Computes the Spark shuffle edge type for the edge feeding the given {@link ReduceWork}.
 *
 * <p>Starting from {@code SHUFFLE_NONE}, the edge is progressively upgraded based on what the
 * reducer side contains: group-by shuffle for aggregation, MR-style (repartition) shuffle for
 * reduce-side joins and bucketed file sinks, and sort shuffle only when a global order is
 * required. The number of partitions is taken from the reduce work's reducer count.
 *
 * @param conf       Hive configuration; consulted for {@code hive.spark.use.groupby.shuffle}
 * @param reduceSink the ReduceSinkOperator producing this edge
 * @param reduceWork the downstream reduce work consuming this edge
 * @return the fully configured {@link SparkEdgeProperty} for this edge
 * @throws SemanticException propagated from plan-inspection helpers
 */
public static SparkEdgeProperty getEdgeProperty(HiveConf conf, ReduceSinkOperator reduceSink, ReduceWork reduceWork) throws SemanticException {
    final boolean groupByShuffleEnabled = conf.getBoolVar(HiveConf.ConfVars.SPARK_USE_GROUPBY_SHUFFLE);
    final SparkEdgeProperty edge = new SparkEdgeProperty(SparkEdgeProperty.SHUFFLE_NONE);
    edge.setNumPartitions(reduceWork.getNumReduceTasks());
    final String order = Strings.nullToEmpty(reduceSink.getConf().getOrder()).trim();

    // Aggregation downstream: default to group-by shuffle, but fall back to MR-style
    // repartition shuffle when group-by shuffle is disabled or a partition-level order
    // is needed. SHUFFLE_SORT shouldn't be used for this purpose, see HIVE-8542.
    if (hasGBYOperator(reduceSink)) {
        edge.setShuffleGroup();
        if (!groupByShuffleEnabled) {
            LOG.info("hive.spark.use.groupby.shuffle is off. Use repartition shuffle instead.");
            edge.setMRShuffle();
        } else if (!order.isEmpty() && groupByNeedParLevelOrder(reduceSink)) {
            edge.setMRShuffle();
        }
    }

    // Reduce-side join: requires MR-style shuffle.
    if (reduceWork.getReducer() instanceof JoinOperator) {
        edge.setMRShuffle();
    }

    // A FileSink writing bucketed files also needs MR-style shuffle so that task IDs
    // stay compatible with bucket file naming.
    final FileSinkOperator fileSink = getChildOperator(reduceWork.getReducer(), FileSinkOperator.class);
    if (fileSink != null) {
        final String bucketCount = fileSink.getConf().getTableInfo().getProperties().getProperty(hive_metastoreConstants.BUCKET_COUNT);
        if (bucketCount != null && Integer.parseInt(bucketCount) > 1) {
            edge.setMRShuffle();
        }
    }

    // Ordering requested and no shuffle chosen yet: SHUFFLE_SORT is reserved for global
    // order (partition columns absent or identical to the key columns, with an ORDER BY);
    // partition-level order uses MR-style shuffle instead.
    if (edge.isShuffleNone() && !order.isEmpty()) {
        final boolean partitionedByKeys =
            reduceSink.getConf().getPartitionCols() == null
                || reduceSink.getConf().getPartitionCols().isEmpty()
                || isSame(reduceSink.getConf().getPartitionCols(), reduceSink.getConf().getKeyCols());
        if (partitionedByKeys && reduceSink.getConf().hasOrderBy()) {
            edge.setShuffleSort();
        } else {
            edge.setMRShuffle();
        }
    }

    // Plain distribute-by (nothing else applied): group shuffle, unless disabled.
    if (edge.isShuffleNone()) {
        if (groupByShuffleEnabled) {
            edge.setShuffleGroup();
        } else {
            LOG.info("hive.spark.use.groupby.shuffle is off. Use repartition shuffle instead.");
            edge.setMRShuffle();
        }
    }
    return edge;
}
Also used : SMBMapJoinOperator(org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator) JoinOperator(org.apache.hadoop.hive.ql.exec.JoinOperator) FileSinkOperator(org.apache.hadoop.hive.ql.exec.FileSinkOperator) SparkEdgeProperty(org.apache.hadoop.hive.ql.plan.SparkEdgeProperty)

Aggregations

JoinOperator (org.apache.hadoop.hive.ql.exec.JoinOperator)36 Operator (org.apache.hadoop.hive.ql.exec.Operator)20 MapJoinOperator (org.apache.hadoop.hive.ql.exec.MapJoinOperator)18 ArrayList (java.util.ArrayList)16 ReduceSinkOperator (org.apache.hadoop.hive.ql.exec.ReduceSinkOperator)16 TableScanOperator (org.apache.hadoop.hive.ql.exec.TableScanOperator)16 FileSinkOperator (org.apache.hadoop.hive.ql.exec.FileSinkOperator)14 SMBMapJoinOperator (org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator)14 SelectOperator (org.apache.hadoop.hive.ql.exec.SelectOperator)11 OperatorDesc (org.apache.hadoop.hive.ql.plan.OperatorDesc)11 HashMap (java.util.HashMap)10 AbstractMapJoinOperator (org.apache.hadoop.hive.ql.exec.AbstractMapJoinOperator)10 ExprNodeDesc (org.apache.hadoop.hive.ql.plan.ExprNodeDesc)10 List (java.util.List)9 JoinDesc (org.apache.hadoop.hive.ql.plan.JoinDesc)9 Path (org.apache.hadoop.fs.Path)8 UnionOperator (org.apache.hadoop.hive.ql.exec.UnionOperator)8 HashSet (java.util.HashSet)7 LinkedHashMap (java.util.LinkedHashMap)7 FilterOperator (org.apache.hadoop.hive.ql.exec.FilterOperator)7