Use of org.apache.hadoop.hive.ql.exec.JoinOperator in project hive by apache.
From the class GenSparkUtils, the method getEdgeProperty, which decides what kind of shuffle the Spark edge feeding a ReduceWork should use:
public static SparkEdgeProperty getEdgeProperty(HiveConf conf, ReduceSinkOperator reduceSink,
    ReduceWork reduceWork) throws SemanticException {
  boolean useSparkGroupBy = conf.getBoolVar(HiveConf.ConfVars.SPARK_USE_GROUPBY_SHUFFLE);
  SparkEdgeProperty edgeProperty = new SparkEdgeProperty(SparkEdgeProperty.SHUFFLE_NONE);
  edgeProperty.setNumPartitions(reduceWork.getNumReduceTasks());
  String sortOrder = Strings.nullToEmpty(reduceSink.getConf().getOrder()).trim();
  if (hasGBYOperator(reduceSink)) {
    edgeProperty.setShuffleGroup();
    // SHUFFLE_SORT shouldn't be used for this purpose, see HIVE-8542
    if (!useSparkGroupBy || (!sortOrder.isEmpty() && groupByNeedParLevelOrder(reduceSink))) {
      if (!useSparkGroupBy) {
        LOG.info("hive.spark.use.groupby.shuffle is off. Use repartition shuffle instead.");
      }
      edgeProperty.setMRShuffle();
    }
  }
  if (reduceWork.getReducer() instanceof JoinOperator) {
    // reduce-side join, use MR-style shuffle
    edgeProperty.setMRShuffle();
  }
  // If it's a FileSink to bucketed files, also use MR-style shuffle to
  // get a compatible taskId for the bucket name
  FileSinkOperator fso = getChildOperator(reduceWork.getReducer(), FileSinkOperator.class);
  if (fso != null) {
    String bucketCount = fso.getConf().getTableInfo().getProperties()
        .getProperty(hive_metastoreConstants.BUCKET_COUNT);
    if (bucketCount != null && Integer.parseInt(bucketCount) > 1) {
      edgeProperty.setMRShuffle();
    }
  }
  // test if we need partition/global order; SHUFFLE_SORT should only be used for global order
  if (edgeProperty.isShuffleNone() && !sortOrder.isEmpty()) {
    if ((reduceSink.getConf().getPartitionCols() == null
        || reduceSink.getConf().getPartitionCols().isEmpty()
        || isSame(reduceSink.getConf().getPartitionCols(), reduceSink.getConf().getKeyCols()))
        && reduceSink.getConf().hasOrderBy()) {
      edgeProperty.setShuffleSort();
    } else {
      edgeProperty.setMRShuffle();
    }
  }
  // simple distribute-by goes here
  if (edgeProperty.isShuffleNone()) {
    if (!useSparkGroupBy) {
      LOG.info("hive.spark.use.groupby.shuffle is off. Use repartition shuffle instead.");
      edgeProperty.setMRShuffle();
    } else {
      edgeProperty.setShuffleGroup();
    }
  }
  return edgeProperty;
}
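
The method is a precedence cascade: a group-by edge starts as a group shuffle but can be demoted to the MR-style shuffle; a reduce-side join or a sink to bucketed files always forces the MR-style shuffle; a remaining sort order picks the sort shuffle only for a global ORDER BY; and a plain distribute-by falls through to the group or repartition shuffle depending on hive.spark.use.groupby.shuffle. Below is a minimal, self-contained sketch of that cascade. Every name in it (ShuffleKind, EdgeSignals, ShuffleChooser) is hypothetical and not part of the Hive API; it also collapses the outcome into a single value for clarity, whereas the real SparkEdgeProperty carries more state.

// Hypothetical sketch of the decision cascade in getEdgeProperty above.
// ShuffleKind, EdgeSignals and ShuffleChooser do not exist in Hive.
enum ShuffleKind { NONE, GROUP, SORT, MR }

class EdgeSignals {
  boolean useSparkGroupBy;            // hive.spark.use.groupby.shuffle
  boolean hasGroupBy;                 // hasGBYOperator(reduceSink)
  boolean groupByNeedsParLevelOrder;  // groupByNeedParLevelOrder(reduceSink)
  boolean reducerIsJoin;              // reducer instanceof JoinOperator
  boolean sinksToBucketedFiles;       // FileSink with a bucket count > 1
  boolean hasSortOrder;               // non-empty sort order on the ReduceSink
  boolean isGlobalOrder;              // partition cols empty or equal to key cols, plus ORDER BY
}

class ShuffleChooser {
  static ShuffleKind choose(EdgeSignals s) {
    ShuffleKind kind = ShuffleKind.NONE;
    if (s.hasGroupBy) {
      kind = ShuffleKind.GROUP;
      // A partition-level sort for a group-by demotes it to the MR shuffle (HIVE-8542).
      if (!s.useSparkGroupBy || (s.hasSortOrder && s.groupByNeedsParLevelOrder)) {
        kind = ShuffleKind.MR;
      }
    }
    // Reduce-side joins and bucketed file sinks always need the MR-style shuffle.
    if (s.reducerIsJoin || s.sinksToBucketedFiles) {
      kind = ShuffleKind.MR;
    }
    // A leftover sort order: the sort shuffle only for a global order, else MR.
    if (kind == ShuffleKind.NONE && s.hasSortOrder) {
      kind = s.isGlobalOrder ? ShuffleKind.SORT : ShuffleKind.MR;
    }
    // Plain distribute-by: group shuffle unless the group-by shuffle is disabled.
    if (kind == ShuffleKind.NONE) {
      kind = s.useSparkGroupBy ? ShuffleKind.GROUP : ShuffleKind.MR;
    }
    return kind;
  }

  public static void main(String[] args) {
    EdgeSignals s = new EdgeSignals();
    s.useSparkGroupBy = true;
    s.hasSortOrder = true;
    s.isGlobalOrder = true;
    System.out.println(choose(s));  // SORT: a global ORDER BY uses the sort shuffle
  }
}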