use of org.apache.hadoop.hive.ql.exec.ReduceSinkOperator in project hive by apache.
the class SharedWorkOptimizer method transform.
@Override
public ParseContext transform(ParseContext pctx) throws SemanticException {
final Map<String, TableScanOperator> topOps = pctx.getTopOps();
if (topOps.size() < 2) {
// Nothing to do, bail out
return pctx;
}
if (LOG.isDebugEnabled()) {
LOG.debug("Before SharedWorkOptimizer:\n" + Operator.toString(pctx.getTopOps().values()));
}
// Cache to use during optimization
SharedWorkOptimizerCache optimizerCache = new SharedWorkOptimizerCache();
// Gather information about the DPP table scans and store it in the cache
gatherDPPTableScanOps(pctx, optimizerCache);
// Map of dbName.TblName -> TSOperator
Multimap<String, TableScanOperator> tableNameToOps = splitTableScanOpsByTable(pctx);
// We enforce a certain order when we do the reutilization.
// In particular, we use size of table x number of reads to
// rank the tables.
List<Entry<String, Long>> sortedTables = rankTablesByAccumulatedSize(pctx);
LOG.debug("Sorted tables by size: {}", sortedTables);
// Execute optimization
Multimap<String, TableScanOperator> existingOps = ArrayListMultimap.create();
Set<Operator<?>> removedOps = new HashSet<>();
for (Entry<String, Long> tablePair : sortedTables) {
String tableName = tablePair.getKey();
for (TableScanOperator discardableTsOp : tableNameToOps.get(tableName)) {
if (removedOps.contains(discardableTsOp)) {
LOG.debug("Skip {} as it has already been removed", discardableTsOp);
continue;
}
Collection<TableScanOperator> prevTsOps = existingOps.get(tableName);
for (TableScanOperator retainableTsOp : prevTsOps) {
if (removedOps.contains(retainableTsOp)) {
LOG.debug("Skip {} as it has already been removed", retainableTsOp);
continue;
}
// First we quickly check if the two table scan operators can actually be merged
boolean mergeable = areMergeable(pctx, optimizerCache, retainableTsOp, discardableTsOp);
if (!mergeable) {
// Skip
LOG.debug("{} and {} cannot be merged", retainableTsOp, discardableTsOp);
continue;
}
// Secondly, we extract information about the part of the tree that can be merged
// as well as some structural information (memory consumption) that needs to be
// used to determined whether the merge can happen
SharedResult sr = extractSharedOptimizationInfoForRoot(pctx, optimizerCache, retainableTsOp, discardableTsOp);
// tables.
if (!validPreConditions(pctx, optimizerCache, sr)) {
// Skip
LOG.debug("{} and {} do not meet preconditions", retainableTsOp, discardableTsOp);
continue;
}
// We can merge
if (sr.retainableOps.size() > 1) {
// More than TS operator
Operator<?> lastRetainableOp = sr.retainableOps.get(sr.retainableOps.size() - 1);
Operator<?> lastDiscardableOp = sr.discardableOps.get(sr.discardableOps.size() - 1);
if (lastDiscardableOp.getNumChild() != 0) {
List<Operator<? extends OperatorDesc>> allChildren = Lists.newArrayList(lastDiscardableOp.getChildOperators());
for (Operator<? extends OperatorDesc> op : allChildren) {
lastDiscardableOp.getChildOperators().remove(op);
op.replaceParent(lastDiscardableOp, lastRetainableOp);
lastRetainableOp.getChildOperators().add(op);
}
}
LOG.debug("Merging subtree starting at {} into subtree starting at {}", discardableTsOp, retainableTsOp);
} else {
// Only TS operator
ExprNodeGenericFuncDesc exprNode = null;
if (retainableTsOp.getConf().getFilterExpr() != null) {
// Push filter on top of children
pushFilterToTopOfTableScan(optimizerCache, retainableTsOp);
// Clone to push to table scan
exprNode = (ExprNodeGenericFuncDesc) retainableTsOp.getConf().getFilterExpr();
}
if (discardableTsOp.getConf().getFilterExpr() != null) {
// Push filter on top
pushFilterToTopOfTableScan(optimizerCache, discardableTsOp);
ExprNodeGenericFuncDesc tsExprNode = discardableTsOp.getConf().getFilterExpr();
if (exprNode != null && !exprNode.isSame(tsExprNode)) {
// We merge filters from previous scan by ORing with filters from current scan
if (exprNode.getGenericUDF() instanceof GenericUDFOPOr) {
List<ExprNodeDesc> newChildren = new ArrayList<>(exprNode.getChildren().size() + 1);
for (ExprNodeDesc childExprNode : exprNode.getChildren()) {
if (childExprNode.isSame(tsExprNode)) {
// We do not need to do anything, it is in the OR expression
break;
}
newChildren.add(childExprNode);
}
if (exprNode.getChildren().size() == newChildren.size()) {
newChildren.add(tsExprNode);
exprNode = ExprNodeGenericFuncDesc.newInstance(new GenericUDFOPOr(), newChildren);
}
} else {
exprNode = ExprNodeGenericFuncDesc.newInstance(new GenericUDFOPOr(), Arrays.<ExprNodeDesc>asList(exprNode, tsExprNode));
}
}
}
// Replace filter
retainableTsOp.getConf().setFilterExpr(exprNode);
// Replace table scan operator
List<Operator<? extends OperatorDesc>> allChildren = Lists.newArrayList(discardableTsOp.getChildOperators());
for (Operator<? extends OperatorDesc> op : allChildren) {
discardableTsOp.getChildOperators().remove(op);
op.replaceParent(discardableTsOp, retainableTsOp);
retainableTsOp.getChildOperators().add(op);
}
LOG.debug("Merging {} into {}", discardableTsOp, retainableTsOp);
}
// we are going to eliminate
for (Operator<?> op : sr.discardableInputOps) {
OperatorUtils.removeOperator(op);
optimizerCache.removeOp(op);
removedOps.add(op);
// Remove DPP predicates
if (op instanceof ReduceSinkOperator) {
SemiJoinBranchInfo sjbi = pctx.getRsToSemiJoinBranchInfo().get(op);
if (sjbi != null && !sr.discardableOps.contains(sjbi.getTsOp()) && !sr.discardableInputOps.contains(sjbi.getTsOp())) {
GenTezUtils.removeSemiJoinOperator(pctx, (ReduceSinkOperator) op, sjbi.getTsOp());
optimizerCache.tableScanToDPPSource.remove(sjbi.getTsOp(), op);
}
} else if (op instanceof AppMasterEventOperator) {
DynamicPruningEventDesc dped = (DynamicPruningEventDesc) op.getConf();
if (!sr.discardableOps.contains(dped.getTableScan()) && !sr.discardableInputOps.contains(dped.getTableScan())) {
GenTezUtils.removeSemiJoinOperator(pctx, (AppMasterEventOperator) op, dped.getTableScan());
optimizerCache.tableScanToDPPSource.remove(dped.getTableScan(), op);
}
}
LOG.debug("Input operator removed: {}", op);
}
// Then we merge the operators of the works we are going to merge
optimizerCache.removeOpAndCombineWork(discardableTsOp, retainableTsOp);
removedOps.add(discardableTsOp);
// Finally we remove the expression from the tree
for (Operator<?> op : sr.discardableOps) {
OperatorUtils.removeOperator(op);
optimizerCache.removeOp(op);
removedOps.add(op);
if (sr.discardableOps.size() == 1) {
// If there is a single discardable operator, it is a TableScanOperator
// and it means that we have merged filter expressions for it. Thus, we
// might need to remove DPP predicates from the retainable TableScanOperator
Collection<Operator<?>> c = optimizerCache.tableScanToDPPSource.get((TableScanOperator) op);
for (Operator<?> dppSource : c) {
if (dppSource instanceof ReduceSinkOperator) {
GenTezUtils.removeSemiJoinOperator(pctx, (ReduceSinkOperator) dppSource, (TableScanOperator) sr.retainableOps.get(0));
optimizerCache.tableScanToDPPSource.remove(sr.retainableOps.get(0), op);
} else if (dppSource instanceof AppMasterEventOperator) {
GenTezUtils.removeSemiJoinOperator(pctx, (AppMasterEventOperator) dppSource, (TableScanOperator) sr.retainableOps.get(0));
optimizerCache.tableScanToDPPSource.remove(sr.retainableOps.get(0), op);
}
}
}
LOG.debug("Operator removed: {}", op);
}
break;
}
if (removedOps.contains(discardableTsOp)) {
// This operator has been removed, remove it from the list of existing operators
existingOps.remove(tableName, discardableTsOp);
} else {
// This operator has not been removed, include it in the list of existing operators
existingOps.put(tableName, discardableTsOp);
}
}
}
// Remove unused table scan operators
Iterator<Entry<String, TableScanOperator>> it = topOps.entrySet().iterator();
while (it.hasNext()) {
Entry<String, TableScanOperator> e = it.next();
if (e.getValue().getNumChild() == 0) {
it.remove();
}
}
if (LOG.isDebugEnabled()) {
LOG.debug("After SharedWorkOptimizer:\n" + Operator.toString(pctx.getTopOps().values()));
}
if (pctx.getConf().getBoolVar(ConfVars.HIVE_SHARED_WORK_EXTENDED_OPTIMIZATION)) {
// Gather RS operators that 1) belong to root works, i.e., works containing TS operators,
// and 2) share the same input operator.
// These will be the first target for extended shared work optimization
Multimap<Operator<?>, ReduceSinkOperator> parentToRsOps = ArrayListMultimap.create();
Set<Operator<?>> visited = new HashSet<>();
for (Entry<String, TableScanOperator> e : topOps.entrySet()) {
gatherReduceSinkOpsByInput(parentToRsOps, visited, findWorkOperators(optimizerCache, e.getValue()));
}
while (!parentToRsOps.isEmpty()) {
// As above, we enforce a certain order when we do the reutilization.
// In particular, we use size of data in RS x number of uses.
List<Entry<Operator<?>, Long>> sortedRSGroups = rankOpsByAccumulatedSize(parentToRsOps.keySet());
LOG.debug("Sorted operators by size: {}", sortedRSGroups);
// Execute extended optimization
// For each RS, check whether other RS in same work could be merge into this one.
// If they are merged, RS operators in the resulting work will be considered
// mergeable in next loop iteration.
Multimap<Operator<?>, ReduceSinkOperator> existingRsOps = ArrayListMultimap.create();
for (Entry<Operator<?>, Long> rsGroupInfo : sortedRSGroups) {
Operator<?> rsParent = rsGroupInfo.getKey();
for (ReduceSinkOperator discardableRsOp : parentToRsOps.get(rsParent)) {
if (removedOps.contains(discardableRsOp)) {
LOG.debug("Skip {} as it has already been removed", discardableRsOp);
continue;
}
Collection<ReduceSinkOperator> otherRsOps = existingRsOps.get(rsParent);
for (ReduceSinkOperator retainableRsOp : otherRsOps) {
if (removedOps.contains(retainableRsOp)) {
LOG.debug("Skip {} as it has already been removed", retainableRsOp);
continue;
}
// First we quickly check if the two RS operators can actually be merged.
// We already know that these two RS operators have the same parent, but
// we need to check whether both RS are actually equal. Further, we check
// whether their child is also equal. If any of these conditions are not
// met, we are not going to try to merge.
boolean mergeable = compareOperator(pctx, retainableRsOp, discardableRsOp) && compareOperator(pctx, retainableRsOp.getChildOperators().get(0), discardableRsOp.getChildOperators().get(0));
if (!mergeable) {
// Skip
LOG.debug("{} and {} cannot be merged", retainableRsOp, discardableRsOp);
continue;
}
LOG.debug("Checking additional conditions for merging subtree starting at {}" + " into subtree starting at {}", discardableRsOp, retainableRsOp);
// Secondly, we extract information about the part of the tree that can be merged
// as well as some structural information (memory consumption) that needs to be
// used to determined whether the merge can happen
Operator<?> retainableRsOpChild = retainableRsOp.getChildOperators().get(0);
Operator<?> discardableRsOpChild = discardableRsOp.getChildOperators().get(0);
SharedResult sr = extractSharedOptimizationInfo(pctx, optimizerCache, retainableRsOp, discardableRsOp, retainableRsOpChild, discardableRsOpChild);
// tables.
if (sr.retainableOps.isEmpty() || !validPreConditions(pctx, optimizerCache, sr)) {
// Skip
LOG.debug("{} and {} do not meet preconditions", retainableRsOp, discardableRsOp);
continue;
}
// We can merge
Operator<?> lastRetainableOp = sr.retainableOps.get(sr.retainableOps.size() - 1);
Operator<?> lastDiscardableOp = sr.discardableOps.get(sr.discardableOps.size() - 1);
if (lastDiscardableOp.getNumChild() != 0) {
List<Operator<? extends OperatorDesc>> allChildren = Lists.newArrayList(lastDiscardableOp.getChildOperators());
for (Operator<? extends OperatorDesc> op : allChildren) {
lastDiscardableOp.getChildOperators().remove(op);
op.replaceParent(lastDiscardableOp, lastRetainableOp);
lastRetainableOp.getChildOperators().add(op);
}
}
LOG.debug("Merging subtree starting at {} into subtree starting at {}", discardableRsOp, retainableRsOp);
// we are going to eliminate
for (Operator<?> op : sr.discardableInputOps) {
OperatorUtils.removeOperator(op);
optimizerCache.removeOp(op);
removedOps.add(op);
// Remove DPP predicates
if (op instanceof ReduceSinkOperator) {
SemiJoinBranchInfo sjbi = pctx.getRsToSemiJoinBranchInfo().get(op);
if (sjbi != null && !sr.discardableOps.contains(sjbi.getTsOp()) && !sr.discardableInputOps.contains(sjbi.getTsOp())) {
GenTezUtils.removeSemiJoinOperator(pctx, (ReduceSinkOperator) op, sjbi.getTsOp());
optimizerCache.tableScanToDPPSource.remove(sjbi.getTsOp(), op);
}
} else if (op instanceof AppMasterEventOperator) {
DynamicPruningEventDesc dped = (DynamicPruningEventDesc) op.getConf();
if (!sr.discardableOps.contains(dped.getTableScan()) && !sr.discardableInputOps.contains(dped.getTableScan())) {
GenTezUtils.removeSemiJoinOperator(pctx, (AppMasterEventOperator) op, dped.getTableScan());
optimizerCache.tableScanToDPPSource.remove(dped.getTableScan(), op);
}
}
LOG.debug("Input operator removed: {}", op);
}
// We remove the discardable RS operator
OperatorUtils.removeOperator(discardableRsOp);
optimizerCache.removeOp(discardableRsOp);
removedOps.add(discardableRsOp);
LOG.debug("Operator removed: {}", discardableRsOp);
// Then we merge the operators of the works we are going to merge
optimizerCache.removeOpAndCombineWork(discardableRsOpChild, retainableRsOpChild);
// Finally we remove the rest of the expression from the tree
for (Operator<?> op : sr.discardableOps) {
OperatorUtils.removeOperator(op);
optimizerCache.removeOp(op);
removedOps.add(op);
LOG.debug("Operator removed: {}", op);
}
break;
}
if (removedOps.contains(discardableRsOp)) {
// This operator has been removed, remove it from the list of existing operators
existingRsOps.remove(rsParent, discardableRsOp);
} else {
// This operator has not been removed, include it in the list of existing operators
existingRsOps.put(rsParent, discardableRsOp);
}
}
}
// We gather the operators that will be used for next iteration of extended optimization
// (if any)
parentToRsOps = ArrayListMultimap.create();
visited = new HashSet<>();
for (Entry<Operator<?>, ReduceSinkOperator> e : existingRsOps.entries()) {
if (removedOps.contains(e.getValue()) || e.getValue().getNumChild() < 1) {
// semijoin RS), we can quickly skip this one
continue;
}
gatherReduceSinkOpsByInput(parentToRsOps, visited, findWorkOperators(optimizerCache, e.getValue().getChildOperators().get(0)));
}
}
// Remove unused table scan operators
it = topOps.entrySet().iterator();
while (it.hasNext()) {
Entry<String, TableScanOperator> e = it.next();
if (e.getValue().getNumChild() == 0) {
it.remove();
}
}
if (LOG.isDebugEnabled()) {
LOG.debug("After SharedWorkExtendedOptimizer:\n" + Operator.toString(pctx.getTopOps().values()));
}
}
// we use the basic or the extended version of the optimizer.
if (pctx.getConf().getBoolVar(ConfVars.HIVE_IN_TEST)) {
Set<Operator<?>> visited = new HashSet<>();
it = topOps.entrySet().iterator();
while (it.hasNext()) {
Entry<String, TableScanOperator> e = it.next();
for (Operator<?> op : OperatorUtils.findOperators(e.getValue(), Operator.class)) {
if (!visited.contains(op)) {
if (!findWorkOperators(optimizerCache, op).equals(findWorkOperators(op, new HashSet<Operator<?>>()))) {
throw new SemanticException("Error in shared work optimizer: operator cache contents" + "and actual plan differ");
}
visited.add(op);
}
}
}
}
return pctx;
}
use of org.apache.hadoop.hive.ql.exec.ReduceSinkOperator in project hive by apache.
the class SharedWorkOptimizer method findChildWorkOperators.
private static Set<Operator<?>> findChildWorkOperators(ParseContext pctx, SharedWorkOptimizerCache optimizerCache, Operator<?> start) {
// Find operators in work
Set<Operator<?>> workOps = findWorkOperators(optimizerCache, start);
// Gather output works operators
Set<Operator<?>> set = new HashSet<Operator<?>>();
for (Operator<?> op : workOps) {
if (op instanceof ReduceSinkOperator) {
if (op.getChildOperators() != null) {
// All children of RS are descendants
for (Operator<?> child : op.getChildOperators()) {
set.addAll(findWorkOperators(optimizerCache, child));
}
}
// Semijoin DPP work is considered a child because work needs
// to finish for it to execute
SemiJoinBranchInfo sjbi = pctx.getRsToSemiJoinBranchInfo().get(op);
if (sjbi != null) {
set.addAll(findWorkOperators(optimizerCache, sjbi.getTsOp()));
}
} else if (op.getConf() instanceof DynamicPruningEventDesc) {
// DPP work is considered a child because work needs
// to finish for it to execute
set.addAll(findWorkOperators(optimizerCache, ((DynamicPruningEventDesc) op.getConf()).getTableScan()));
}
}
return set;
}
use of org.apache.hadoop.hive.ql.exec.ReduceSinkOperator in project hive by apache.
the class SparkTask method getOperatorCounters.
private Map<String, List<String>> getOperatorCounters() {
String groupName = HiveConf.getVar(conf, HiveConf.ConfVars.HIVECOUNTERGROUP);
Map<String, List<String>> counters = new HashMap<String, List<String>>();
List<String> hiveCounters = new LinkedList<String>();
counters.put(groupName, hiveCounters);
hiveCounters.add(Operator.HIVE_COUNTER_CREATED_FILES);
// Spark transformation and Hive operators in SparkWork.
for (MapOperator.Counter counter : MapOperator.Counter.values()) {
hiveCounters.add(counter.toString());
}
SparkWork sparkWork = this.getWork();
for (BaseWork work : sparkWork.getAllWork()) {
for (Operator<? extends OperatorDesc> operator : work.getAllOperators()) {
if (operator instanceof FileSinkOperator) {
for (FileSinkOperator.Counter counter : FileSinkOperator.Counter.values()) {
hiveCounters.add(((FileSinkOperator) operator).getCounterName(counter));
}
} else if (operator instanceof ReduceSinkOperator) {
final String contextName = conf.get(Operator.CONTEXT_NAME_KEY, "");
for (ReduceSinkOperator.Counter counter : ReduceSinkOperator.Counter.values()) {
hiveCounters.add(Utilities.getVertexCounterName(counter.name(), contextName));
}
} else if (operator instanceof ScriptOperator) {
for (ScriptOperator.Counter counter : ScriptOperator.Counter.values()) {
hiveCounters.add(counter.toString());
}
} else if (operator instanceof JoinOperator) {
for (JoinOperator.SkewkeyTableCounter counter : JoinOperator.SkewkeyTableCounter.values()) {
hiveCounters.add(counter.toString());
}
}
}
}
return counters;
}
use of org.apache.hadoop.hive.ql.exec.ReduceSinkOperator in project hive by apache.
the class SemanticAnalyzer method genCommonGroupByPlanReduceSinkOperator.
@SuppressWarnings("nls")
private ReduceSinkOperator genCommonGroupByPlanReduceSinkOperator(QB qb, List<String> dests, Operator inputOperatorInfo) throws SemanticException {
RowResolver reduceSinkInputRowResolver = opParseCtx.get(inputOperatorInfo).getRowResolver();
QBParseInfo parseInfo = qb.getParseInfo();
RowResolver reduceSinkOutputRowResolver = new RowResolver();
reduceSinkOutputRowResolver.setIsExprResolver(true);
Map<String, ExprNodeDesc> colExprMap = new HashMap<String, ExprNodeDesc>();
// The group by keys and distinct keys should be the same for all dests, so using the first
// one to produce these will be the same as using any other.
String dest = dests.get(0);
// Pre-compute group-by keys and store in reduceKeys
List<String> outputKeyColumnNames = new ArrayList<String>();
List<String> outputValueColumnNames = new ArrayList<String>();
List<ASTNode> grpByExprs = getGroupByForClause(parseInfo, dest);
ArrayList<ExprNodeDesc> reduceKeys = getReduceKeysForReduceSink(grpByExprs, dest, reduceSinkInputRowResolver, reduceSinkOutputRowResolver, outputKeyColumnNames, colExprMap);
int keyLength = reduceKeys.size();
List<List<Integer>> distinctColIndices = getDistinctColIndicesForReduceSink(parseInfo, dest, reduceKeys, reduceSinkInputRowResolver, reduceSinkOutputRowResolver, outputKeyColumnNames, colExprMap);
ArrayList<ExprNodeDesc> reduceValues = new ArrayList<ExprNodeDesc>();
// them
for (String destination : dests) {
getReduceValuesForReduceSinkNoMapAgg(parseInfo, destination, reduceSinkInputRowResolver, reduceSinkOutputRowResolver, outputValueColumnNames, reduceValues, colExprMap);
// Need to pass all of the columns used in the where clauses as reduce values
ASTNode whereClause = parseInfo.getWhrForClause(destination);
if (whereClause != null) {
assert whereClause.getChildCount() == 1;
ASTNode predicates = (ASTNode) whereClause.getChild(0);
Map<ASTNode, ExprNodeDesc> nodeOutputs = genAllExprNodeDesc(predicates, reduceSinkInputRowResolver);
removeMappingForKeys(predicates, nodeOutputs, reduceKeys);
// extract columns missing in current RS key/value
for (Map.Entry<ASTNode, ExprNodeDesc> entry : nodeOutputs.entrySet()) {
ASTNode parameter = entry.getKey();
ExprNodeDesc expression = entry.getValue();
if (!(expression instanceof ExprNodeColumnDesc)) {
continue;
}
if (ExprNodeDescUtils.indexOf(expression, reduceValues) >= 0) {
continue;
}
String internalName = getColumnInternalName(reduceValues.size());
String field = Utilities.ReduceField.VALUE.toString() + "." + internalName;
reduceValues.add(expression);
outputValueColumnNames.add(internalName);
reduceSinkOutputRowResolver.putExpression(parameter, new ColumnInfo(field, expression.getTypeInfo(), null, false));
colExprMap.put(field, expression);
}
}
}
// Optimize the scenario when there are no grouping keys - only 1 reducer is needed
int numReducers = -1;
if (grpByExprs.isEmpty()) {
numReducers = 1;
}
ReduceSinkDesc rsDesc = PlanUtils.getReduceSinkDesc(reduceKeys, keyLength, reduceValues, distinctColIndices, outputKeyColumnNames, outputValueColumnNames, true, -1, keyLength, numReducers, AcidUtils.Operation.NOT_ACID);
ReduceSinkOperator rsOp = (ReduceSinkOperator) putOpInsertMap(OperatorFactory.getAndMakeChild(rsDesc, new RowSchema(reduceSinkOutputRowResolver.getColumnInfos()), inputOperatorInfo), reduceSinkOutputRowResolver);
rsOp.setColumnExprMap(colExprMap);
return rsOp;
}
use of org.apache.hadoop.hive.ql.exec.ReduceSinkOperator in project hive by apache.
the class SemanticAnalyzer method genGroupByPlan1MR.
/**
* Generate a Group-By plan using a single map-reduce job (3 operators will be
* inserted):
*
* ReduceSink ( keys = (K1_EXP, K2_EXP, DISTINCT_EXP), values = (A1_EXP,
* A2_EXP) ) SortGroupBy (keys = (KEY.0,KEY.1), aggregations =
* (count_distinct(KEY.2), sum(VALUE.0), count(VALUE.1))) Select (final
* selects).
*
* @param dest
* @param qb
* @param input
* @return
* @throws SemanticException
*
* Generate a Group-By plan using 1 map-reduce job. Spray by the
* group by key, and sort by the distinct key (if any), and compute
* aggregates * The aggregation evaluation functions are as
* follows: Partitioning Key: grouping key
*
* Sorting Key: grouping key if no DISTINCT grouping + distinct key
* if DISTINCT
*
* Reducer: iterate/merge (mode = COMPLETE)
*/
@SuppressWarnings({ "nls" })
private Operator genGroupByPlan1MR(String dest, QB qb, Operator input) throws SemanticException {
QBParseInfo parseInfo = qb.getParseInfo();
int numReducers = -1;
ObjectPair<List<ASTNode>, List<Long>> grpByExprsGroupingSets = getGroupByGroupingSetsForClause(parseInfo, dest);
List<ASTNode> grpByExprs = grpByExprsGroupingSets.getFirst();
List<Long> groupingSets = grpByExprsGroupingSets.getSecond();
if (grpByExprs.isEmpty()) {
numReducers = 1;
}
// Grouping sets are not allowed
if (!groupingSets.isEmpty()) {
throw new SemanticException(ErrorMsg.HIVE_GROUPING_SETS_AGGR_NOMAPAGGR.getMsg());
}
// ////// 1. Generate ReduceSinkOperator
ReduceSinkOperator reduceSinkOperatorInfo = genGroupByPlanReduceSinkOperator(qb, dest, input, grpByExprs, grpByExprs.size(), false, numReducers, false, false);
// ////// 2. Generate GroupbyOperator
Operator groupByOperatorInfo = genGroupByPlanGroupByOperator(parseInfo, dest, reduceSinkOperatorInfo, reduceSinkOperatorInfo, GroupByDesc.Mode.COMPLETE, null);
return groupByOperatorInfo;
}
Aggregations