Use of org.apache.hadoop.hive.ql.plan.ReduceSinkDesc in project hive by apache.
The class SemanticAnalyzer, method genJoinOperatorChildren.
private Operator genJoinOperatorChildren(QBJoinTree join, Operator left, Operator[] right, Set<Integer> omitOpts, ExprNodeDesc[][] joinKeys) throws SemanticException {
RowResolver outputRR = new RowResolver();
List<String> outputColumnNames = new ArrayList<String>();
// all children are base classes
Operator<?>[] rightOps = new Operator[right.length];
Map<String, Byte> reversedExprs = new HashMap<String, Byte>();
Map<Byte, List<ExprNodeDesc>> exprMap = new HashMap<Byte, List<ExprNodeDesc>>();
Map<String, ExprNodeDesc> colExprMap = new HashMap<String, ExprNodeDesc>();
Map<Integer, Set<String>> posToAliasMap = new HashMap<Integer, Set<String>>();
Map<Byte, List<ExprNodeDesc>> filterMap = new HashMap<Byte, List<ExprNodeDesc>>();
// Only used for semijoin with residual predicates
List<ColumnInfo> topSelectInputColumns = new ArrayList<>();
for (int pos = 0; pos < right.length; ++pos) {
Operator<?> input = right[pos] == null ? left : right[pos];
if (input == null) {
input = left;
}
ReduceSinkOperator rs = (ReduceSinkOperator) input;
if (rs.getNumParent() != 1) {
throw new SemanticException("RS should have single parent");
}
Operator<?> parent = rs.getParentOperators().get(0);
ReduceSinkDesc rsDesc = (ReduceSinkDesc) (input.getConf());
int[] index = rs.getValueIndex();
List<ExprNodeDesc> valueDesc = new ArrayList<ExprNodeDesc>();
List<ExprNodeDesc> filterDesc = new ArrayList<ExprNodeDesc>();
Byte tag = (byte) rsDesc.getTag();
// we will add a Select on top of the join
if (omitOpts != null && omitOpts.contains(pos) && join.getPostJoinFilters().size() == 0) {
exprMap.put(tag, valueDesc);
filterMap.put(tag, filterDesc);
rightOps[pos] = input;
continue;
}
List<String> keyColNames = rsDesc.getOutputKeyColumnNames();
List<String> valColNames = rsDesc.getOutputValueColumnNames();
// prepare output descriptors for the input op
RowResolver inputRR = opParseCtx.get(input).getRowResolver();
RowResolver parentRR = opParseCtx.get(parent).getRowResolver();
posToAliasMap.put(pos, new HashSet<String>(inputRR.getTableNames()));
List<ColumnInfo> columns = parentRR.getColumnInfos();
for (int i = 0; i < index.length; i++) {
ColumnInfo prev = columns.get(i);
String[] nm = parentRR.reverseLookup(prev.getInternalName());
String[] nm2 = parentRR.getAlternateMappings(prev.getInternalName());
if (outputRR.get(nm[0], nm[1]) != null) {
continue;
}
ColumnInfo info = new ColumnInfo(prev);
String field;
if (index[i] >= 0) {
field = Utilities.ReduceField.KEY + "." + keyColNames.get(index[i]);
} else {
field = Utilities.ReduceField.VALUE + "." + valColNames.get(-index[i] - 1);
}
String internalName = getColumnInternalName(outputColumnNames.size());
ExprNodeColumnDesc desc = new ExprNodeColumnDesc(info.getType(), field, info.getTabAlias(), info.getIsVirtualCol());
info.setInternalName(internalName);
colExprMap.put(internalName, desc);
outputRR.put(nm[0], nm[1], info);
if (nm2 != null) {
outputRR.addMappingOnly(nm2[0], nm2[1], info);
}
valueDesc.add(desc);
outputColumnNames.add(internalName);
reversedExprs.put(internalName, tag);
// Populate semijoin select if needed
if (omitOpts == null || !omitOpts.contains(pos)) {
topSelectInputColumns.add(info);
}
}
for (ASTNode cond : join.getFilters().get(tag)) {
filterDesc.add(genExprNodeDesc(cond, inputRR));
}
exprMap.put(tag, valueDesc);
filterMap.put(tag, filterDesc);
rightOps[pos] = input;
}
JoinCondDesc[] joinCondns = new JoinCondDesc[join.getJoinCond().length];
for (int i = 0; i < join.getJoinCond().length; i++) {
JoinCond condn = join.getJoinCond()[i];
joinCondns[i] = new JoinCondDesc(condn);
}
JoinDesc desc = new JoinDesc(exprMap, outputColumnNames, join.getNoOuterJoin(), joinCondns, filterMap, joinKeys, null);
desc.setReversedExprs(reversedExprs);
desc.setFilterMap(join.getFilterMap());
// Add filters that apply to more than one input
if (join.getPostJoinFilters().size() != 0 && (!join.getNoOuterJoin() || !join.getNoSemiJoin() || HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_PUSH_RESIDUAL_INNER))) {
LOG.debug("Generate JOIN with post-filtering conditions");
List<ExprNodeDesc> residualFilterExprs = new ArrayList<ExprNodeDesc>();
for (ASTNode cond : join.getPostJoinFilters()) {
residualFilterExprs.add(genExprNodeDesc(cond, outputRR, false, isCBOExecuted()));
}
desc.setResidualFilterExprs(residualFilterExprs);
// Clean post-conditions
join.getPostJoinFilters().clear();
}
JoinOperator joinOp = (JoinOperator) OperatorFactory.getAndMakeChild(getOpContext(), desc, new RowSchema(outputRR.getColumnInfos()), rightOps);
joinOp.setColumnExprMap(colExprMap);
joinOp.setPosToAliasMap(posToAliasMap);
if (join.getNullSafes() != null) {
boolean[] nullsafes = new boolean[join.getNullSafes().size()];
for (int i = 0; i < nullsafes.length; i++) {
nullsafes[i] = join.getNullSafes().get(i);
}
desc.setNullSafes(nullsafes);
}
Operator<?> topOp = putOpInsertMap(joinOp, outputRR);
if (omitOpts != null && !omitOpts.isEmpty() && desc.getResidualFilterExprs() != null && !desc.getResidualFilterExprs().isEmpty()) {
// Adding a select operator to top of semijoin to ensure projection of only correct columns
final List<ExprNodeDesc> topSelectExprs = new ArrayList<>();
final List<String> topSelectOutputColNames = new ArrayList<>();
final RowResolver topSelectRR = new RowResolver();
final Map<String, ExprNodeDesc> topSelectColExprMap = new HashMap<String, ExprNodeDesc>();
for (ColumnInfo colInfo : topSelectInputColumns) {
ExprNodeColumnDesc columnExpr = new ExprNodeColumnDesc(colInfo);
topSelectExprs.add(columnExpr);
topSelectOutputColNames.add(colInfo.getInternalName());
topSelectColExprMap.put(colInfo.getInternalName(), columnExpr);
String[] nm = outputRR.reverseLookup(columnExpr.getColumn());
String[] nm2 = outputRR.getAlternateMappings(columnExpr.getColumn());
topSelectRR.put(nm[0], nm[1], colInfo);
if (nm2 != null) {
topSelectRR.addMappingOnly(nm2[0], nm2[1], colInfo);
}
}
final SelectDesc topSelect = new SelectDesc(topSelectExprs, topSelectOutputColNames);
topOp = putOpInsertMap(OperatorFactory.getAndMakeChild(topSelect, new RowSchema(topSelectRR.getColumnInfos()), topOp), topSelectRR);
topOp.setColumnExprMap(topSelectColExprMap);
}
return topOp;
}
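A note on the decoding step in the loop above: ReduceSinkOperator.getValueIndex() uses a sign convention, where a non-negative entry selects an output key column and a negative entry v selects output value column -v - 1, which is why the generated field name is prefixed with either ReduceField.KEY or ReduceField.VALUE. The following self-contained sketch only illustrates that convention; the column names and the index array are hypothetical, not taken from a real plan.

import java.util.Arrays;
import java.util.List;

public class ValueIndexDecoding {

  // Mirrors the branch in genJoinOperatorChildren: index[i] >= 0 selects a KEY
  // column, index[i] < 0 selects VALUE column number (-index[i] - 1).
  static String decode(int idx, List<String> keyColNames, List<String> valColNames) {
    return idx >= 0
        ? "KEY." + keyColNames.get(idx)
        : "VALUE." + valColNames.get(-idx - 1);
  }

  public static void main(String[] args) {
    List<String> keyColNames = Arrays.asList("reducesinkkey0", "reducesinkkey1");
    List<String> valColNames = Arrays.asList("_col0", "_col1");
    int[] index = { 0, -1, 1, -2 };  // hypothetical value index of a ReduceSink
    for (int idx : index) {
      System.out.println(decode(idx, keyColNames, valColNames));
    }
    // Prints: KEY.reducesinkkey0, VALUE._col0, KEY.reducesinkkey1, VALUE._col1
  }
}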
Use of org.apache.hadoop.hive.ql.plan.ReduceSinkDesc in project hive by apache.
The class SemanticAnalyzer, method genGroupByPlanGroupByOperator1.
/**
* Generate the GroupByOperator for the Query Block (parseInfo.getXXX(dest)).
* The new GroupByOperator will be a child of the reduceSinkOperatorInfo.
*
* @param parseInfo
* @param dest
* @param reduceSinkOperatorInfo
* @param mode
* The mode of the aggregation (MERGEPARTIAL, PARTIAL2)
* @param genericUDAFEvaluators
* The mapping from Aggregation StringTree to the
* genericUDAFEvaluator.
* @param groupingSets
* list of grouping sets
* @param groupingSetsPresent
* whether grouping sets are present in this query
* @param groupingSetsNeedAdditionalMRJob
* whether grouping sets are consumed by this group by
* @return the new GroupByOperator
*/
@SuppressWarnings("nls")
private Operator genGroupByPlanGroupByOperator1(QBParseInfo parseInfo, String dest, Operator reduceSinkOperatorInfo, GroupByDesc.Mode mode, Map<String, GenericUDAFEvaluator> genericUDAFEvaluators, List<Long> groupingSets, boolean groupingSetsPresent, boolean groupingSetsNeedAdditionalMRJob) throws SemanticException {
List<String> outputColumnNames = new ArrayList<String>();
RowResolver groupByInputRowResolver = opParseCtx.get(reduceSinkOperatorInfo).getRowResolver();
RowResolver groupByOutputRowResolver = new RowResolver();
groupByOutputRowResolver.setIsExprResolver(true);
List<ExprNodeDesc> groupByKeys = new ArrayList<ExprNodeDesc>();
List<AggregationDesc> aggregations = new ArrayList<AggregationDesc>();
List<ASTNode> grpByExprs = getGroupByForClause(parseInfo, dest);
Map<String, ExprNodeDesc> colExprMap = new HashMap<String, ExprNodeDesc>();
for (int i = 0; i < grpByExprs.size(); ++i) {
ASTNode grpbyExpr = grpByExprs.get(i);
ColumnInfo exprInfo = groupByInputRowResolver.getExpression(grpbyExpr);
if (exprInfo == null) {
throw new SemanticException(ASTErrorUtils.getMsg(ErrorMsg.INVALID_COLUMN.getMsg(), grpbyExpr));
}
groupByKeys.add(new ExprNodeColumnDesc(exprInfo));
String field = getColumnInternalName(i);
outputColumnNames.add(field);
ColumnInfo oColInfo = new ColumnInfo(field, exprInfo.getType(), "", false);
groupByOutputRowResolver.putExpression(grpbyExpr, oColInfo);
addAlternateGByKeyMappings(grpbyExpr, oColInfo, reduceSinkOperatorInfo, groupByOutputRowResolver);
colExprMap.put(field, groupByKeys.get(groupByKeys.size() - 1));
}
// This is only needed if a new grouping set key is being created
int groupingSetsPosition = -1;
// For grouping sets, add a dummy grouping key
if (groupingSetsPresent) {
groupingSetsPosition = groupByKeys.size();
// This function is called for GroupBy2 to add grouping id as part of the groupby keys
if (!groupingSetsNeedAdditionalMRJob) {
addGroupingSetKey(groupByKeys, groupByInputRowResolver, groupByOutputRowResolver, outputColumnNames, colExprMap);
} else {
// The grouping set has not yet been processed. Create a new grouping key
// Consider the query: select a,b, count(1) from T group by a,b with cube;
// where it is being executed in 2 map-reduce jobs
// The plan for 1st MR is TableScan -> GroupBy1 -> ReduceSink -> GroupBy2 -> FileSink
// GroupBy1/ReduceSink worked as if grouping sets were not present
// This function is called for GroupBy2 to create new rows for grouping sets
// For each input row (a,b), 4 rows are created for the example above:
// (a,b), (a,null), (null, b), (null, null)
createNewGroupingKey(groupByKeys, outputColumnNames, groupByOutputRowResolver, colExprMap);
}
}
Map<String, ASTNode> aggregationTrees = parseInfo.getAggregationExprsForClause(dest);
// get the last colName for the reduce KEY
// it represents the column name corresponding to distinct aggr, if any
String lastKeyColName = null;
List<ExprNodeDesc> reduceValues = null;
if (reduceSinkOperatorInfo.getConf() instanceof ReduceSinkDesc) {
List<String> inputKeyCols = ((ReduceSinkDesc) reduceSinkOperatorInfo.getConf()).getOutputKeyColumnNames();
if (inputKeyCols.size() > 0) {
lastKeyColName = inputKeyCols.get(inputKeyCols.size() - 1);
}
reduceValues = ((ReduceSinkDesc) reduceSinkOperatorInfo.getConf()).getValueCols();
}
int numDistinctUDFs = 0;
boolean containsDistinctAggr = false;
for (Map.Entry<String, ASTNode> entry : aggregationTrees.entrySet()) {
ASTNode value = entry.getValue();
String aggName = unescapeIdentifier(value.getChild(0).getText());
List<ExprNodeDesc> aggParameters = new ArrayList<ExprNodeDesc>();
boolean isDistinct = (value.getType() == HiveParser.TOK_FUNCTIONDI);
containsDistinctAggr = containsDistinctAggr || isDistinct;
// For a distinct aggregation the parameters have not been partially aggregated on the map
// side, so always look for the parameters: d+e
if (isDistinct) {
// 0 is the function name
for (int i = 1; i < value.getChildCount(); i++) {
ASTNode paraExpr = (ASTNode) value.getChild(i);
ColumnInfo paraExprInfo = groupByInputRowResolver.getExpression(paraExpr);
if (paraExprInfo == null) {
throw new SemanticException(ASTErrorUtils.getMsg(ErrorMsg.INVALID_COLUMN.getMsg(), paraExpr));
}
String paraExpression = paraExprInfo.getInternalName();
assert (paraExpression != null);
if (lastKeyColName != null) {
// if aggr is distinct, the parameter name is constructed as
// KEY.lastKeyColName:<tag>._colx
paraExpression = Utilities.ReduceField.KEY.name() + "." + lastKeyColName + ":" + numDistinctUDFs + "." + getColumnInternalName(i - 1);
}
ExprNodeDesc expr = new ExprNodeColumnDesc(paraExprInfo.getType(), paraExpression, paraExprInfo.getTabAlias(), paraExprInfo.getIsVirtualCol());
ExprNodeDesc reduceValue = isConstantParameterInAggregationParameters(paraExprInfo.getInternalName(), reduceValues);
if (reduceValue != null) {
// this parameter is a constant
expr = reduceValue;
}
aggParameters.add(expr);
}
} else {
ColumnInfo paraExprInfo = groupByInputRowResolver.getExpression(value);
if (paraExprInfo == null) {
throw new SemanticException(ASTErrorUtils.getMsg(ErrorMsg.INVALID_COLUMN.getMsg(), value));
}
String paraExpression = paraExprInfo.getInternalName();
assert (paraExpression != null);
aggParameters.add(new ExprNodeColumnDesc(paraExprInfo.getType(), paraExpression, paraExprInfo.getTabAlias(), paraExprInfo.getIsVirtualCol()));
}
if (isDistinct) {
numDistinctUDFs++;
}
Mode amode = groupByDescModeToUDAFMode(mode, isDistinct);
GenericUDAFEvaluator genericUDAFEvaluator = null;
genericUDAFEvaluator = genericUDAFEvaluators.get(entry.getKey());
assert (genericUDAFEvaluator != null);
GenericUDAFInfo udaf = getGenericUDAFInfo(genericUDAFEvaluator, amode, aggParameters);
aggregations.add(new AggregationDesc(aggName.toLowerCase(), udaf.genericUDAFEvaluator, udaf.convertedParameters, (mode != GroupByDesc.Mode.FINAL && isDistinct), amode));
String field = getColumnInternalName(groupByKeys.size() + aggregations.size() - 1);
outputColumnNames.add(field);
groupByOutputRowResolver.putExpression(value, new ColumnInfo(field, udaf.returnType, "", false));
}
float groupByMemoryUsage = HiveConf.getFloatVar(conf, HiveConf.ConfVars.HIVEMAPAGGRHASHMEMORY);
float memoryThreshold = HiveConf.getFloatVar(conf, HiveConf.ConfVars.HIVEMAPAGGRMEMORYTHRESHOLD);
float minReductionHashAggr = HiveConf.getFloatVar(conf, HiveConf.ConfVars.HIVEMAPAGGRHASHMINREDUCTION);
float minReductionHashAggrLowerBound = HiveConf.getFloatVar(conf, ConfVars.HIVEMAPAGGRHASHMINREDUCTIONLOWERBOUND);
// Nothing special needs to be done for grouping sets if
// this is the final group by operator, and multiple rows corresponding to the
// grouping sets have been generated upstream.
// However, if an additional MR job has been created to handle grouping sets,
// additional rows corresponding to grouping sets need to be created here.
Operator op = putOpInsertMap(OperatorFactory.getAndMakeChild(new GroupByDesc(mode, outputColumnNames, groupByKeys, aggregations, groupByMemoryUsage, memoryThreshold, minReductionHashAggr, minReductionHashAggrLowerBound, groupingSets, groupingSetsPresent && groupingSetsNeedAdditionalMRJob, groupingSetsPosition, containsDistinctAggr), new RowSchema(groupByOutputRowResolver.getColumnInfos()), reduceSinkOperatorInfo), groupByOutputRowResolver);
op.setColumnExprMap(colExprMap);
return op;
}
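The comment block inside the groupingSetsNeedAdditionalMRJob branch describes the row multiplication that the second group by has to perform for a CUBE. The sketch below is not Hive code; it simply expands one input row (a, b) into the four rows that "group by a, b with cube" implies, which is the effect createNewGroupingKey prepares for.

public class CubeExpansion {

  // Expands one input row into the four rows a two-column CUBE produces:
  // (a, b), (a, null), (null, b), (null, null).
  static String[] expand(String a, String b) {
    return new String[] {
        "(" + a + ", " + b + ")",
        "(" + a + ", null)",
        "(null, " + b + ")",
        "(null, null)"
    };
  }

  public static void main(String[] args) {
    // select a, b, count(1) from T group by a, b with cube
    for (String row : expand("x", "y")) {
      System.out.println(row);
    }
  }
}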
Use of org.apache.hadoop.hive.ql.plan.ReduceSinkDesc in project hive by apache.
The class SemiJoinReductionMerge, method createReduceSink.
/**
* Creates a reduce sink operator that emits all columns of the parent as values.
*/
private static ReduceSinkOperator createReduceSink(Operator<?> parentOp, NullOrdering nullOrder) throws SemanticException {
List<ExprNodeDesc> valueCols = new ArrayList<>();
RowSchema parentSchema = parentOp.getSchema();
List<String> outColNames = new ArrayList<>();
for (int i = 0; i < parentSchema.getSignature().size(); i++) {
ColumnInfo colInfo = parentSchema.getSignature().get(i);
ExprNodeColumnDesc colExpr = new ExprNodeColumnDesc(colInfo.getType(), colInfo.getInternalName(), "", false);
valueCols.add(colExpr);
outColNames.add(SemanticAnalyzer.getColumnInternalName(i));
}
ReduceSinkDesc rsDesc = PlanUtils.getReduceSinkDesc(Collections.emptyList(), valueCols, outColNames, false, -1, 0, 1, AcidUtils.Operation.NOT_ACID, nullOrder);
rsDesc.setColumnExprMap(Collections.emptyMap());
return (ReduceSinkOperator) OperatorFactory.getAndMakeChild(rsDesc, new RowSchema(parentSchema), parentOp);
}
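The loop above derives the output value column names with SemanticAnalyzer.getColumnInternalName, which, as far as I can tell from the Hive codebase, follows the "_col" + position convention. The standalone sketch below uses a stand-in helper, not the Hive method itself, to show the naming scheme the loop relies on.

import java.util.ArrayList;
import java.util.List;

public class InternalColumnNames {

  // Stand-in for SemanticAnalyzer.getColumnInternalName: internal names are "_col" + position.
  static String columnInternalName(int pos) {
    return "_col" + pos;
  }

  public static void main(String[] args) {
    int parentColumnCount = 3;  // hypothetical size of the parent operator's schema
    List<String> outColNames = new ArrayList<>();
    for (int i = 0; i < parentColumnCount; i++) {
      outColNames.add(columnInternalName(i));
    }
    System.out.println(outColNames);  // [_col0, _col1, _col2]
  }
}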
Use of org.apache.hadoop.hive.ql.plan.ReduceSinkDesc in project hive by apache.
The class SetReducerParallelism, method process.
@SuppressWarnings("unchecked")
@Override
public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procContext, Object... nodeOutputs) throws SemanticException {
OptimizeTezProcContext context = (OptimizeTezProcContext) procContext;
ReduceSinkOperator sink = (ReduceSinkOperator) nd;
ReduceSinkDesc desc = sink.getConf();
long bytesPerReducer = context.conf.getLongVar(HiveConf.ConfVars.BYTESPERREDUCER);
int maxReducers = context.conf.getIntVar(HiveConf.ConfVars.MAXREDUCERS);
int constantReducers = context.conf.getIntVar(HiveConf.ConfVars.HADOOPNUMREDUCERS);
if (context.visitedReduceSinks.contains(sink)) {
// skip walking the children
LOG.debug("Already processed reduce sink: " + sink.getName());
return true;
}
context.visitedReduceSinks.add(sink);
if (desc.getNumReducers() <= 0) {
if (constantReducers > 0) {
LOG.info("Parallelism for reduce sink " + sink + " set by user to " + constantReducers);
desc.setNumReducers(constantReducers);
} else {
long numberOfBytes = 0;
// we need to add up all the estimates from the siblings of this reduce sink
for (Operator<? extends OperatorDesc> sibling : sink.getChildOperators().get(0).getParentOperators()) {
if (sibling.getStatistics() != null) {
numberOfBytes = StatsUtils.safeAdd(numberOfBytes, sibling.getStatistics().getDataSize());
} else {
LOG.warn("No stats available from: " + sibling);
}
}
int numReducers = Utilities.estimateReducers(numberOfBytes, bytesPerReducer, maxReducers, false);
LOG.info("Set parallelism for reduce sink " + sink + " to: " + numReducers);
desc.setNumReducers(numReducers);
final Collection<ExprNodeDescEqualityWrapper> keyCols = ExprNodeDescEqualityWrapper.transform(desc.getKeyCols());
final Collection<ExprNodeDescEqualityWrapper> partCols = ExprNodeDescEqualityWrapper.transform(desc.getPartitionCols());
if (keyCols != null && keyCols.equals(partCols)) {
desc.setReducerTraits(EnumSet.of(UNIFORM, AUTOPARALLEL));
} else {
desc.setReducerTraits(EnumSet.of(AUTOPARALLEL));
}
}
} else {
LOG.info("Number of reducers determined to be: " + desc.getNumReducers());
// usually controlled by bucketing
desc.setReducerTraits(EnumSet.of(FIXED));
}
return false;
}
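When no reducer count is forced, the branch above adds up the data sizes of all sibling operators feeding the same reduce stage and hands the total to Utilities.estimateReducers. A rough standalone approximation of that sizing arithmetic is sketched below (assumption: ceil-divide by bytesPerReducer, then clamp to [1, maxReducers]; the real helper has more options, such as rounding to a power of two, that are omitted here). The configuration values are examples only.

public class ReducerEstimate {

  // Approximation of Utilities.estimateReducers(totalBytes, bytesPerReducer, maxReducers, false):
  // one reducer per bytesPerReducer of input, at least 1, at most maxReducers.
  static int estimateReducers(long totalBytes, long bytesPerReducer, int maxReducers) {
    int reducers = (int) Math.ceil((double) totalBytes / bytesPerReducer);
    return Math.min(maxReducers, Math.max(1, reducers));
  }

  public static void main(String[] args) {
    long bytesPerReducer = 256L * 1024 * 1024;  // example hive.exec.reducers.bytes.per.reducer
    int maxReducers = 1009;                     // example hive.exec.reducers.max
    long numberOfBytes = 3_000_000_000L;        // summed sibling statistics (hypothetical)
    System.out.println(estimateReducers(numberOfBytes, bytesPerReducer, maxReducers));  // 12
  }
}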
Use of org.apache.hadoop.hive.ql.plan.ReduceSinkDesc in project hive by apache.
The class TopNKeyPushdownProcessor, method pushdownThroughReduceSink.
/**
* Push through ReduceSink. If the TopNKey expressions are the same as the ReduceSink expressions
* and the order is the same, we can push the TopNKey down and remove it from above the ReduceSink.
* If the TopNKey expressions share a common prefix with the ReduceSink expressions, including the
* same order, the TopNKey can be pushed through the ReduceSink using that prefix and kept above it.
*
* @param topNKey TopNKey operator to push
* @throws SemanticException when removeChildAndAdoptItsChildren was not successful
*/
private void pushdownThroughReduceSink(TopNKeyOperator topNKey) throws SemanticException {
ReduceSinkOperator reduceSink = (ReduceSinkOperator) topNKey.getParentOperators().get(0);
final ReduceSinkDesc reduceSinkDesc = reduceSink.getConf();
final TopNKeyDesc topNKeyDesc = topNKey.getConf();
CommonKeyPrefix commonKeyPrefix = CommonKeyPrefix.map(topNKeyDesc, reduceSinkDesc);
if (commonKeyPrefix.isEmpty() || commonKeyPrefix.size() == topNKeyDesc.getPartitionKeyColumns().size()) {
return;
}
LOG.debug("Pushing a copy of {} through {}", topNKey.getName(), reduceSink.getName());
final TopNKeyDesc newTopNKeyDesc = topNKeyDesc.combine(commonKeyPrefix);
pushdown((TopNKeyOperator) copyDown(reduceSink, newTopNKeyDesc));
if (topNKeyDesc.getKeyColumns().size() == commonKeyPrefix.size()) {
LOG.debug("Removing {} above {}", topNKey.getName(), reduceSink.getName());
reduceSink.removeChildAndAdoptItsChildren(topNKey);
}
}
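CommonKeyPrefix.map compares the TopNKey key columns against the ReduceSink key columns position by position, taking the sort order (and null ordering) into account, and the pushdown decision hinges on the size of that prefix. The sketch below is a simplified, hypothetical version of the prefix computation over plain column names and Hive-style '+'/'-' order strings; the real class additionally resolves expressions through the ReduceSink's column expression map.

import java.util.Arrays;
import java.util.List;

public class CommonPrefixSketch {

  // Length of the longest shared prefix where both the column and its sort order match.
  static int commonPrefixSize(List<String> topNKeyCols, String topNKeyOrder,
                              List<String> rsKeyCols, String rsOrder) {
    int limit = Math.min(Math.min(topNKeyCols.size(), rsKeyCols.size()),
                         Math.min(topNKeyOrder.length(), rsOrder.length()));
    int size = 0;
    while (size < limit
        && topNKeyCols.get(size).equals(rsKeyCols.get(size))
        && topNKeyOrder.charAt(size) == rsOrder.charAt(size)) {
      size++;
    }
    return size;
  }

  public static void main(String[] args) {
    // TopNKey orders by (a ASC, b DESC); the ReduceSink keys are (a ASC, c ASC):
    // only the first position matches, so the common prefix has size 1 and the
    // TopNKey copy pushed below the ReduceSink would use that single column.
    System.out.println(commonPrefixSize(
        Arrays.asList("a", "b"), "+-",
        Arrays.asList("a", "c"), "++"));
  }
}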