use of org.apache.hadoop.hive.ql.exec.RowSchema in project hive by apache.
the class SemanticAnalyzer method genGroupByPlanMapGroupByOperator.
/**
* Generate the map-side GroupByOperator for the Query Block
* (qb.getParseInfo().getXXX(dest)). The new GroupByOperator will be a child
* of the inputOperatorInfo.
*
* @param mode
* The mode of the aggregation (HASH)
* @param genericUDAFEvaluators
* If not null, this function will store the mapping from Aggregation
* StringTree to the genericUDAFEvaluator in this parameter, so it
* can be used in the next-stage GroupBy aggregations.
* @return the new GroupByOperator
*/
@SuppressWarnings("nls")
private Operator genGroupByPlanMapGroupByOperator(QB qb, String dest, List<ASTNode> grpByExprs, Operator inputOperatorInfo, GroupByDesc.Mode mode, Map<String, GenericUDAFEvaluator> genericUDAFEvaluators, List<Integer> groupingSetKeys, boolean groupingSetsPresent) throws SemanticException {
RowResolver groupByInputRowResolver = opParseCtx.get(inputOperatorInfo).getRowResolver();
QBParseInfo parseInfo = qb.getParseInfo();
RowResolver groupByOutputRowResolver = new RowResolver();
groupByOutputRowResolver.setIsExprResolver(true);
ArrayList<ExprNodeDesc> groupByKeys = new ArrayList<ExprNodeDesc>();
ArrayList<String> outputColumnNames = new ArrayList<String>();
ArrayList<AggregationDesc> aggregations = new ArrayList<AggregationDesc>();
Map<String, ExprNodeDesc> colExprMap = new HashMap<String, ExprNodeDesc>();
for (int i = 0; i < grpByExprs.size(); ++i) {
ASTNode grpbyExpr = grpByExprs.get(i);
ExprNodeDesc grpByExprNode = genExprNodeDesc(grpbyExpr, groupByInputRowResolver);
if ((grpByExprNode instanceof ExprNodeColumnDesc) && ExprNodeDescUtils.indexOf(grpByExprNode, groupByKeys) >= 0) {
// Skip duplicated grouping keys, it happens when define column alias.
grpByExprs.remove(i--);
continue;
}
groupByKeys.add(grpByExprNode);
String field = getColumnInternalName(i);
outputColumnNames.add(field);
groupByOutputRowResolver.putExpression(grpbyExpr, new ColumnInfo(field, grpByExprNode.getTypeInfo(), "", false));
colExprMap.put(field, groupByKeys.get(groupByKeys.size() - 1));
}
// The grouping set key is present after the grouping keys, before the distinct keys
int groupingSetsPosition = -1;
// for the grouping set (corresponding to the rollup).
if (groupingSetsPresent) {
groupingSetsPosition = groupByKeys.size();
createNewGroupingKey(groupByKeys, outputColumnNames, groupByOutputRowResolver, colExprMap);
}
// If there is a distinctFuncExp, add all parameters to the reduceKeys.
if (!parseInfo.getDistinctFuncExprsForClause(dest).isEmpty()) {
List<ASTNode> list = parseInfo.getDistinctFuncExprsForClause(dest);
for (ASTNode value : list) {
// 0 is function name
for (int i = 1; i < value.getChildCount(); i++) {
ASTNode parameter = (ASTNode) value.getChild(i);
if (groupByOutputRowResolver.getExpression(parameter) == null) {
ExprNodeDesc distExprNode = genExprNodeDesc(parameter, groupByInputRowResolver);
groupByKeys.add(distExprNode);
String field = getColumnInternalName(groupByKeys.size() - 1);
outputColumnNames.add(field);
groupByOutputRowResolver.putExpression(parameter, new ColumnInfo(field, distExprNode.getTypeInfo(), "", false));
colExprMap.put(field, groupByKeys.get(groupByKeys.size() - 1));
}
}
}
}
// For each aggregation
HashMap<String, ASTNode> aggregationTrees = parseInfo.getAggregationExprsForClause(dest);
assert (aggregationTrees != null);
boolean containsDistinctAggr = false;
for (Map.Entry<String, ASTNode> entry : aggregationTrees.entrySet()) {
ASTNode value = entry.getValue();
String aggName = unescapeIdentifier(value.getChild(0).getText());
ArrayList<ExprNodeDesc> aggParameters = new ArrayList<ExprNodeDesc>();
// 0 is the function name
for (int i = 1; i < value.getChildCount(); i++) {
ASTNode paraExpr = (ASTNode) value.getChild(i);
ExprNodeDesc paraExprNode = genExprNodeDesc(paraExpr, groupByInputRowResolver);
aggParameters.add(paraExprNode);
}
boolean isDistinct = value.getType() == HiveParser.TOK_FUNCTIONDI;
containsDistinctAggr = containsDistinctAggr || isDistinct;
boolean isAllColumns = value.getType() == HiveParser.TOK_FUNCTIONSTAR;
Mode amode = groupByDescModeToUDAFMode(mode, isDistinct);
GenericUDAFEvaluator genericUDAFEvaluator = getGenericUDAFEvaluator(aggName, aggParameters, value, isDistinct, isAllColumns);
assert (genericUDAFEvaluator != null);
GenericUDAFInfo udaf = getGenericUDAFInfo(genericUDAFEvaluator, amode, aggParameters);
aggregations.add(new AggregationDesc(aggName.toLowerCase(), udaf.genericUDAFEvaluator, udaf.convertedParameters, isDistinct, amode));
String field = getColumnInternalName(groupByKeys.size() + aggregations.size() - 1);
outputColumnNames.add(field);
if (groupByOutputRowResolver.getExpression(value) == null) {
groupByOutputRowResolver.putExpression(value, new ColumnInfo(field, udaf.returnType, "", false));
}
// GroupByOperators
if (genericUDAFEvaluators != null) {
genericUDAFEvaluators.put(entry.getKey(), genericUDAFEvaluator);
}
}
float groupByMemoryUsage = HiveConf.getFloatVar(conf, HiveConf.ConfVars.HIVEMAPAGGRHASHMEMORY);
float memoryThreshold = HiveConf.getFloatVar(conf, HiveConf.ConfVars.HIVEMAPAGGRMEMORYTHRESHOLD);
Operator op = putOpInsertMap(OperatorFactory.getAndMakeChild(new GroupByDesc(mode, outputColumnNames, groupByKeys, aggregations, false, groupByMemoryUsage, memoryThreshold, groupingSetKeys, groupingSetsPresent, groupingSetsPosition, containsDistinctAggr), new RowSchema(groupByOutputRowResolver.getColumnInfos()), inputOperatorInfo), groupByOutputRowResolver);
op.setColumnExprMap(colExprMap);
return op;
}
use of org.apache.hadoop.hive.ql.exec.RowSchema in project hive by apache.
the class SemanticAnalyzer method genGroupByPlanReduceSinkOperator2MR.
/**
* Generate the second ReduceSinkOperator for the Group By Plan
* (parseInfo.getXXX(dest)). The new ReduceSinkOperator will be a child of
* groupByOperatorInfo.
*
* The second ReduceSinkOperator will put the group by keys in the map-reduce
* sort key, and put the partial aggregation results in the map-reduce value.
*
* @param numPartitionFields
* the number of fields in the map-reduce partition key. This should
* always be the same as the number of Group By keys. We should be
* able to remove this parameter since in this phase there is no
* distinct any more.
* @return the new ReduceSinkOperator.
* @throws SemanticException
*/
@SuppressWarnings("nls")
private Operator genGroupByPlanReduceSinkOperator2MR(QBParseInfo parseInfo, String dest, Operator groupByOperatorInfo, int numPartitionFields, int numReducers, boolean groupingSetsPresent) throws SemanticException {
RowResolver reduceSinkInputRowResolver2 = opParseCtx.get(groupByOperatorInfo).getRowResolver();
RowResolver reduceSinkOutputRowResolver2 = new RowResolver();
reduceSinkOutputRowResolver2.setIsExprResolver(true);
Map<String, ExprNodeDesc> colExprMap = new HashMap<String, ExprNodeDesc>();
ArrayList<ExprNodeDesc> reduceKeys = new ArrayList<ExprNodeDesc>();
ArrayList<String> outputColumnNames = new ArrayList<String>();
// Get group-by keys and store in reduceKeys
List<ASTNode> grpByExprs = getGroupByForClause(parseInfo, dest);
for (int i = 0; i < grpByExprs.size(); ++i) {
ASTNode grpbyExpr = grpByExprs.get(i);
String field = getColumnInternalName(i);
outputColumnNames.add(field);
TypeInfo typeInfo = reduceSinkInputRowResolver2.getExpression(grpbyExpr).getType();
ExprNodeColumnDesc inputExpr = new ExprNodeColumnDesc(typeInfo, field, "", false);
reduceKeys.add(inputExpr);
ColumnInfo colInfo = new ColumnInfo(Utilities.ReduceField.KEY.toString() + "." + field, typeInfo, "", false);
reduceSinkOutputRowResolver2.putExpression(grpbyExpr, colInfo);
colExprMap.put(colInfo.getInternalName(), inputExpr);
}
// add a key for reduce sink
if (groupingSetsPresent) {
// Note that partitioning fields dont need to change, since it is either
// partitioned randomly, or by all grouping keys + distinct keys
processGroupingSetReduceSinkOperator(reduceSinkInputRowResolver2, reduceSinkOutputRowResolver2, reduceKeys, outputColumnNames, colExprMap);
}
// Get partial aggregation results and store in reduceValues
ArrayList<ExprNodeDesc> reduceValues = new ArrayList<ExprNodeDesc>();
int inputField = reduceKeys.size();
HashMap<String, ASTNode> aggregationTrees = parseInfo.getAggregationExprsForClause(dest);
for (Map.Entry<String, ASTNode> entry : aggregationTrees.entrySet()) {
String field = getColumnInternalName(inputField);
ASTNode t = entry.getValue();
TypeInfo typeInfo = reduceSinkInputRowResolver2.getExpression(t).getType();
ExprNodeColumnDesc exprDesc = new ExprNodeColumnDesc(typeInfo, field, "", false);
reduceValues.add(exprDesc);
inputField++;
String col = getColumnInternalName(reduceValues.size() - 1);
outputColumnNames.add(col);
reduceSinkOutputRowResolver2.putExpression(t, new ColumnInfo(Utilities.ReduceField.VALUE.toString() + "." + col, typeInfo, "", false));
colExprMap.put(col, exprDesc);
}
ReduceSinkOperator rsOp = (ReduceSinkOperator) putOpInsertMap(OperatorFactory.getAndMakeChild(PlanUtils.getReduceSinkDesc(reduceKeys, reduceValues, outputColumnNames, true, -1, numPartitionFields, numReducers, AcidUtils.Operation.NOT_ACID), new RowSchema(reduceSinkOutputRowResolver2.getColumnInfos()), groupByOperatorInfo), reduceSinkOutputRowResolver2);
rsOp.setColumnExprMap(colExprMap);
return rsOp;
}
use of org.apache.hadoop.hive.ql.exec.RowSchema in project hive by apache.
the class SemanticAnalyzer method genGroupByPlanGroupByOperator.
/**
* Generate the GroupByOperator for the Query Block (parseInfo.getXXX(dest)).
* The new GroupByOperator will be a child of the reduceSinkOperatorInfo.
*
* @param mode
* The mode of the aggregation (PARTIAL1 or COMPLETE)
* @param genericUDAFEvaluators
* If not null, this function will store the mapping from Aggregation
* StringTree to the genericUDAFEvaluator in this parameter, so it
* can be used in the next-stage GroupBy aggregations.
* @return the new GroupByOperator
*/
@SuppressWarnings("nls")
private Operator genGroupByPlanGroupByOperator(QBParseInfo parseInfo, String dest, Operator input, ReduceSinkOperator rs, GroupByDesc.Mode mode, Map<String, GenericUDAFEvaluator> genericUDAFEvaluators) throws SemanticException {
RowResolver groupByInputRowResolver = opParseCtx.get(input).getRowResolver();
RowResolver groupByOutputRowResolver = new RowResolver();
groupByOutputRowResolver.setIsExprResolver(true);
ArrayList<ExprNodeDesc> groupByKeys = new ArrayList<ExprNodeDesc>();
ArrayList<AggregationDesc> aggregations = new ArrayList<AggregationDesc>();
ArrayList<String> outputColumnNames = new ArrayList<String>();
Map<String, ExprNodeDesc> colExprMap = new HashMap<String, ExprNodeDesc>();
List<ASTNode> grpByExprs = getGroupByForClause(parseInfo, dest);
for (int i = 0; i < grpByExprs.size(); ++i) {
ASTNode grpbyExpr = grpByExprs.get(i);
ColumnInfo exprInfo = groupByInputRowResolver.getExpression(grpbyExpr);
if (exprInfo == null) {
throw new SemanticException(ErrorMsg.INVALID_COLUMN.getMsg(grpbyExpr));
}
groupByKeys.add(new ExprNodeColumnDesc(exprInfo.getType(), exprInfo.getInternalName(), "", false));
String field = getColumnInternalName(i);
outputColumnNames.add(field);
ColumnInfo oColInfo = new ColumnInfo(field, exprInfo.getType(), null, false);
groupByOutputRowResolver.putExpression(grpbyExpr, oColInfo);
addAlternateGByKeyMappings(grpbyExpr, oColInfo, input, groupByOutputRowResolver);
colExprMap.put(field, groupByKeys.get(groupByKeys.size() - 1));
}
// For each aggregation
HashMap<String, ASTNode> aggregationTrees = parseInfo.getAggregationExprsForClause(dest);
assert (aggregationTrees != null);
// get the last colName for the reduce KEY
// it represents the column name corresponding to distinct aggr, if any
String lastKeyColName = null;
List<String> inputKeyCols = rs.getConf().getOutputKeyColumnNames();
if (inputKeyCols.size() > 0) {
lastKeyColName = inputKeyCols.get(inputKeyCols.size() - 1);
}
List<ExprNodeDesc> reduceValues = rs.getConf().getValueCols();
int numDistinctUDFs = 0;
for (Map.Entry<String, ASTNode> entry : aggregationTrees.entrySet()) {
ASTNode value = entry.getValue();
// This is the GenericUDAF name
String aggName = unescapeIdentifier(value.getChild(0).getText());
boolean isDistinct = value.getType() == HiveParser.TOK_FUNCTIONDI;
boolean isAllColumns = value.getType() == HiveParser.TOK_FUNCTIONSTAR;
// Convert children to aggParameters
ArrayList<ExprNodeDesc> aggParameters = new ArrayList<ExprNodeDesc>();
// 0 is the function name
for (int i = 1; i < value.getChildCount(); i++) {
ASTNode paraExpr = (ASTNode) value.getChild(i);
ColumnInfo paraExprInfo = groupByInputRowResolver.getExpression(paraExpr);
if (paraExprInfo == null) {
throw new SemanticException(ErrorMsg.INVALID_COLUMN.getMsg(paraExpr));
}
String paraExpression = paraExprInfo.getInternalName();
assert (paraExpression != null);
if (isDistinct && lastKeyColName != null) {
// if aggr is distinct, the parameter is name is constructed as
// KEY.lastKeyColName:<tag>._colx
paraExpression = Utilities.ReduceField.KEY.name() + "." + lastKeyColName + ":" + numDistinctUDFs + "." + getColumnInternalName(i - 1);
}
ExprNodeDesc expr = new ExprNodeColumnDesc(paraExprInfo.getType(), paraExpression, paraExprInfo.getTabAlias(), paraExprInfo.getIsVirtualCol());
ExprNodeDesc reduceValue = isConstantParameterInAggregationParameters(paraExprInfo.getInternalName(), reduceValues);
if (reduceValue != null) {
// this parameter is a constant
expr = reduceValue;
}
aggParameters.add(expr);
}
if (isDistinct) {
numDistinctUDFs++;
}
Mode amode = groupByDescModeToUDAFMode(mode, isDistinct);
GenericUDAFEvaluator genericUDAFEvaluator = getGenericUDAFEvaluator(aggName, aggParameters, value, isDistinct, isAllColumns);
assert (genericUDAFEvaluator != null);
GenericUDAFInfo udaf = getGenericUDAFInfo(genericUDAFEvaluator, amode, aggParameters);
aggregations.add(new AggregationDesc(aggName.toLowerCase(), udaf.genericUDAFEvaluator, udaf.convertedParameters, isDistinct, amode));
String field = getColumnInternalName(groupByKeys.size() + aggregations.size() - 1);
outputColumnNames.add(field);
groupByOutputRowResolver.putExpression(value, new ColumnInfo(field, udaf.returnType, "", false));
// GroupByOperators
if (genericUDAFEvaluators != null) {
genericUDAFEvaluators.put(entry.getKey(), genericUDAFEvaluator);
}
}
float groupByMemoryUsage = HiveConf.getFloatVar(conf, HiveConf.ConfVars.HIVEMAPAGGRHASHMEMORY);
float memoryThreshold = HiveConf.getFloatVar(conf, HiveConf.ConfVars.HIVEMAPAGGRMEMORYTHRESHOLD);
Operator op = putOpInsertMap(OperatorFactory.getAndMakeChild(new GroupByDesc(mode, outputColumnNames, groupByKeys, aggregations, false, groupByMemoryUsage, memoryThreshold, null, false, -1, numDistinctUDFs > 0), new RowSchema(groupByOutputRowResolver.getColumnInfos()), input), groupByOutputRowResolver);
op.setColumnExprMap(colExprMap);
return op;
}
use of org.apache.hadoop.hive.ql.exec.RowSchema in project hive by apache.
the class SemanticAnalyzer method genNotNullFilterForJoinSourcePlan.
/*
* for inner joins push a 'is not null predicate' to the join sources for
* every non nullSafe predicate.
*/
private Operator genNotNullFilterForJoinSourcePlan(QB qb, Operator input, QBJoinTree joinTree, ExprNodeDesc[] joinKeys) throws SemanticException {
if (qb == null || joinTree == null) {
return input;
}
if (!joinTree.getNoOuterJoin()) {
return input;
}
if (joinKeys == null || joinKeys.length == 0) {
return input;
}
Map<Integer, ExprNodeDesc> hashes = new HashMap<Integer, ExprNodeDesc>();
if (input instanceof FilterOperator) {
ExprNodeDescUtils.getExprNodeColumnDesc(Arrays.asList(((FilterDesc) input.getConf()).getPredicate()), hashes);
}
ExprNodeDesc filterPred = null;
List<Boolean> nullSafes = joinTree.getNullSafes();
for (int i = 0; i < joinKeys.length; i++) {
if (nullSafes.get(i) || (joinKeys[i] instanceof ExprNodeColumnDesc && ((ExprNodeColumnDesc) joinKeys[i]).getIsPartitionColOrVirtualCol())) {
// virtual column, since those columns can never be null.
continue;
}
if (null != hashes.get(joinKeys[i].hashCode())) {
// there is already a predicate on this src.
continue;
}
List<ExprNodeDesc> args = new ArrayList<ExprNodeDesc>();
args.add(joinKeys[i]);
ExprNodeDesc nextExpr = ExprNodeGenericFuncDesc.newInstance(FunctionRegistry.getFunctionInfo("isnotnull").getGenericUDF(), args);
filterPred = filterPred == null ? nextExpr : ExprNodeDescUtils.mergePredicates(filterPred, nextExpr);
}
if (filterPred == null) {
return input;
}
OpParseContext inputCtx = opParseCtx.get(input);
RowResolver inputRR = inputCtx.getRowResolver();
if (input instanceof FilterOperator) {
FilterOperator f = (FilterOperator) input;
List<ExprNodeDesc> preds = new ArrayList<ExprNodeDesc>();
preds.add(f.getConf().getPredicate());
preds.add(filterPred);
f.getConf().setPredicate(ExprNodeDescUtils.mergePredicates(preds));
return input;
}
FilterDesc filterDesc = new FilterDesc(filterPred, false);
filterDesc.setGenerated(true);
Operator output = putOpInsertMap(OperatorFactory.getAndMakeChild(filterDesc, new RowSchema(inputRR.getColumnInfos()), input), inputRR);
if (LOG.isDebugEnabled()) {
LOG.debug("Created Filter Plan for " + qb.getId() + " row schema: " + inputRR.toString());
}
return output;
}
use of org.apache.hadoop.hive.ql.exec.RowSchema in project hive by apache.
the class SemanticAnalyzer method genLateralViewPlan.
private Operator genLateralViewPlan(QB qb, Operator op, ASTNode lateralViewTree) throws SemanticException {
RowResolver lvForwardRR = new RowResolver();
RowResolver source = opParseCtx.get(op).getRowResolver();
Map<String, ExprNodeDesc> lvfColExprMap = new HashMap<String, ExprNodeDesc>();
Map<String, ExprNodeDesc> selColExprMap = new HashMap<String, ExprNodeDesc>();
List<ExprNodeDesc> colList = new ArrayList<ExprNodeDesc>();
List<String> colNames = new ArrayList<String>();
for (ColumnInfo col : source.getColumnInfos()) {
String[] tabCol = source.reverseLookup(col.getInternalName());
lvForwardRR.put(tabCol[0], tabCol[1], col);
ExprNodeDesc colExpr = new ExprNodeColumnDesc(col);
colList.add(colExpr);
colNames.add(colExpr.getName());
lvfColExprMap.put(col.getInternalName(), colExpr);
selColExprMap.put(col.getInternalName(), colExpr.clone());
}
Operator lvForward = putOpInsertMap(OperatorFactory.getAndMakeChild(new LateralViewForwardDesc(), new RowSchema(lvForwardRR.getColumnInfos()), op), lvForwardRR);
lvForward.setColumnExprMap(lvfColExprMap);
// The order in which the two paths are added is important. The
// lateral view join operator depends on having the select operator
// give it the row first.
// Get the all path by making a select(*).
RowResolver allPathRR = opParseCtx.get(lvForward).getRowResolver();
// Operator allPath = op;
SelectDesc sDesc = new SelectDesc(colList, colNames, false);
sDesc.setSelStarNoCompute(true);
Operator allPath = putOpInsertMap(OperatorFactory.getAndMakeChild(sDesc, new RowSchema(allPathRR.getColumnInfos()), lvForward), allPathRR);
allPath.setColumnExprMap(selColExprMap);
int allColumns = allPathRR.getColumnInfos().size();
// Get the UDTF Path
QB blankQb = new QB(null, null, false);
Operator udtfPath = genSelectPlan(null, (ASTNode) lateralViewTree.getChild(0), blankQb, lvForward, null, lateralViewTree.getType() == HiveParser.TOK_LATERAL_VIEW_OUTER);
// add udtf aliases to QB
for (String udtfAlias : blankQb.getAliases()) {
qb.addAlias(udtfAlias);
}
RowResolver udtfPathRR = opParseCtx.get(udtfPath).getRowResolver();
// Merge the two into the lateral view join
// The cols of the merged result will be the combination of both the
// cols of the UDTF path and the cols of the all path. The internal
// names have to be changed to avoid conflicts
RowResolver lateralViewRR = new RowResolver();
ArrayList<String> outputInternalColNames = new ArrayList<String>();
// For PPD, we need a column to expression map so that during the walk,
// the processor knows how to transform the internal col names.
// Following steps are dependant on the fact that we called
// LVmerge.. in the above order
Map<String, ExprNodeDesc> colExprMap = new HashMap<String, ExprNodeDesc>();
LVmergeRowResolvers(allPathRR, lateralViewRR, colExprMap, outputInternalColNames);
LVmergeRowResolvers(udtfPathRR, lateralViewRR, colExprMap, outputInternalColNames);
Operator lateralViewJoin = putOpInsertMap(OperatorFactory.getAndMakeChild(new LateralViewJoinDesc(allColumns, outputInternalColNames), new RowSchema(lateralViewRR.getColumnInfos()), allPath, udtfPath), lateralViewRR);
lateralViewJoin.setColumnExprMap(colExprMap);
return lateralViewJoin;
}
Aggregations