Use of org.apache.hadoop.hive.ql.exec.ColumnInfo in project hive by apache.
The class SemanticAnalyzer, method setupStats.
private void setupStats(TableScanDesc tsDesc, QBParseInfo qbp, Table tab, String alias, RowResolver rwsch) throws SemanticException {
// if it is not an analyze command and not a column-stats rewrite, then do not gather stats
if (!qbp.isAnalyzeCommand() && qbp.getAnalyzeRewrite() == null) {
tsDesc.setGatherStats(false);
} else {
if (HiveConf.getVar(conf, HIVESTATSDBCLASS).equalsIgnoreCase(StatDB.fs.name())) {
String statsTmpLoc = ctx.getTempDirForInterimJobPath(tab.getPath()).toString();
LOG.debug("Set stats collection dir : " + statsTmpLoc);
tsDesc.setTmpStatsDir(statsTmpLoc);
}
tsDesc.setGatherStats(true);
tsDesc.setStatsReliable(conf.getBoolVar(HiveConf.ConfVars.HIVE_STATS_RELIABLE));
// append additional virtual columns for storing statistics
Iterator<VirtualColumn> vcs = VirtualColumn.getStatsRegistry(conf).iterator();
List<VirtualColumn> vcList = new ArrayList<VirtualColumn>();
while (vcs.hasNext()) {
VirtualColumn vc = vcs.next();
rwsch.put(alias, vc.getName(), new ColumnInfo(vc.getName(), vc.getTypeInfo(), alias, true, vc.getIsHidden()));
vcList.add(vc);
}
tsDesc.addVirtualCols(vcList);
String tblName = tab.getTableName();
// Theoretically the key prefix could be any unique string shared
// between TableScanOperator (when publishing) and StatsTask (when aggregating).
// Here we use
// db_name.table_name + partitionSpec
// as the prefix, for ease of reading during explain and debugging.
// Currently, partition spec can only be static partition.
String k = org.apache.hadoop.hive.metastore.utils.MetaStoreUtils.encodeTableName(tblName) + Path.SEPARATOR;
tsDesc.setStatsAggPrefix(tab.getDbName() + "." + k);
// set up WriteEntity for replication
outputs.add(new WriteEntity(tab, WriteEntity.WriteType.DDL_SHARED));
// add WriteEntity for each matching partition
if (tab.isPartitioned()) {
List<String> cols = new ArrayList<String>();
if (qbp.getAnalyzeRewrite() != null) {
List<FieldSchema> partitionCols = tab.getPartCols();
for (FieldSchema fs : partitionCols) {
cols.add(fs.getName());
}
tsDesc.setPartColumns(cols);
return;
}
TableSpec tblSpec = qbp.getTableSpec(alias);
Map<String, String> partSpec = tblSpec.getPartSpec();
if (partSpec != null) {
cols.addAll(partSpec.keySet());
tsDesc.setPartColumns(cols);
} else {
throw new SemanticException(ErrorMsg.NEED_PARTITION_SPECIFICATION.getMsg());
}
List<Partition> partitions = qbp.getTableSpec().partitions;
if (partitions != null) {
for (Partition partn : partitions) {
// inputs.add(new ReadEntity(partn)); // is this needed at all?
LOG.info("XXX: adding part: " + partn);
outputs.add(new WriteEntity(partn, WriteEntity.WriteType.DDL_NO_LOCK));
}
}
}
}
}
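For orientation, the stats aggregation prefix built in setupStats is just db_name.table_name followed by Path.SEPARATOR, with the (static) partition spec appended later. The standalone sketch below uses hypothetical names and plain Java string concatenation, assuming MetaStoreUtils.encodeTableName leaves an ordinary table name unchanged and that org.apache.hadoop.fs.Path.SEPARATOR is "/"; it illustrates the shape of the key, it is not Hive API code.
public class StatsPrefixSketch {
  public static void main(String[] args) {
    String dbName = "default";                     // hypothetical database name
    String tblName = "src_part";                   // hypothetical table name
    // db_name.table_name + Path.SEPARATOR, mirroring tsDesc.setStatsAggPrefix(...) above
    String prefix = dbName + "." + tblName + "/";
    // TableScanOperator publishes stats under prefix + static partition spec,
    // e.g. "default.src_part/ds=2008-04-08/", and StatsTask aggregates on the same key.
    System.out.println(prefix);                    // default.src_part/
  }
}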
Use of org.apache.hadoop.hive.ql.exec.ColumnInfo in project hive by apache.
The class SemanticAnalyzer, method genGroupByPlanGroupByOperator.
/**
* Generate the GroupByOperator for the Query Block (parseInfo.getXXX(dest)).
* The new GroupByOperator will be a child of the reduceSinkOperatorInfo.
*
* @param mode
* The mode of the aggregation (PARTIAL1 or COMPLETE)
* @param genericUDAFEvaluators
* If not null, this function will store the mapping from Aggregation
* StringTree to the genericUDAFEvaluator in this parameter, so it
* can be used in the next-stage GroupBy aggregations.
* @return the new GroupByOperator
*/
@SuppressWarnings("nls")
private Operator genGroupByPlanGroupByOperator(QBParseInfo parseInfo, String dest, Operator input, ReduceSinkOperator rs, GroupByDesc.Mode mode, Map<String, GenericUDAFEvaluator> genericUDAFEvaluators) throws SemanticException {
RowResolver groupByInputRowResolver = opParseCtx.get(input).getRowResolver();
RowResolver groupByOutputRowResolver = new RowResolver();
groupByOutputRowResolver.setIsExprResolver(true);
ArrayList<ExprNodeDesc> groupByKeys = new ArrayList<ExprNodeDesc>();
ArrayList<AggregationDesc> aggregations = new ArrayList<AggregationDesc>();
ArrayList<String> outputColumnNames = new ArrayList<String>();
Map<String, ExprNodeDesc> colExprMap = new HashMap<String, ExprNodeDesc>();
List<ASTNode> grpByExprs = getGroupByForClause(parseInfo, dest);
for (int i = 0; i < grpByExprs.size(); ++i) {
ASTNode grpbyExpr = grpByExprs.get(i);
ColumnInfo exprInfo = groupByInputRowResolver.getExpression(grpbyExpr);
if (exprInfo == null) {
throw new SemanticException(ErrorMsg.INVALID_COLUMN.getMsg(grpbyExpr));
}
groupByKeys.add(new ExprNodeColumnDesc(exprInfo.getType(), exprInfo.getInternalName(), "", false));
String field = getColumnInternalName(i);
outputColumnNames.add(field);
ColumnInfo oColInfo = new ColumnInfo(field, exprInfo.getType(), null, false);
groupByOutputRowResolver.putExpression(grpbyExpr, oColInfo);
addAlternateGByKeyMappings(grpbyExpr, oColInfo, input, groupByOutputRowResolver);
colExprMap.put(field, groupByKeys.get(groupByKeys.size() - 1));
}
// For each aggregation
HashMap<String, ASTNode> aggregationTrees = parseInfo.getAggregationExprsForClause(dest);
assert (aggregationTrees != null);
// get the last colName for the reduce KEY
// it represents the column name corresponding to distinct aggr, if any
String lastKeyColName = null;
List<String> inputKeyCols = rs.getConf().getOutputKeyColumnNames();
if (inputKeyCols.size() > 0) {
lastKeyColName = inputKeyCols.get(inputKeyCols.size() - 1);
}
List<ExprNodeDesc> reduceValues = rs.getConf().getValueCols();
int numDistinctUDFs = 0;
for (Map.Entry<String, ASTNode> entry : aggregationTrees.entrySet()) {
ASTNode value = entry.getValue();
// This is the GenericUDAF name
String aggName = unescapeIdentifier(value.getChild(0).getText());
boolean isDistinct = value.getType() == HiveParser.TOK_FUNCTIONDI;
boolean isAllColumns = value.getType() == HiveParser.TOK_FUNCTIONSTAR;
// Convert children to aggParameters
ArrayList<ExprNodeDesc> aggParameters = new ArrayList<ExprNodeDesc>();
// 0 is the function name
for (int i = 1; i < value.getChildCount(); i++) {
ASTNode paraExpr = (ASTNode) value.getChild(i);
ColumnInfo paraExprInfo = groupByInputRowResolver.getExpression(paraExpr);
if (paraExprInfo == null) {
throw new SemanticException(ErrorMsg.INVALID_COLUMN.getMsg(paraExpr));
}
String paraExpression = paraExprInfo.getInternalName();
assert (paraExpression != null);
if (isDistinct && lastKeyColName != null) {
// if aggr is distinct, the parameter name is constructed as
// KEY.lastKeyColName:<tag>._colx
paraExpression = Utilities.ReduceField.KEY.name() + "." + lastKeyColName + ":" + numDistinctUDFs + "." + getColumnInternalName(i - 1);
}
ExprNodeDesc expr = new ExprNodeColumnDesc(paraExprInfo.getType(), paraExpression, paraExprInfo.getTabAlias(), paraExprInfo.getIsVirtualCol());
ExprNodeDesc reduceValue = isConstantParameterInAggregationParameters(paraExprInfo.getInternalName(), reduceValues);
if (reduceValue != null) {
// this parameter is a constant
expr = reduceValue;
}
aggParameters.add(expr);
}
if (isDistinct) {
numDistinctUDFs++;
}
Mode amode = groupByDescModeToUDAFMode(mode, isDistinct);
GenericUDAFEvaluator genericUDAFEvaluator = getGenericUDAFEvaluator(aggName, aggParameters, value, isDistinct, isAllColumns);
assert (genericUDAFEvaluator != null);
GenericUDAFInfo udaf = getGenericUDAFInfo(genericUDAFEvaluator, amode, aggParameters);
aggregations.add(new AggregationDesc(aggName.toLowerCase(), udaf.genericUDAFEvaluator, udaf.convertedParameters, isDistinct, amode));
String field = getColumnInternalName(groupByKeys.size() + aggregations.size() - 1);
outputColumnNames.add(field);
groupByOutputRowResolver.putExpression(value, new ColumnInfo(field, udaf.returnType, "", false));
// Save the evaluator so that it can be used by the next-stage GroupByOperators
if (genericUDAFEvaluators != null) {
genericUDAFEvaluators.put(entry.getKey(), genericUDAFEvaluator);
}
}
float groupByMemoryUsage = HiveConf.getFloatVar(conf, HiveConf.ConfVars.HIVEMAPAGGRHASHMEMORY);
float memoryThreshold = HiveConf.getFloatVar(conf, HiveConf.ConfVars.HIVEMAPAGGRMEMORYTHRESHOLD);
Operator op = putOpInsertMap(OperatorFactory.getAndMakeChild(new GroupByDesc(mode, outputColumnNames, groupByKeys, aggregations, false, groupByMemoryUsage, memoryThreshold, null, false, -1, numDistinctUDFs > 0), new RowSchema(groupByOutputRowResolver.getColumnInfos()), input), groupByOutputRowResolver);
op.setColumnExprMap(colExprMap);
return op;
}
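To make the isDistinct branch above concrete, here is a hedged sketch (plain Java, hypothetical values) of the internal column reference built for the first parameter of the first distinct aggregate; it assumes getColumnInternalName(n) yields the usual "_coln" form and that the last reduce-sink output key column is "_col1".
public class DistinctParamNameSketch {
  public static void main(String[] args) {
    String lastKeyColName = "_col1";   // last output key column of the ReduceSinkOperator (assumed)
    int numDistinctUDFs = 0;           // tag of the current distinct aggregate
    int i = 1;                         // first child after the function name
    // Utilities.ReduceField.KEY.name() + "." + lastKeyColName + ":" + tag + "." + getColumnInternalName(i - 1)
    String paraExpression = "KEY" + "." + lastKeyColName + ":" + numDistinctUDFs + "." + "_col" + (i - 1);
    System.out.println(paraExpression);   // KEY._col1:0._col0
  }
}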
Use of org.apache.hadoop.hive.ql.exec.ColumnInfo in project hive by apache.
The class SemanticAnalyzer, method genGroupByPlanMapGroupByOperator.
/**
* Generate the map-side GroupByOperator for the Query Block
* (qb.getParseInfo().getXXX(dest)). The new GroupByOperator will be a child
* of the inputOperatorInfo.
*
* @param mode
* The mode of the aggregation (HASH)
* @param genericUDAFEvaluators
* If not null, this function will store the mapping from Aggregation
* StringTree to the genericUDAFEvaluator in this parameter, so it
* can be used in the next-stage GroupBy aggregations.
* @return the new GroupByOperator
*/
@SuppressWarnings("nls")
private Operator genGroupByPlanMapGroupByOperator(QB qb, String dest, List<ASTNode> grpByExprs, Operator inputOperatorInfo, GroupByDesc.Mode mode, Map<String, GenericUDAFEvaluator> genericUDAFEvaluators, List<Long> groupingSetKeys, boolean groupingSetsPresent) throws SemanticException {
RowResolver groupByInputRowResolver = opParseCtx.get(inputOperatorInfo).getRowResolver();
QBParseInfo parseInfo = qb.getParseInfo();
RowResolver groupByOutputRowResolver = new RowResolver();
groupByOutputRowResolver.setIsExprResolver(true);
ArrayList<ExprNodeDesc> groupByKeys = new ArrayList<ExprNodeDesc>();
ArrayList<String> outputColumnNames = new ArrayList<String>();
ArrayList<AggregationDesc> aggregations = new ArrayList<AggregationDesc>();
Map<String, ExprNodeDesc> colExprMap = new HashMap<String, ExprNodeDesc>();
for (int i = 0; i < grpByExprs.size(); ++i) {
ASTNode grpbyExpr = grpByExprs.get(i);
ExprNodeDesc grpByExprNode = genExprNodeDesc(grpbyExpr, groupByInputRowResolver);
if ((grpByExprNode instanceof ExprNodeColumnDesc) && ExprNodeDescUtils.indexOf(grpByExprNode, groupByKeys) >= 0) {
// Skip duplicated grouping keys; this happens when a column alias is defined.
grpByExprs.remove(i--);
continue;
}
groupByKeys.add(grpByExprNode);
String field = getColumnInternalName(i);
outputColumnNames.add(field);
groupByOutputRowResolver.putExpression(grpbyExpr, new ColumnInfo(field, grpByExprNode.getTypeInfo(), "", false));
colExprMap.put(field, groupByKeys.get(groupByKeys.size() - 1));
}
// The grouping set key is present after the grouping keys, before the distinct keys
int groupingSetsPosition = -1;
// Create an additional grouping key for the grouping set (corresponding to the rollup).
if (groupingSetsPresent) {
groupingSetsPosition = groupByKeys.size();
createNewGroupingKey(groupByKeys, outputColumnNames, groupByOutputRowResolver, colExprMap);
}
// If there are distinct function expressions, add all of their parameters to the group-by keys.
if (!parseInfo.getDistinctFuncExprsForClause(dest).isEmpty()) {
List<ASTNode> list = parseInfo.getDistinctFuncExprsForClause(dest);
for (ASTNode value : list) {
// 0 is function name
for (int i = 1; i < value.getChildCount(); i++) {
ASTNode parameter = (ASTNode) value.getChild(i);
if (groupByOutputRowResolver.getExpression(parameter) == null) {
ExprNodeDesc distExprNode = genExprNodeDesc(parameter, groupByInputRowResolver);
groupByKeys.add(distExprNode);
String field = getColumnInternalName(groupByKeys.size() - 1);
outputColumnNames.add(field);
groupByOutputRowResolver.putExpression(parameter, new ColumnInfo(field, distExprNode.getTypeInfo(), "", false));
colExprMap.put(field, groupByKeys.get(groupByKeys.size() - 1));
}
}
}
}
// For each aggregation
HashMap<String, ASTNode> aggregationTrees = parseInfo.getAggregationExprsForClause(dest);
assert (aggregationTrees != null);
boolean containsDistinctAggr = false;
for (Map.Entry<String, ASTNode> entry : aggregationTrees.entrySet()) {
ASTNode value = entry.getValue();
String aggName = unescapeIdentifier(value.getChild(0).getText());
ArrayList<ExprNodeDesc> aggParameters = new ArrayList<ExprNodeDesc>();
// 0 is the function name
for (int i = 1; i < value.getChildCount(); i++) {
ASTNode paraExpr = (ASTNode) value.getChild(i);
ExprNodeDesc paraExprNode = genExprNodeDesc(paraExpr, groupByInputRowResolver);
aggParameters.add(paraExprNode);
}
boolean isDistinct = value.getType() == HiveParser.TOK_FUNCTIONDI;
containsDistinctAggr = containsDistinctAggr || isDistinct;
boolean isAllColumns = value.getType() == HiveParser.TOK_FUNCTIONSTAR;
Mode amode = groupByDescModeToUDAFMode(mode, isDistinct);
GenericUDAFEvaluator genericUDAFEvaluator = getGenericUDAFEvaluator(aggName, aggParameters, value, isDistinct, isAllColumns);
assert (genericUDAFEvaluator != null);
GenericUDAFInfo udaf = getGenericUDAFInfo(genericUDAFEvaluator, amode, aggParameters);
aggregations.add(new AggregationDesc(aggName.toLowerCase(), udaf.genericUDAFEvaluator, udaf.convertedParameters, isDistinct, amode));
String field = getColumnInternalName(groupByKeys.size() + aggregations.size() - 1);
outputColumnNames.add(field);
if (groupByOutputRowResolver.getExpression(value) == null) {
groupByOutputRowResolver.putExpression(value, new ColumnInfo(field, udaf.returnType, "", false));
}
// Save the evaluator so that it can be used by the next-stage GroupByOperators
if (genericUDAFEvaluators != null) {
genericUDAFEvaluators.put(entry.getKey(), genericUDAFEvaluator);
}
}
float groupByMemoryUsage = HiveConf.getFloatVar(conf, HiveConf.ConfVars.HIVEMAPAGGRHASHMEMORY);
float memoryThreshold = HiveConf.getFloatVar(conf, HiveConf.ConfVars.HIVEMAPAGGRMEMORYTHRESHOLD);
Operator op = putOpInsertMap(OperatorFactory.getAndMakeChild(new GroupByDesc(mode, outputColumnNames, groupByKeys, aggregations, false, groupByMemoryUsage, memoryThreshold, groupingSetKeys, groupingSetsPresent, groupingSetsPosition, containsDistinctAggr), new RowSchema(groupByOutputRowResolver.getColumnInfos()), inputOperatorInfo), groupByOutputRowResolver);
op.setColumnExprMap(colExprMap);
return op;
}
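For readers tracing the column layout produced by the map-side group-by above, the following sketch (hypothetical names, assuming getColumnInternalName(n) produces "_coln") shows the order in which output columns are appended for a hypothetical plan with one grouping key, a grouping-set id, one distinct parameter, and one partial aggregation, and why groupingSetsPosition is captured before the distinct loop runs.
import java.util.ArrayList;
import java.util.List;

public class GroupingSetLayoutSketch {
  public static void main(String[] args) {
    List<String> outputColumnNames = new ArrayList<>();
    outputColumnNames.add("_col0");                          // grouping key
    int groupingSetsPosition = outputColumnNames.size();     // == 1, recorded before anything else is added
    outputColumnNames.add("_col1");                          // grouping-set id (createNewGroupingKey)
    outputColumnNames.add("_col2");                          // distinct parameter
    outputColumnNames.add("_col3");                          // partial aggregation result
    // The grouping-set key sits after the grouping keys and before the distinct keys.
    System.out.println(outputColumnNames + " groupingSetsPosition=" + groupingSetsPosition);
  }
}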
Use of org.apache.hadoop.hive.ql.exec.ColumnInfo in project hive by apache.
The class SemanticAnalyzer, method getNotNullConstraintExpr.
private ExprNodeDesc getNotNullConstraintExpr(Table targetTable, Operator input, String dest) throws SemanticException {
boolean forceNotNullConstraint = conf.getBoolVar(ConfVars.HIVE_ENFORCE_NOT_NULL_CONSTRAINT);
if (!forceNotNullConstraint) {
return null;
}
ImmutableBitSet nullConstraintBitSet = null;
try {
nullConstraintBitSet = getEnabledNotNullConstraints(targetTable);
} catch (Exception e) {
if (e instanceof SemanticException) {
throw (SemanticException) e;
} else {
throw (new RuntimeException(e));
}
}
if (nullConstraintBitSet == null) {
return null;
}
List<ColumnInfo> colInfos = input.getSchema().getSignature();
ExprNodeDesc currUDF = null;
// Add NOT NULL constraints
int constraintIdx = 0;
for (int colExprIdx = 0; colExprIdx < colInfos.size(); colExprIdx++) {
if (updating(dest) && colExprIdx == 0) {
// for updates, the first column is the ROW__ID virtual column
continue;
}
if (nullConstraintBitSet.indexOf(constraintIdx) != -1) {
ExprNodeDesc currExpr = TypeCheckProcFactory.toExprNodeDesc(colInfos.get(colExprIdx));
ExprNodeDesc isNotNullUDF = TypeCheckProcFactory.DefaultExprProcessor.getFuncExprNodeDesc("isnotnull", currExpr);
if (currUDF != null) {
currUDF = TypeCheckProcFactory.DefaultExprProcessor.getFuncExprNodeDesc("and", currUDF, isNotNullUDF);
} else {
currUDF = isNotNullUDF;
}
}
constraintIdx++;
}
return currUDF;
}
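The loop in getNotNullConstraintExpr folds one isnotnull(col) predicate per constrained column into a single AND expression. Below is a hedged, self-contained sketch of that folding pattern, with plain Strings standing in for ExprNodeDesc and a BitSet standing in for the ImmutableBitSet of enabled constraints (hypothetical column names, not the Hive API).
import java.util.Arrays;
import java.util.BitSet;
import java.util.List;

public class NotNullFilterSketch {
  public static void main(String[] args) {
    List<String> cols = Arrays.asList("id", "name", "ts");
    BitSet notNull = new BitSet();
    notNull.set(0);                          // id has an enabled NOT NULL constraint
    notNull.set(2);                          // ts has an enabled NOT NULL constraint
    String currUDF = null;
    for (int i = 0; i < cols.size(); i++) {
      if (!notNull.get(i)) {
        continue;                            // unconstrained column, no predicate
      }
      String isNotNull = "isnotnull(" + cols.get(i) + ")";
      // same fold as getFuncExprNodeDesc("and", currUDF, isNotNullUDF) above
      currUDF = (currUDF == null) ? isNotNull : "and(" + currUDF + ", " + isNotNull + ")";
    }
    System.out.println(currUDF);             // and(isnotnull(id), isnotnull(ts))
  }
}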
Use of org.apache.hadoop.hive.ql.exec.ColumnInfo in project hive by apache.
The class SemanticAnalyzer, method genGroupByPlanReduceSinkOperator.
/**
* Generate the ReduceSinkOperator for the Group By Query Block
* (qb.getParseInfo().getXXX(dest)). The new ReduceSinkOperator will be a child
* of inputOperatorInfo.
*
* It will put all Group By keys and the distinct field (if any) in the
* map-reduce sort key, and all other fields in the map-reduce value.
*
* @param numPartitionFields
* the number of fields for map-reduce partitioning. This is usually
* the number of fields in the Group By keys.
* @return the new ReduceSinkOperator.
* @throws SemanticException
*/
@SuppressWarnings("nls")
private ReduceSinkOperator genGroupByPlanReduceSinkOperator(QB qb, String dest, Operator inputOperatorInfo, List<ASTNode> grpByExprs, int numPartitionFields, boolean changeNumPartitionFields, int numReducers, boolean mapAggrDone, boolean groupingSetsPresent) throws SemanticException {
RowResolver reduceSinkInputRowResolver = opParseCtx.get(inputOperatorInfo).getRowResolver();
QBParseInfo parseInfo = qb.getParseInfo();
RowResolver reduceSinkOutputRowResolver = new RowResolver();
reduceSinkOutputRowResolver.setIsExprResolver(true);
Map<String, ExprNodeDesc> colExprMap = new HashMap<String, ExprNodeDesc>();
// Pre-compute group-by keys and store in reduceKeys
List<String> outputKeyColumnNames = new ArrayList<String>();
List<String> outputValueColumnNames = new ArrayList<String>();
ArrayList<ExprNodeDesc> reduceKeys = getReduceKeysForReduceSink(grpByExprs, dest, reduceSinkInputRowResolver, reduceSinkOutputRowResolver, outputKeyColumnNames, colExprMap);
int keyLength = reduceKeys.size();
int numOfColsRmedFromkey = grpByExprs.size() - keyLength;
// add a key for reduce sink
if (groupingSetsPresent) {
// Process grouping set for the reduce sink operator
processGroupingSetReduceSinkOperator(reduceSinkInputRowResolver, reduceSinkOutputRowResolver, reduceKeys, outputKeyColumnNames, colExprMap);
if (changeNumPartitionFields) {
numPartitionFields++;
}
}
List<List<Integer>> distinctColIndices = getDistinctColIndicesForReduceSink(parseInfo, dest, reduceKeys, reduceSinkInputRowResolver, reduceSinkOutputRowResolver, outputKeyColumnNames, colExprMap);
ArrayList<ExprNodeDesc> reduceValues = new ArrayList<ExprNodeDesc>();
HashMap<String, ASTNode> aggregationTrees = parseInfo.getAggregationExprsForClause(dest);
if (!mapAggrDone) {
getReduceValuesForReduceSinkNoMapAgg(parseInfo, dest, reduceSinkInputRowResolver, reduceSinkOutputRowResolver, outputValueColumnNames, reduceValues, colExprMap);
} else {
// Put partial aggregation results in reduceValues
int inputField = reduceKeys.size() + numOfColsRmedFromkey;
for (Map.Entry<String, ASTNode> entry : aggregationTrees.entrySet()) {
TypeInfo type = reduceSinkInputRowResolver.getColumnInfos().get(inputField).getType();
ExprNodeColumnDesc exprDesc = new ExprNodeColumnDesc(type, getColumnInternalName(inputField), "", false);
reduceValues.add(exprDesc);
inputField++;
String outputColName = getColumnInternalName(reduceValues.size() - 1);
outputValueColumnNames.add(outputColName);
String internalName = Utilities.ReduceField.VALUE.toString() + "." + outputColName;
reduceSinkOutputRowResolver.putExpression(entry.getValue(), new ColumnInfo(internalName, type, null, false));
colExprMap.put(internalName, exprDesc);
}
}
ReduceSinkOperator rsOp = (ReduceSinkOperator) putOpInsertMap(OperatorFactory.getAndMakeChild(PlanUtils.getReduceSinkDesc(reduceKeys, groupingSetsPresent ? keyLength + 1 : keyLength, reduceValues, distinctColIndices, outputKeyColumnNames, outputValueColumnNames, true, -1, numPartitionFields, numReducers, AcidUtils.Operation.NOT_ACID), new RowSchema(reduceSinkOutputRowResolver.getColumnInfos()), inputOperatorInfo), reduceSinkOutputRowResolver);
rsOp.setColumnExprMap(colExprMap);
return rsOp;
}
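Finally, the partial-aggregation branch above addresses each reduce-sink value column as VALUE._colN downstream. A minimal sketch of that naming follows, assuming Utilities.ReduceField.VALUE stringifies to "VALUE" and getColumnInternalName(n) yields "_coln" (hypothetical index values).
public class ReduceSinkValueNameSketch {
  public static void main(String[] args) {
    int valuesSoFar = 0;                                   // first partial aggregation result
    String outputColName = "_col" + valuesSoFar;           // getColumnInternalName(reduceValues.size() - 1)
    String internalName = "VALUE" + "." + outputColName;   // ReduceField.VALUE + "." + outputColName
    System.out.println(internalName);                      // VALUE._col0
  }
}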