use of org.apache.hadoop.hive.ql.plan.ColStatistics in project hive by apache.
the class StatsUtils method getColStatisticsFromExpression.
/**
 * Get column statistics from an expression node.
 *
 * <p>A column reference returns a clone of the matching parent statistics when the parent has
 * them. Constants, function calls, column lists and field accesses (and partition/virtual
 * columns without parent statistics) get a freshly built statistics object.
 *
 * @param conf
 *          - hive conf
 * @param parentStats
 *          - parent statistics
 * @param end
 *          - expression node
 * @return column statistics; null when {@code end} is null, when a column that is neither a
 *         partition nor a virtual column has no statistics in the parent, or when cloning
 *         existing statistics fails
 * @throws IllegalArgumentException
 *           if the expression type is not one of the handled kinds
 */
public static ColStatistics getColStatisticsFromExpression(HiveConf conf, Statistics parentStats, ExprNodeDesc end) {
  if (end == null) {
    return null;
  }

  final ObjectInspector inspector = end.getWritableObjectInspector();
  final long rowCount = parentStats.getNumRows();

  String name;
  String type;
  long distinctCount = 0;
  long nullCount = 0;

  if (end instanceof ExprNodeColumnDesc) {
    // column projection
    ExprNodeColumnDesc columnDesc = (ExprNodeColumnDesc) end;
    name = columnDesc.getColumn();
    boolean partitionOrVirtual = columnDesc.getIsPartitionColOrVirtualCol();

    ColStatistics existing = parentStats.getColumnStatisticsFromColName(name);
    if (existing != null) {
      // statistics for the column already exist: hand back a copy of them
      try {
        return existing.clone();
      } catch (CloneNotSupportedException e) {
        return null;
      }
    }
    if (!partitionOrVirtual) {
      // regular column without parent statistics: nothing to report
      return null;
    }
    // virtual columns
    type = columnDesc.getTypeInfo().getTypeName();
    distinctCount = rowCount;
  } else if (end instanceof ExprNodeConstantDesc) {
    // constant projection
    ExprNodeConstantDesc constantDesc = (ExprNodeConstantDesc) end;
    name = constantDesc.getName();
    type = constantDesc.getTypeString();
    if (constantDesc.getValue() != null) {
      distinctCount = 1;
    } else {
      // null projection
      nullCount = rowCount;
    }
  } else if (end instanceof ExprNodeGenericFuncDesc) {
    ExprNodeGenericFuncDesc funcDesc = (ExprNodeGenericFuncDesc) end;
    name = funcDesc.getName();
    type = funcDesc.getTypeString();
    // If it is a widening cast, we do not change NDV, min, max
    if (isWideningCast(funcDesc) && funcDesc.getChildren().get(0) instanceof ExprNodeColumnDesc) {
      // cast on single column
      ColStatistics sourceStats = parentStats.getColumnStatisticsFromColName(funcDesc.getCols().get(0));
      if (sourceStats != null) {
        ColStatistics castStats;
        try {
          castStats = sourceStats.clone();
        } catch (CloneNotSupportedException e) {
          LOG.warn("error cloning stats, this should not happen");
          return null;
        }
        castStats.setColumnName(name);
        String loweredCastType = type.toLowerCase();
        castStats.setColumnType(loweredCastType);
        castStats.setAvgColLen(getAvgColLenOf(conf, inspector, loweredCastType));
        return castStats;
      }
    }
    // fallback to default
    distinctCount = getNDVFor(funcDesc, rowCount, parentStats);
  } else if (end instanceof ExprNodeColumnListDesc) {
    // column list
    ExprNodeColumnListDesc listDesc = (ExprNodeColumnListDesc) end;
    name = Joiner.on(",").join(listDesc.getCols());
    type = serdeConstants.LIST_TYPE_NAME;
    distinctCount = rowCount;
  } else if (end instanceof ExprNodeFieldDesc) {
    // field within complex type
    ExprNodeFieldDesc fieldDesc = (ExprNodeFieldDesc) end;
    name = fieldDesc.getFieldName();
    type = fieldDesc.getTypeString();
    distinctCount = rowCount;
  } else {
    throw new IllegalArgumentException("not supported expr type " + end.getClass());
  }

  // build fresh statistics from the values gathered above
  String loweredType = type.toLowerCase();
  double averageLength = getAvgColLenOf(conf, inspector, loweredType);
  ColStatistics estimated = new ColStatistics(name, loweredType);
  estimated.setAvgColLen(averageLength);
  estimated.setCountDistint(distinctCount);
  estimated.setNumNulls(nullCount);
  return estimated;
}
use of org.apache.hadoop.hive.ql.plan.ColStatistics in project hive by apache.
the class StatsUtils method getDataSizeFromColumnStats.
/**
 * Compute raw data size from column statistics.
 *
 * <p>For every non-null entry of {@code colStats}, an estimated value count is multiplied by a
 * per-value size picked from the column type, and the products are accumulated with
 * {@code safeAdd}/{@code safeMult}.
 *
 * @param numRows
 *          - number of rows
 * @param colStats
 *          - column statistics; null entries are skipped
 * @return raw data size; 0 when {@code numRows} is not positive or {@code colStats} is null
 */
public static long getDataSizeFromColumnStats(long numRows, List<ColStatistics> colStats) {
  long result = 0;
  if (numRows <= 0 || colStats == null) {
    return result;
  }
  if (colStats.isEmpty()) {
    // in such a case we estimate empty row to be of size of empty java object.
    // Routed through safeMult, like the per-column sizes below, instead of a raw long
    // multiplication that would silently wrap around for very large row counts.
    return safeMult(numRows, JavaDataModel.JAVA64_REF);
  }
  for (ColStatistics cs : colStats) {
    if (cs == null) {
      continue;
    }
    // A statistics entry can exist without a type; treat it like an unrecognised type (it
    // falls through to the average-length default) instead of failing with an NPE.
    String rawType = cs.getColumnType();
    String colTypeLowerCase = rawType == null ? "" : rawType.toLowerCase();
    // NOTE(review): when nulls are present one extra value is counted on top of the non-null
    // rows, presumably standing in for the null values themselves; confirm intent.
    long nonNullCount = cs.getNumNulls() > 0 ? numRows - cs.getNumNulls() + 1 : numRows;
    double sizeOf;
    if (colTypeLowerCase.equals(serdeConstants.TINYINT_TYPE_NAME)
        || colTypeLowerCase.equals(serdeConstants.SMALLINT_TYPE_NAME)
        || colTypeLowerCase.equals(serdeConstants.INT_TYPE_NAME)
        || colTypeLowerCase.equals(serdeConstants.BIGINT_TYPE_NAME)
        || colTypeLowerCase.equals(serdeConstants.BOOLEAN_TYPE_NAME)
        || colTypeLowerCase.equals(serdeConstants.FLOAT_TYPE_NAME)
        || colTypeLowerCase.equals(serdeConstants.DOUBLE_TYPE_NAME)) {
      sizeOf = cs.getAvgColLen();
    } else if (colTypeLowerCase.equals(serdeConstants.STRING_TYPE_NAME)
        || colTypeLowerCase.startsWith(serdeConstants.VARCHAR_TYPE_NAME)
        || colTypeLowerCase.startsWith(serdeConstants.CHAR_TYPE_NAME)) {
      // varchar/char are matched by prefix rather than by equality
      int acl = (int) Math.round(cs.getAvgColLen());
      sizeOf = JavaDataModel.get().lengthForStringOfLength(acl);
    } else if (colTypeLowerCase.equals(serdeConstants.BINARY_TYPE_NAME)) {
      int acl = (int) Math.round(cs.getAvgColLen());
      sizeOf = JavaDataModel.get().lengthForByteArrayOfSize(acl);
    } else if (colTypeLowerCase.equals(serdeConstants.TIMESTAMP_TYPE_NAME)) {
      sizeOf = JavaDataModel.get().lengthOfTimestamp();
    } else if (colTypeLowerCase.startsWith(serdeConstants.DECIMAL_TYPE_NAME)) {
      sizeOf = JavaDataModel.get().lengthOfDecimal();
    } else if (colTypeLowerCase.equals(serdeConstants.DATE_TYPE_NAME)) {
      sizeOf = JavaDataModel.get().lengthOfDate();
    } else {
      // any other type: fall back to the recorded average column length
      sizeOf = cs.getAvgColLen();
    }
    result = safeAdd(result, safeMult(nonNullCount, sizeOf));
  }
  return result;
}
use of org.apache.hadoop.hive.ql.plan.ColStatistics in project hive by apache.
the class StatsUtils method getColStatisticsFromExprMap.
/**
 * Get column statistics from parent statistics.
 *
 * <p>When both the expression map and the row schema are available, statistics are derived per
 * output column via {@code getColStatisticsFromExpression} and renamed to the output column
 * name. When either is missing, the parent's column statistics are passed through as they are.
 *
 * @param conf
 *          - hive conf
 * @param parentStats
 *          - parent statistics
 * @param colExprMap
 *          - column expression map
 * @param rowSchema
 *          - row schema
 * @return column statistics (never null, possibly empty)
 */
public static List<ColStatistics> getColStatisticsFromExprMap(HiveConf conf, Statistics parentStats, Map<String, ExprNodeDesc> colExprMap, RowSchema rowSchema) {
  List<ColStatistics> collected = Lists.newArrayList();

  if (colExprMap == null || rowSchema == null) {
    // expression map or row schema is missing: just pass on the parent column stats
    List<ColStatistics> inherited = parentStats.getColumnStats();
    if (inherited != null) {
      collected.addAll(inherited);
    }
    return collected;
  }

  // columns listed in the row schema, in schema order
  for (ColumnInfo columnInfo : rowSchema.getSignature()) {
    String outputName = columnInfo.getInternalName();
    ColStatistics derived = getColStatisticsFromExpression(conf, parentStats, colExprMap.get(outputName));
    if (derived == null) {
      continue;
    }
    derived.setColumnName(outputName);
    collected.add(derived);
  }

  // sometimes RowSchema is empty, so fetch stats of columns in exprMap
  for (Entry<String, ExprNodeDesc> mapping : colExprMap.entrySet()) {
    String outputName = mapping.getKey();
    if (rowSchema.getColumnInfo(outputName) != null) {
      // already handled by the schema loop above
      continue;
    }
    ColStatistics derived = getColStatisticsFromExpression(conf, parentStats, mapping.getValue());
    if (derived == null) {
      continue;
    }
    derived.setColumnName(outputName);
    collected.add(derived);
  }
  return collected;
}
use of org.apache.hadoop.hive.ql.plan.ColStatistics in project hive by apache.
the class StatsUtils method getNDVFor.
/**
 * Estimates the number of distinct values produced by a function expression.
 *
 * <p>The estimate never exceeds {@code numRows}. A non-deterministic function is assumed to
 * yield {@code numRows} distinct values. A function class carrying an {@code NDV} annotation is
 * capped by the annotation's {@code maxNdv()}. Otherwise the distinct counts of the referenced
 * columns that have parent statistics are combined with {@code addWithExpDecay}.
 *
 * @param engfd
 *          - function expression
 * @param numRows
 *          - number of rows
 * @param parentStats
 *          - parent statistics
 * @return estimated number of distinct values
 */
private static long getNDVFor(ExprNodeGenericFuncDesc engfd, long numRows, Statistics parentStats) {
  GenericUDF function = engfd.getGenericUDF();
  if (!FunctionRegistry.isDeterministic(function)) {
    return numRows;
  }

  // a bridged UDF carries the annotation on the wrapped class, not on the bridge itself
  Class<?> implClass;
  if (function instanceof GenericUDFBridge) {
    implClass = ((GenericUDFBridge) function).getUdfClass();
  } else {
    implClass = function.getClass();
  }

  NDV annotation = AnnotationUtils.getAnnotation(implClass, NDV.class);
  if (annotation != null) {
    // annotated function: column statistics are not consulted, only the declared cap applies
    return Math.min(numRows, annotation.maxNdv());
  }

  List<Long> inputNdvs = Lists.newArrayList();
  for (String columnName : engfd.getCols()) {
    ColStatistics columnStats = parentStats.getColumnStatisticsFromColName(columnName);
    if (columnStats != null) {
      inputNdvs.add(columnStats.getCountDistint());
    }
  }

  long combined = inputNdvs.isEmpty() ? numRows : addWithExpDecay(inputNdvs);
  return Math.min(combined, numRows);
}
use of org.apache.hadoop.hive.ql.plan.ColStatistics in project hive by apache.
the class RelOptHiveTable method updateColStats.
/**
* Obtains column statistics for the requested column indexes that have no entry in
* hiveColStatsMap yet and stores the results there.
*
* Non-partition columns are resolved through StatsUtils: getTableColumnStats for an
* unpartitioned table, collectStatistics over partitionList otherwise (which also updates
* rowCount). Partition columns get a new ColStatistics whose distinct count comes from
* getDistinctCount over the partitions of partitionList.
*
* @param projIndxLst
* - indexes of the columns whose statistics are required
* @param allowNullColumnForMissingStats
* - when true, missing statistics are logged as a warning (and printed to the console if
* HIVE_CBO_SHOW_WARNINGS is set); when false, a RuntimeException is thrown instead
* @throws RuntimeException
* if an index matches neither a non-partition nor a partition column, if collecting
* statistics fails with a HiveException, or if statistics are missing and
* allowNullColumnForMissingStats is false
*/
private void updateColStats(Set<Integer> projIndxLst, boolean allowNullColumnForMissingStats) {
List<String> nonPartColNamesThatRqrStats = new ArrayList<String>();
List<Integer> nonPartColIndxsThatRqrStats = new ArrayList<Integer>();
List<String> partColNamesThatRqrStats = new ArrayList<String>();
List<Integer> partColIndxsThatRqrStats = new ArrayList<Integer>();
// names of columns for which statistics could not be obtained
Set<String> colNamesFailedStats = new HashSet<String>();
// 1. Separate required columns to Non Partition and Partition Cols
ColumnInfo tmp;
for (Integer pi : projIndxLst) {
// only columns without an entry in hiveColStatsMap need to be looked up
if (hiveColStatsMap.get(pi) == null) {
if ((tmp = hiveNonPartitionColsMap.get(pi)) != null) {
nonPartColNamesThatRqrStats.add(tmp.getInternalName());
nonPartColIndxsThatRqrStats.add(pi);
} else if ((tmp = hivePartitionColsMap.get(pi)) != null) {
partColNamesThatRqrStats.add(tmp.getInternalName());
partColIndxsThatRqrStats.add(pi);
} else {
// the index is known neither as a non-partition nor as a partition column
noColsMissingStats.getAndIncrement();
String logMsg = "Unable to find Column Index: " + pi + ", in " + hiveTblMetadata.getCompleteName();
LOG.error(logMsg);
throw new RuntimeException(logMsg);
}
}
}
if (null == partitionList) {
// We could be here either because its an unpartitioned table or because
// there are no pruning predicates on a partitioned table.
computePartitionList(hiveConf, null, new HashSet<Integer>());
}
// 2. Obtain Col Stats for Non Partition Cols
if (nonPartColNamesThatRqrStats.size() > 0) {
List<ColStatistics> hiveColStats;
if (!hiveTblMetadata.isPartitioned()) {
// 2.1 Handle the case for unpartitioned table.
hiveColStats = StatsUtils.getTableColumnStats(hiveTblMetadata, hiveNonPartitionCols, nonPartColNamesThatRqrStats);
// 2.1.1 Record Column Names that we needed stats for but couldn't obtain
if (hiveColStats == null) {
colNamesFailedStats.addAll(nonPartColNamesThatRqrStats);
} else if (hiveColStats.size() != nonPartColNamesThatRqrStats.size()) {
// failed columns = requested names minus the names that came back with stats
Set<String> setOfFiledCols = new HashSet<String>(nonPartColNamesThatRqrStats);
Set<String> setOfObtainedColStats = new HashSet<String>();
for (ColStatistics cs : hiveColStats) {
setOfObtainedColStats.add(cs.getColumnName());
}
setOfFiledCols.removeAll(setOfObtainedColStats);
colNamesFailedStats.addAll(setOfFiledCols);
} else {
// Column stats in hiveColStats might not be in the same order as the columns in
// nonPartColNamesThatRqrStats. reorder hiveColStats so we can build hiveColStatsMap
// using nonPartColIndxsThatRqrStats as below
Map<String, ColStatistics> columnStatsMap = new HashMap<String, ColStatistics>(hiveColStats.size());
for (ColStatistics cs : hiveColStats) {
columnStatsMap.put(cs.getColumnName(), cs);
}
hiveColStats.clear();
for (String colName : nonPartColNamesThatRqrStats) {
hiveColStats.add(columnStatsMap.get(colName));
}
}
} else {
// 2.2 Obtain col stats for partitioned table.
try {
if (partitionList.getNotDeniedPartns().isEmpty()) {
// no need to make a metastore call
rowCount = 0;
hiveColStats = new ArrayList<ColStatistics>();
for (String c : nonPartColNamesThatRqrStats) {
// add empty stats object for each column
hiveColStats.add(new ColStatistics(c, null));
}
colNamesFailedStats.clear();
} else {
Statistics stats = StatsUtils.collectStatistics(hiveConf, partitionList, hiveTblMetadata, hiveNonPartitionCols, nonPartColNamesThatRqrStats, nonPartColNamesThatRqrStats, true, true);
rowCount = stats.getNumRows();
hiveColStats = new ArrayList<ColStatistics>();
for (String c : nonPartColNamesThatRqrStats) {
ColStatistics cs = stats.getColumnStatisticsFromColName(c);
if (cs != null) {
hiveColStats.add(cs);
} else {
colNamesFailedStats.add(c);
}
}
}
} catch (HiveException e) {
String logMsg = "Collecting stats failed.";
LOG.error(logMsg, e);
throw new RuntimeException(logMsg, e);
}
}
// cache only when every requested column has a stats entry, so that positions line up
if (hiveColStats != null && hiveColStats.size() == nonPartColNamesThatRqrStats.size()) {
for (int i = 0; i < hiveColStats.size(); i++) {
// the columns in nonPartColIndxsThatRqrStats/nonPartColNamesThatRqrStats/hiveColStats
// are in same order
hiveColStatsMap.put(nonPartColIndxsThatRqrStats.get(i), hiveColStats.get(i));
}
}
}
// 3. Obtain Stats for Partition Cols
// (skipped when stats for any non-partition column could not be obtained)
if (colNamesFailedStats.isEmpty() && !partColNamesThatRqrStats.isEmpty()) {
ColStatistics cStats = null;
for (int i = 0; i < partColNamesThatRqrStats.size(); i++) {
cStats = new ColStatistics(partColNamesThatRqrStats.get(i), hivePartitionColsMap.get(partColIndxsThatRqrStats.get(i)).getTypeName());
cStats.setCountDistint(getDistinctCount(partitionList.getPartitions(), partColNamesThatRqrStats.get(i)));
hiveColStatsMap.put(partColIndxsThatRqrStats.get(i), cStats);
}
}
// 4. Warn user if we couldn't get stats for required columns
if (!colNamesFailedStats.isEmpty()) {
String logMsg = "No Stats for " + hiveTblMetadata.getCompleteName() + ", Columns: " + getColNamesForLogging(colNamesFailedStats);
noColsMissingStats.getAndAdd(colNamesFailedStats.size());
if (allowNullColumnForMissingStats) {
LOG.warn(logMsg);
HiveConf conf = SessionState.getSessionConf();
if (HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_CBO_SHOW_WARNINGS)) {
LogHelper console = SessionState.getConsole();
console.printInfoNoLog(logMsg);
}
} else {
LOG.error(logMsg);
throw new RuntimeException(logMsg);
}
}
}
Aggregations