Use of org.apache.hadoop.hive.ql.stats.estimator.StatEstimatorProvider in project hive by apache.
The method getColStatisticsFromExpression of the class StatsUtils.
/**
 * Get column statistics from an expression node.
 *
 * <p>How the statistics are obtained depends on the concrete expression type:
 * <ul>
 * <li>column reference: the matching column statistics of {@code parentStats} are cloned; a
 * partition/virtual column without statistics gets a synthetic entry whose NDV is the parent
 * row count;</li>
 * <li>constant: delegated to {@code buildColStatForConstant};</li>
 * <li>generic function: a widening cast over a single column reuses that column's statistics,
 * otherwise a {@link StatEstimator} is consulted (when enabled and available) and finally
 * {@code getNDVFor} is used as a fallback;</li>
 * <li>column list and field access: a synthetic entry whose NDV is the parent row count.</li>
 * </ul>
 *
 * @param conf
 * - hive conf
 * @param parentStats
 * - parent statistics; dereferenced whenever {@code end} is non-null
 * @param end
 * - expression node, may be null
 * @return column statistics, or null when {@code end} is null, when {@code end} is a regular
 * (non partition/virtual) column without statistics in {@code parentStats}, or when
 * {@code end} is a dynamic parameter
 * @throws IllegalArgumentException
 * - if the expression type is not one of the supported kinds
 */
public static ColStatistics getColStatisticsFromExpression(HiveConf conf, Statistics parentStats, ExprNodeDesc end) {
  if (end == null) {
    return null;
  }
  String colName;
  String colType;
  long countDistincts;
  // Both values are fetched up front, before the type dispatch, exactly as callers have come to expect.
  ObjectInspector oi = end.getWritableObjectInspector();
  long numRows = parentStats.getNumRows();
  if (end instanceof ExprNodeColumnDesc) {
    // column projection
    ExprNodeColumnDesc encd = (ExprNodeColumnDesc) end;
    colName = encd.getColumn();
    ColStatistics existing = parentStats.getColumnStatisticsFromColName(colName);
    if (!encd.getIsPartitionColOrVirtualCol()) {
      // regular column: clone the column stats if present, otherwise there is nothing to report
      return existing == null ? null : existing.clone();
    }
    if (existing != null) {
      /* If statistics for the column already exist use it. */
      return existing.clone();
    }
    // virtual columns: no statistics available, assume every row is distinct
    colType = encd.getTypeInfo().getTypeName();
    countDistincts = numRows;
  } else if (end instanceof ExprNodeConstantDesc) {
    return buildColStatForConstant(conf, numRows, (ExprNodeConstantDesc) end);
  } else if (end instanceof ExprNodeGenericFuncDesc) {
    ExprNodeGenericFuncDesc engfd = (ExprNodeGenericFuncDesc) end;
    colName = engfd.getName();
    colType = engfd.getTypeString();
    // If it is a widening cast, we do not change NDV, min, max
    if (isWideningCast(engfd) && engfd.getChildren().get(0) instanceof ExprNodeColumnDesc) {
      // cast on single column
      ColStatistics stats = parentStats.getColumnStatisticsFromColName(engfd.getCols().get(0));
      if (stats != null) {
        ColStatistics newStats = stats.clone();
        newStats.setColumnName(colName);
        // Locale.ROOT: type names must not be lower-cased with locale specific rules
        // (e.g. the Turkish dotless i would turn "INT" into an unknown type name).
        colType = colType.toLowerCase(java.util.Locale.ROOT);
        newStats.setColumnType(colType);
        newStats.setAvgColLen(getAvgColLenOf(conf, oi, colType));
        return newStats;
      }
    }
    if (conf.getBoolVar(ConfVars.HIVE_STATS_ESTIMATORS_ENABLE)) {
      Optional<StatEstimatorProvider> sep = engfd.getGenericUDF().adapt(StatEstimatorProvider.class);
      if (sep.isPresent()) {
        StatEstimator se = sep.get().getStatEstimator();
        List<ColStatistics> csList = new ArrayList<>();
        for (ExprNodeDesc child : engfd.getChildren()) {
          ColStatistics cs = getColStatisticsFromExpression(conf, parentStats, child);
          if (cs == null) {
            // the estimator needs statistics for every argument; give up on the first gap
            break;
          }
          csList.add(cs);
        }
        if (csList.size() == engfd.getChildren().size()) {
          Optional<ColStatistics> res = se.estimate(csList);
          if (res.isPresent()) {
            ColStatistics newStats = res.get();
            colType = colType.toLowerCase(java.util.Locale.ROOT);
            newStats.setColumnType(colType);
            newStats.setColumnName(colName);
            return newStats;
          }
        }
      }
    }
    // fallback to default
    countDistincts = getNDVFor(engfd, numRows, parentStats);
  } else if (end instanceof ExprNodeColumnListDesc) {
    // column list
    ExprNodeColumnListDesc encd = (ExprNodeColumnListDesc) end;
    colName = Joiner.on(",").join(encd.getCols());
    colType = serdeConstants.LIST_TYPE_NAME;
    countDistincts = numRows;
  } else if (end instanceof ExprNodeFieldDesc) {
    // field within complex type
    ExprNodeFieldDesc enfd = (ExprNodeFieldDesc) end;
    colName = enfd.getFieldName();
    colType = enfd.getTypeString();
    countDistincts = numRows;
  } else if (end instanceof ExprDynamicParamDesc) {
    // no column statistics object is created for dynamic parameters
    return null;
  } else {
    throw new IllegalArgumentException("not supported expr type " + end.getClass());
  }
  // Synthetic statistics: only type, average length and NDV are known; nulls are reported as 0.
  colType = colType.toLowerCase(java.util.Locale.ROOT);
  ColStatistics colStats = new ColStatistics(colName, colType);
  colStats.setAvgColLen(getAvgColLenOf(conf, oi, colType));
  colStats.setCountDistint(countDistincts);
  colStats.setNumNulls(0L);
  return colStats;
}
Aggregations