Use of org.apache.hadoop.hive.ql.plan.ColStatistics in the Apache Hive project.
Class StatsUtils, method getTableColumnStats.
/**
 * Get table level column statistics from metastore for needed columns.
 * <p>
 * Columns that already have an entry in {@code colStatsCache} are served from the cache;
 * only the remaining columns are requested from the metastore. When nothing remains to be
 * retrieved the metastore is not contacted at all. If the metastore lookup fails, the error
 * is logged and only the cached statistics (if any) are returned.
 *
 * @param table
 *          - table
 * @param schema
 *          - output schema (not read by this method)
 * @param neededColumns
 *          - list of needed columns
 * @param colStatsCache
 *          - cache of column statistics keyed by column name; may be null, in which case
 *          all needed columns are requested from the metastore
 * @return column statistics (metastore results first, followed by cached entries), or
 *         null if the table is a materialized table
 */
public static List<ColStatistics> getTableColumnStats(Table table, List<ColumnInfo> schema, List<String> neededColumns, ColumnStatsList colStatsCache) {
  if (table.isMaterializedTable()) {
    LOG.debug("Materialized table does not contain table statistics");
    return null;
  }
  // We will retrieve stats from the metastore only for columns that are not cached
  List<String> colStatsToRetrieve;
  if (colStatsCache != null) {
    colStatsToRetrieve = new ArrayList<>(neededColumns.size());
    for (String colName : neededColumns) {
      if (!colStatsCache.getColStats().containsKey(colName)) {
        colStatsToRetrieve.add(colName);
      }
    }
  } else {
    colStatsToRetrieve = neededColumns;
  }
  // Retrieve stats from metastore, but only if there is something left to ask for:
  // a request with an empty column list cannot contribute any statistics.
  List<ColStatistics> stats = new ArrayList<>();
  if (!colStatsToRetrieve.isEmpty()) {
    String dbName = table.getDbName();
    String tabName = table.getTableName();
    try {
      List<ColumnStatisticsObj> colStat = Hive.get().getTableColumnStatistics(dbName, tabName, colStatsToRetrieve);
      stats = convertColStats(colStat, tabName);
    } catch (HiveException e) {
      // Best effort: fall through with the empty list so cached stats can still be returned
      LOG.error("Failed to retrieve table statistics: ", e);
    }
  }
  // Merge stats from cache with metastore cache
  if (colStatsCache != null) {
    for (String col : neededColumns) {
      ColStatistics cs = colStatsCache.getColStats().get(col);
      if (cs != null) {
        stats.add(cs);
        if (LOG.isDebugEnabled()) {
          LOG.debug("Stats for column " + cs.getColumnName() + " in table " + table.getCompleteName() + " retrieved from cache");
        }
      }
    }
  }
  return stats;
}
Use of org.apache.hadoop.hive.ql.plan.ColStatistics in the Apache Hive project.
Class TezCompiler, method getBloomFilterBenefit.
/**
 * Estimate the benefit of a semijoin bloom filter as the number of rows it is expected to
 * filter out of the table-scan side.
 * <p>
 * The estimate is {@code tsRows * (1 - selectivity)}, where selectivity is the key
 * cardinality of the semijoin source divided by the cardinality of the key domain, capped
 * at 1. Row counts are used as the default cardinalities; column statistics replace them
 * when both expressions resolve to columns and {@code canUseNDV} accepts the statistics.
 *
 * @param sel
 *          - select operator feeding the semijoin
 * @param selExpr
 *          - key expression on the select side
 * @param fil
 *          - filter operator whose statistics describe the table-scan side
 * @param tsExpr
 *          - key expression on the table-scan side
 * @return estimated number of rows filtered, or -1 if either operator has no statistics
 */
private static double getBloomFilterBenefit(SelectOperator sel, ExprNodeDesc selExpr, FilterOperator fil, ExprNodeDesc tsExpr) {
  double benefit = -1;
  Statistics selStats = sel.getStatistics();
  Statistics filStats = fil.getStatistics();
  if (selStats == null || filStats == null) {
    LOG.debug("No stats available to compute BloomFilter benefit");
    return benefit;
  }
  // For cardinality values use numRows as default, try to use ColStats if available
  long selKeyCardinality = selStats.getNumRows();
  long tsKeyCardinality = filStats.getNumRows();
  long tsRows = filStats.getNumRows();
  long keyDomainCardinality = selKeyCardinality + tsKeyCardinality;
  ExprNodeColumnDesc selCol = ExprNodeDescUtils.getColumnExpr(selExpr);
  ExprNodeColumnDesc tsCol = ExprNodeDescUtils.getColumnExpr(tsExpr);
  if (selCol != null && tsCol != null) {
    // Check if there are column stats available for these columns
    ColStatistics selColStat = selStats.getColumnStatisticsFromColName(selCol.getColumn());
    ColStatistics filColStat = filStats.getColumnStatisticsFromColName(tsCol.getColumn());
    if (canUseNDV(selColStat)) {
      selKeyCardinality = selColStat.getCountDistint();
    }
    if (canUseNDV(filColStat)) {
      tsKeyCardinality = filColStat.getCountDistint();
    }
    // Get colstats for the original table column for selCol if possible, this would have
    // more accurate information about the original NDV of the column before any filtering.
    ColStatistics selColSourceStat = null;
    if (selColStat != null) {
      ExprNodeDescUtils.ColumnOrigin selColSource = ExprNodeDescUtils.findColumnOrigin(selCol, sel);
      if (selColSource != null && selColSource.op.getStatistics() != null) {
        selColSourceStat = selColSource.op.getStatistics().getColumnStatisticsFromColName(selColSource.col.getColumn());
      }
    }
    long domainCardinalityFromColStats = getCombinedKeyDomainCardinality(selColStat, selColSourceStat, filColStat);
    if (domainCardinalityFromColStats >= 0) {
      keyDomainCardinality = domainCardinalityFromColStats;
    }
  }
  // Selectivity: key cardinality of semijoin / domain cardinality
  // Benefit (rows filtered from ts): (1 - selectivity) * # ts rows
  double selectivity;
  if (keyDomainCardinality <= 0) {
    // An empty (or overflowed) key domain would make the division yield NaN or a negative
    // ratio; treat it as "nothing can be filtered" so the benefit stays a well-defined 0.
    selectivity = 1;
  } else {
    selectivity = Math.min(selKeyCardinality / (double) keyDomainCardinality, 1);
  }
  benefit = tsRows * (1 - selectivity);
  if (LOG.isDebugEnabled()) {
    LOG.debug("BloomFilter benefit for " + selCol + " to " + tsCol + ", selKeyCardinality=" + selKeyCardinality + ", tsKeyCardinality=" + tsKeyCardinality + ", tsRows=" + tsRows + ", keyDomainCardinality=" + keyDomainCardinality);
    LOG.debug("SemiJoin key selectivity=" + selectivity + ", benefit=" + benefit);
  }
  return benefit;
}
Aggregations