
Example 41 with ColStatistics

Use of org.apache.hadoop.hive.ql.plan.ColStatistics in the Apache Hive project.

In the class StatsUtils, the method getTableColumnStats:

/**
 * Get table level column statistics from metastore for needed columns
 * @param table
 *          - table
 * @param schema
 *          - output schema
 * @param neededColumns
 *          - list of needed columns
 * @return column statistics
 */
public static List<ColStatistics> getTableColumnStats(Table table, List<ColumnInfo> schema, List<String> neededColumns, ColumnStatsList colStatsCache) {
    if (table.isMaterializedTable()) {
        LOG.debug("Materialized table does not contain table statistics");
        return null;
    }
    // We will retrieve stats from the metastore only for columns that are not cached
    List<String> colStatsToRetrieve;
    if (colStatsCache != null) {
        colStatsToRetrieve = new ArrayList<>(neededColumns.size());
        for (String colName : neededColumns) {
            if (!colStatsCache.getColStats().containsKey(colName)) {
                colStatsToRetrieve.add(colName);
            }
        }
    } else {
        colStatsToRetrieve = neededColumns;
    }
    // Retrieve stats from metastore
    String dbName = table.getDbName();
    String tabName = table.getTableName();
    List<ColStatistics> stats = null;
    try {
        List<ColumnStatisticsObj> colStat = Hive.get().getTableColumnStatistics(dbName, tabName, colStatsToRetrieve);
        stats = convertColStats(colStat, tabName);
    } catch (HiveException e) {
        LOG.error("Failed to retrieve table statistics: ", e);
        stats = new ArrayList<ColStatistics>();
    }
    // Merge the cached stats with the stats just retrieved from the metastore
    if (colStatsCache != null) {
        for (String col : neededColumns) {
            ColStatistics cs = colStatsCache.getColStats().get(col);
            if (cs != null) {
                stats.add(cs);
                if (LOG.isDebugEnabled()) {
                    LOG.debug("Stats for column " + cs.getColumnName() + " in table " + table.getCompleteName() + " retrieved from cache");
                }
            }
        }
    }
    return stats;
}
Also used : ColumnStatisticsObj(org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj) HiveException(org.apache.hadoop.hive.ql.metadata.HiveException) ColStatistics(org.apache.hadoop.hive.ql.plan.ColStatistics) ArrayList(java.util.ArrayList)
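
The interesting part of this method is the cache-first control flow: only the columns missing from colStatsCache are fetched from the metastore, and the cached entries are merged back into the result afterwards. The following self-contained sketch isolates that pattern; the Stat record, the plain Map used as the cache, and fetchFromMetastore are hypothetical stand-ins for ColStatistics, ColumnStatsList, and the Hive metastore call, not real Hive APIs.

import java.util.*;

// Minimal sketch of the cache-first lookup used by getTableColumnStats.
// Stat and fetchFromMetastore are illustrative placeholders only.
public class CacheFirstStatsLookup {

    record Stat(String columnName, long countDistinct) {}

    // Stand-in for the expensive metastore round trip.
    static List<Stat> fetchFromMetastore(List<String> columns) {
        List<Stat> fetched = new ArrayList<>();
        for (String col : columns) {
            fetched.add(new Stat(col, 100L)); // placeholder NDV
        }
        return fetched;
    }

    static List<Stat> getColumnStats(List<String> neededColumns, Map<String, Stat> cache) {
        // 1. Ask the metastore only for columns that are not cached.
        List<String> toRetrieve = new ArrayList<>();
        for (String col : neededColumns) {
            if (cache == null || !cache.containsKey(col)) {
                toRetrieve.add(col);
            }
        }
        List<Stat> stats = fetchFromMetastore(toRetrieve);
        // 2. Merge the cached entries back into the result.
        if (cache != null) {
            for (String col : neededColumns) {
                Stat cached = cache.get(col);
                if (cached != null) {
                    stats.add(cached);
                }
            }
        }
        return stats;
    }

    public static void main(String[] args) {
        Map<String, Stat> cache = Map.of("id", new Stat("id", 5000L));
        for (Stat s : getColumnStats(List.of("id", "name"), cache)) {
            // "name" comes from the simulated metastore, "id" from the cache.
            System.out.println(s.columnName() + " -> " + s.countDistinct());
        }
    }
}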

Example 42 with ColStatistics

Use of org.apache.hadoop.hive.ql.plan.ColStatistics in the Apache Hive project.

In the class TezCompiler, the method getBloomFilterBenefit:

private static double getBloomFilterBenefit(SelectOperator sel, ExprNodeDesc selExpr, FilterOperator fil, ExprNodeDesc tsExpr) {
    double benefit = -1;
    Statistics selStats = sel.getStatistics();
    Statistics filStats = fil.getStatistics();
    if (selStats == null || filStats == null) {
        LOG.debug("No stats available to compute BloomFilter benefit");
        return benefit;
    }
    // For cardinality values use numRows as default, try to use ColStats if available
    long selKeyCardinality = selStats.getNumRows();
    long tsKeyCardinality = filStats.getNumRows();
    long tsRows = filStats.getNumRows();
    long tsRowSize = filStats.getAvgRowSize();
    long keyDomainCardinality = selKeyCardinality + tsKeyCardinality;
    ExprNodeColumnDesc selCol = ExprNodeDescUtils.getColumnExpr(selExpr);
    ExprNodeColumnDesc tsCol = ExprNodeDescUtils.getColumnExpr(tsExpr);
    if (selCol != null && tsCol != null) {
        // Check if there are column stats available for these columns
        ColStatistics selColStat = selStats.getColumnStatisticsFromColName(selCol.getColumn());
        ColStatistics filColStat = filStats.getColumnStatisticsFromColName(tsCol.getColumn());
        if (canUseNDV(selColStat)) {
            selKeyCardinality = selColStat.getCountDistint();
        }
        if (canUseNDV(filColStat)) {
            tsKeyCardinality = filColStat.getCountDistint();
        }
        // Get colstats for the original table column for selCol if possible, this would have
        // more accurate information about the original NDV of the column before any filtering.
        ColStatistics selColSourceStat = null;
        if (selColStat != null) {
            ExprNodeDescUtils.ColumnOrigin selColSource = ExprNodeDescUtils.findColumnOrigin(selCol, sel);
            if (selColSource != null && selColSource.op.getStatistics() != null) {
                selColSourceStat = selColSource.op.getStatistics().getColumnStatisticsFromColName(selColSource.col.getColumn());
            }
        }
        long domainCardinalityFromColStats = getCombinedKeyDomainCardinality(selColStat, selColSourceStat, filColStat);
        if (domainCardinalityFromColStats >= 0) {
            keyDomainCardinality = domainCardinalityFromColStats;
        }
    }
    // Selectivity: key cardinality of semijoin / domain cardinality
    // Benefit (rows filtered from ts): (1 - selectivity) * # ts rows
    double selectivity = selKeyCardinality / (double) keyDomainCardinality;
    selectivity = Math.min(selectivity, 1);
    benefit = tsRows * (1 - selectivity);
    if (LOG.isDebugEnabled()) {
        LOG.debug("BloomFilter benefit for " + selCol + " to " + tsCol + ", selKeyCardinality=" + selKeyCardinality + ", tsKeyCardinality=" + tsKeyCardinality + ", tsRows=" + tsRows + ", keyDomainCardinality=" + keyDomainCardinality);
        LOG.debug("SemiJoin key selectivity=" + selectivity + ", benefit=" + benefit);
    }
    return benefit;
}
Also used : ExprNodeColumnDesc(org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc) ColStatistics(org.apache.hadoop.hive.ql.plan.ColStatistics) ExprNodeDescUtils(org.apache.hadoop.hive.ql.plan.ExprNodeDescUtils) AnnotateWithStatistics(org.apache.hadoop.hive.ql.optimizer.stats.annotation.AnnotateWithStatistics) Statistics(org.apache.hadoop.hive.ql.plan.Statistics)
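
Once the cardinalities are settled, the benefit itself is two lines of arithmetic: selectivity is the semijoin key cardinality divided by the combined key domain cardinality (capped at 1), and the benefit is the number of table scan rows expected to be filtered out. Below is a standalone sketch of just that arithmetic; the class name and numbers are illustrative, not taken from Hive.

// Hedged illustration of the tail of getBloomFilterBenefit; the inputs are made up.
public class BloomFilterBenefitSketch {

    static double benefit(long selKeyCardinality, long keyDomainCardinality, long tsRows) {
        // Selectivity: fraction of the key domain produced by the semijoin build side.
        double selectivity = Math.min(selKeyCardinality / (double) keyDomainCardinality, 1.0);
        // Benefit: rows the bloom filter is expected to drop on the table scan side.
        return tsRows * (1 - selectivity);
    }

    public static void main(String[] args) {
        // 1,000 distinct build-side keys over a domain of 100,000 keys,
        // scanning 10,000,000 rows: roughly 99% of the rows should be filtered.
        System.out.println(benefit(1_000L, 100_000L, 10_000_000L)); // 9900000.0
    }
}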

Aggregations

ColStatistics (org.apache.hadoop.hive.ql.plan.ColStatistics): 42 usages
ArrayList (java.util.ArrayList): 14 usages
Statistics (org.apache.hadoop.hive.ql.plan.Statistics): 8 usages
HashSet (java.util.HashSet): 5 usages
ColumnInfo (org.apache.hadoop.hive.ql.exec.ColumnInfo): 5 usages
HashMap (java.util.HashMap): 4 usages
ColumnStatisticsObj (org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj): 4 usages
HiveException (org.apache.hadoop.hive.ql.metadata.HiveException): 4 usages
ExprNodeColumnDesc (org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc): 4 usages
ExprNodeDesc (org.apache.hadoop.hive.ql.plan.ExprNodeDesc): 4 usages
List (java.util.List): 3 usages
ImmutableBitSet (org.apache.calcite.util.ImmutableBitSet): 3 usages
AggrStats (org.apache.hadoop.hive.metastore.api.AggrStats): 3 usages
ReduceSinkOperator (org.apache.hadoop.hive.ql.exec.ReduceSinkOperator): 3 usages
RelMetadataQuery (org.apache.calcite.rel.metadata.RelMetadataQuery): 2 usages
ColumnStatisticsData (org.apache.hadoop.hive.metastore.api.ColumnStatisticsData): 2 usages
Partition (org.apache.hadoop.hive.ql.metadata.Partition): 2 usages
PartitionIterable (org.apache.hadoop.hive.ql.metadata.PartitionIterable): 2 usages
AnnotateWithStatistics (org.apache.hadoop.hive.ql.optimizer.stats.annotation.AnnotateWithStatistics): 2 usages
ColumnStatsList (org.apache.hadoop.hive.ql.parse.ColumnStatsList): 2 usages