Search in sources :

Example 11 with NumDistinctValueEstimator

Use of org.apache.hadoop.hive.common.ndv.NumDistinctValueEstimator in the Apache Hive project.

From the class LongColumnStatsMerger, method merge.

/**
 * Merges the long-column statistics of {@code newColStats} into {@code aggregateColStats}, in place.
 *
 * <p>The low value becomes the minimum of both sides, the high value the maximum, and the null
 * counts are summed. The number of distinct values (NDV) is taken from the merged estimators when
 * both sides expose an estimator and the two report that they can be merged; in every other case
 * it falls back to the larger of the two existing NDV counts.
 *
 * @param aggregateColStats statistics accumulated so far; its long stats are updated by this call
 * @param newColStats statistics to fold into the aggregate
 * @throws ClassCastException if either side's long stats is not a {@code LongColumnStatsDataInspector}
 */
@Override
public void merge(ColumnStatisticsObj aggregateColStats, ColumnStatisticsObj newColStats) {
    LongColumnStatsDataInspector aggregateData = (LongColumnStatsDataInspector) aggregateColStats.getStatsData().getLongStats();
    LongColumnStatsDataInspector newData = (LongColumnStatsDataInspector) newColStats.getStatsData().getLongStats();

    // NOTE(review): this assumes both sides actually carry low/high values. If the getters return
    // a default for an unset field, min/max could be skewed by that default - confirm with callers.
    aggregateData.setLowValue(Math.min(aggregateData.getLowValue(), newData.getLowValue()));
    aggregateData.setHighValue(Math.max(aggregateData.getHighValue(), newData.getHighValue()));
    aggregateData.setNumNulls(aggregateData.getNumNulls() + newData.getNumNulls());

    // Without an estimator on both sides the best available answer is the larger NDV count.
    if (aggregateData.getNdvEstimator() == null || newData.getNdvEstimator() == null) {
        aggregateData.setNumDVs(Math.max(aggregateData.getNumDVs(), newData.getNumDVs()));
        return;
    }

    NumDistinctValueEstimator oldEst = aggregateData.getNdvEstimator();
    NumDistinctValueEstimator newEst = newData.getNdvEstimator();
    final boolean mergeable = oldEst.canMerge(newEst);
    final long ndv;
    if (mergeable) {
        oldEst.mergeEstimators(newEst);
        ndv = oldEst.estimateNumDistinctValues();
        // Store the merged estimator back on the aggregate.
        aggregateData.setNdvEstimator(oldEst);
    } else {
        ndv = Math.max(aggregateData.getNumDVs(), newData.getNumDVs());
    }

    // Build the message only when it will be emitted, and say which strategy produced the value:
    // the bitvector wording is only accurate when the estimators were actually merged.
    if (LOG.isDebugEnabled()) {
        String strategy = mergeable
                ? "Use bitvector to merge column "
                : "Bitvectors cannot be merged; use max to merge column ";
        LOG.debug(strategy + aggregateColStats.getColName() + "'s ndvs of " + aggregateData.getNumDVs() + " and " + newData.getNumDVs() + " to be " + ndv);
    }
    aggregateData.setNumDVs(ndv);
}
Also used : LongColumnStatsDataInspector(org.apache.hadoop.hive.metastore.columnstats.cache.LongColumnStatsDataInspector) NumDistinctValueEstimator(org.apache.hadoop.hive.common.ndv.NumDistinctValueEstimator)

Aggregations

NumDistinctValueEstimator (org.apache.hadoop.hive.common.ndv.NumDistinctValueEstimator)11 HashMap (java.util.HashMap)5 ColumnStatisticsData (org.apache.hadoop.hive.metastore.api.ColumnStatisticsData)5 ColumnStatisticsObj (org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj)5 ColStatsObjWithSourceInfo (org.apache.hadoop.hive.metastore.utils.MetaStoreUtils.ColStatsObjWithSourceInfo)5 DateColumnStatsDataInspector (org.apache.hadoop.hive.metastore.columnstats.cache.DateColumnStatsDataInspector)2 DecimalColumnStatsDataInspector (org.apache.hadoop.hive.metastore.columnstats.cache.DecimalColumnStatsDataInspector)2 DoubleColumnStatsDataInspector (org.apache.hadoop.hive.metastore.columnstats.cache.DoubleColumnStatsDataInspector)2 LongColumnStatsDataInspector (org.apache.hadoop.hive.metastore.columnstats.cache.LongColumnStatsDataInspector)2 StringColumnStatsDataInspector (org.apache.hadoop.hive.metastore.columnstats.cache.StringColumnStatsDataInspector)2 ByteArrayInputStream (java.io.ByteArrayInputStream)1 IOException (java.io.IOException)1 InputStream (java.io.InputStream)1 Date (org.apache.hadoop.hive.metastore.api.Date)1 DateColumnStatsData (org.apache.hadoop.hive.metastore.api.DateColumnStatsData)1 Decimal (org.apache.hadoop.hive.metastore.api.Decimal)1 DecimalColumnStatsData (org.apache.hadoop.hive.metastore.api.DecimalColumnStatsData)1 DoubleColumnStatsData (org.apache.hadoop.hive.metastore.api.DoubleColumnStatsData)1 LongColumnStatsData (org.apache.hadoop.hive.metastore.api.LongColumnStatsData)1