Search in sources :

Example 6 with NumDistinctValueEstimator

use of org.apache.hadoop.hive.metastore.NumDistinctValueEstimator in project hive by apache.

the class ColumnStatsMergerFactory method getColumnStatsMerger.

/**
 * Selects the {@link ColumnStatsMerger} implementation that matches the statistics type
 * carried by both arguments.
 *
 * <p>For long, double, string and decimal statistics the number of bit vectors is counted on
 * both objects via {@code countNumBitVectors}. When the two counts are equal and greater than
 * zero, the returned merger is given a {@link NumDistinctValueEstimator} built with that count;
 * otherwise its {@code ndvEstimator} field is not assigned here. Boolean and binary statistics
 * never get an estimator from this method.
 *
 * @param statsObjNew the newer column statistics object
 * @param statsObjOld the older column statistics object
 * @return a merger for the shared statistics type
 * @throws IllegalArgumentException if the two objects do not have the same statistics type set,
 *         or if no statistics type is set
 * @throws RuntimeException if the shared statistics type is not one handled below
 */
public static ColumnStatsMerger getColumnStatsMerger(ColumnStatisticsObj statsObjNew, ColumnStatisticsObj statsObjOld) {
    ColumnStatsMerger agg;
    _Fields typeNew = statsObjNew.getStatsData().getSetField();
    _Fields typeOld = statsObjOld.getStatsData().getSetField();
    // Both objects must carry the same kind of statistics. Check this explicitly: switching on
    // a null enum value would otherwise fail with a NullPointerException that carries no message.
    if (typeNew == null || typeNew != typeOld) {
        throw new IllegalArgumentException("The column stats types must match: new=" + typeNew + ", old=" + typeOld);
    }
    // Stays 0 unless both sides report the same, positive number of bit vectors.
    int numBitVectors = 0;
    switch(typeNew) {
        case BOOLEAN_STATS:
            agg = new BooleanColumnStatsMerger();
            break;
        case LONG_STATS:
            {
                agg = new LongColumnStatsMerger();
                int nbvNew = countNumBitVectors(statsObjNew.getStatsData().getLongStats().getBitVectors());
                int nbvOld = countNumBitVectors(statsObjOld.getStatsData().getLongStats().getBitVectors());
                numBitVectors = nbvNew == nbvOld ? nbvNew : 0;
                break;
            }
        case DOUBLE_STATS:
            {
                agg = new DoubleColumnStatsMerger();
                int nbvNew = countNumBitVectors(statsObjNew.getStatsData().getDoubleStats().getBitVectors());
                int nbvOld = countNumBitVectors(statsObjOld.getStatsData().getDoubleStats().getBitVectors());
                numBitVectors = nbvNew == nbvOld ? nbvNew : 0;
                break;
            }
        case STRING_STATS:
            {
                agg = new StringColumnStatsMerger();
                int nbvNew = countNumBitVectors(statsObjNew.getStatsData().getStringStats().getBitVectors());
                int nbvOld = countNumBitVectors(statsObjOld.getStatsData().getStringStats().getBitVectors());
                numBitVectors = nbvNew == nbvOld ? nbvNew : 0;
                break;
            }
        case BINARY_STATS:
            agg = new BinaryColumnStatsMerger();
            break;
        case DECIMAL_STATS:
            {
                agg = new DecimalColumnStatsMerger();
                int nbvNew = countNumBitVectors(statsObjNew.getStatsData().getDecimalStats().getBitVectors());
                int nbvOld = countNumBitVectors(statsObjOld.getStatsData().getDecimalStats().getBitVectors());
                numBitVectors = nbvNew == nbvOld ? nbvNew : 0;
                break;
            }
        default:
            throw new RuntimeException("Woh, bad.  Unknown stats type " + typeNew.toString());
    }
    // Only attach an estimator when there are bit vectors to merge.
    if (numBitVectors > 0) {
        agg.ndvEstimator = new NumDistinctValueEstimator(numBitVectors);
    }
    return agg;
}
Also used : ColumnStatisticsData._Fields(org.apache.hadoop.hive.metastore.api.ColumnStatisticsData._Fields) NumDistinctValueEstimator(org.apache.hadoop.hive.metastore.NumDistinctValueEstimator)

Example 7 with NumDistinctValueEstimator

use of org.apache.hadoop.hive.metastore.NumDistinctValueEstimator in project hive by apache.

the class DoubleColumnStatsMerger method merge.

/**
 * Folds {@code newColStats} into {@code aggregateColStats} in place, using the double-typed
 * statistics of both objects.
 *
 * <p>Low value becomes the minimum, high value the maximum, and null counts are summed. The
 * distinct-value count is taken from the bit-vector estimator when one is available and the
 * new statistics carry a non-empty bit-vector string; otherwise the larger of the two existing
 * counts is kept.
 *
 * @param aggregateColStats statistics object that receives the merged values
 * @param newColStats statistics object whose values are merged in
 */
@Override
public void merge(ColumnStatisticsObj aggregateColStats, ColumnStatisticsObj newColStats) {
    DoubleColumnStatsData aggregateData = aggregateColStats.getStatsData().getDoubleStats();
    DoubleColumnStatsData newData = newColStats.getStatsData().getDoubleStats();

    // Range and null count are merged the same way regardless of how NDV is handled.
    aggregateData.setLowValue(
        Math.min(aggregateData.getLowValue(), newData.getLowValue()));
    aggregateData.setHighValue(
        Math.max(aggregateData.getHighValue(), newData.getHighValue()));
    aggregateData.setNumNulls(
        aggregateData.getNumNulls() + newData.getNumNulls());

    boolean bitVectorsUsable = ndvEstimator != null
        && newData.isSetBitVectors()
        && newData.getBitVectors().length() != 0;

    if (!bitVectorsUsable) {
        // No estimator or no bit vectors on the new side: keep the larger NDV.
        aggregateData.setNumDVs(
            Math.max(aggregateData.getNumDVs(), newData.getNumDVs()));
        return;
    }

    // Feed both bit-vector strings into the estimator, aggregate side first.
    NumDistinctValueEstimator aggregateEstimator =
        new NumDistinctValueEstimator(aggregateData.getBitVectors(), ndvEstimator.getnumBitVectors());
    ndvEstimator.mergeEstimators(aggregateEstimator);
    NumDistinctValueEstimator newEstimator =
        new NumDistinctValueEstimator(newData.getBitVectors(), ndvEstimator.getnumBitVectors());
    ndvEstimator.mergeEstimators(newEstimator);

    long ndv = ndvEstimator.estimateNumDistinctValues();
    // Logged before the aggregate is overwritten so the message shows the pre-merge NDVs.
    LOG.debug("Use bitvector to merge column " + aggregateColStats.getColName()
        + "'s ndvs of " + aggregateData.getNumDVs()
        + " and " + newData.getNumDVs()
        + " to be " + ndv);
    aggregateData.setNumDVs(ndv);
    aggregateData.setBitVectors(ndvEstimator.serialize().toString());
}
Also used : DoubleColumnStatsData(org.apache.hadoop.hive.metastore.api.DoubleColumnStatsData) NumDistinctValueEstimator(org.apache.hadoop.hive.metastore.NumDistinctValueEstimator)

Example 8 with NumDistinctValueEstimator

use of org.apache.hadoop.hive.metastore.NumDistinctValueEstimator in project hive by apache.

the class LongColumnStatsMerger method merge.

/**
 * Folds {@code newColStats} into {@code aggregateColStats} in place, using the long-typed
 * statistics of both objects.
 *
 * <p>Low value becomes the minimum, high value the maximum, and null counts are summed. The
 * distinct-value count is taken from the bit-vector estimator when one is available and the
 * new statistics carry a non-empty bit-vector string; otherwise the larger of the two existing
 * counts is kept.
 *
 * @param aggregateColStats statistics object that receives the merged values
 * @param newColStats statistics object whose values are merged in
 */
@Override
public void merge(ColumnStatisticsObj aggregateColStats, ColumnStatisticsObj newColStats) {
    LongColumnStatsData aggregateData = aggregateColStats.getStatsData().getLongStats();
    LongColumnStatsData newData = newColStats.getStatsData().getLongStats();

    // Range and null count are merged the same way regardless of how NDV is handled.
    aggregateData.setLowValue(
        Math.min(aggregateData.getLowValue(), newData.getLowValue()));
    aggregateData.setHighValue(
        Math.max(aggregateData.getHighValue(), newData.getHighValue()));
    aggregateData.setNumNulls(
        aggregateData.getNumNulls() + newData.getNumNulls());

    boolean bitVectorsUsable = ndvEstimator != null
        && newData.isSetBitVectors()
        && newData.getBitVectors().length() != 0;

    if (!bitVectorsUsable) {
        // No estimator or no bit vectors on the new side: keep the larger NDV.
        aggregateData.setNumDVs(
            Math.max(aggregateData.getNumDVs(), newData.getNumDVs()));
        return;
    }

    // Feed both bit-vector strings into the estimator, aggregate side first.
    NumDistinctValueEstimator aggregateEstimator =
        new NumDistinctValueEstimator(aggregateData.getBitVectors(), ndvEstimator.getnumBitVectors());
    ndvEstimator.mergeEstimators(aggregateEstimator);
    NumDistinctValueEstimator newEstimator =
        new NumDistinctValueEstimator(newData.getBitVectors(), ndvEstimator.getnumBitVectors());
    ndvEstimator.mergeEstimators(newEstimator);

    long ndv = ndvEstimator.estimateNumDistinctValues();
    // Logged before the aggregate is overwritten so the message shows the pre-merge NDVs.
    LOG.debug("Use bitvector to merge column " + aggregateColStats.getColName()
        + "'s ndvs of " + aggregateData.getNumDVs()
        + " and " + newData.getNumDVs()
        + " to be " + ndv);
    aggregateData.setNumDVs(ndv);
    aggregateData.setBitVectors(ndvEstimator.serialize().toString());
}
Also used : LongColumnStatsData(org.apache.hadoop.hive.metastore.api.LongColumnStatsData) NumDistinctValueEstimator(org.apache.hadoop.hive.metastore.NumDistinctValueEstimator)

Example 9 with NumDistinctValueEstimator

use of org.apache.hadoop.hive.metastore.NumDistinctValueEstimator in project hive by apache.

the class StringColumnStatsMerger method merge.

/**
 * Folds {@code newColStats} into {@code aggregateColStats} in place, using the string-typed
 * statistics of both objects.
 *
 * <p>Maximum and average column length each become the larger of the two values, and null
 * counts are summed. The distinct-value count is taken from the bit-vector estimator when one
 * is available and the new statistics carry a non-empty bit-vector string; otherwise the larger
 * of the two existing counts is kept.
 *
 * @param aggregateColStats statistics object that receives the merged values
 * @param newColStats statistics object whose values are merged in
 */
@Override
public void merge(ColumnStatisticsObj aggregateColStats, ColumnStatisticsObj newColStats) {
    StringColumnStatsData aggregateData = aggregateColStats.getStatsData().getStringStats();
    StringColumnStatsData newData = newColStats.getStatsData().getStringStats();

    // Length figures and null count are merged the same way regardless of how NDV is handled.
    aggregateData.setMaxColLen(
        Math.max(aggregateData.getMaxColLen(), newData.getMaxColLen()));
    aggregateData.setAvgColLen(
        Math.max(aggregateData.getAvgColLen(), newData.getAvgColLen()));
    aggregateData.setNumNulls(
        aggregateData.getNumNulls() + newData.getNumNulls());

    boolean bitVectorsUsable = ndvEstimator != null
        && newData.isSetBitVectors()
        && newData.getBitVectors().length() != 0;

    if (!bitVectorsUsable) {
        // No estimator or no bit vectors on the new side: keep the larger NDV.
        aggregateData.setNumDVs(
            Math.max(aggregateData.getNumDVs(), newData.getNumDVs()));
        return;
    }

    // Feed both bit-vector strings into the estimator, aggregate side first.
    NumDistinctValueEstimator aggregateEstimator =
        new NumDistinctValueEstimator(aggregateData.getBitVectors(), ndvEstimator.getnumBitVectors());
    ndvEstimator.mergeEstimators(aggregateEstimator);
    NumDistinctValueEstimator newEstimator =
        new NumDistinctValueEstimator(newData.getBitVectors(), ndvEstimator.getnumBitVectors());
    ndvEstimator.mergeEstimators(newEstimator);

    long ndv = ndvEstimator.estimateNumDistinctValues();
    // Logged before the aggregate is overwritten so the message shows the pre-merge NDVs.
    LOG.debug("Use bitvector to merge column " + aggregateColStats.getColName()
        + "'s ndvs of " + aggregateData.getNumDVs()
        + " and " + newData.getNumDVs()
        + " to be " + ndv);
    aggregateData.setNumDVs(ndv);
    aggregateData.setBitVectors(ndvEstimator.serialize().toString());
}
Also used : StringColumnStatsData(org.apache.hadoop.hive.metastore.api.StringColumnStatsData) NumDistinctValueEstimator(org.apache.hadoop.hive.metastore.NumDistinctValueEstimator)

Aggregations

NumDistinctValueEstimator (org.apache.hadoop.hive.metastore.NumDistinctValueEstimator): 9; ColumnStatistics (org.apache.hadoop.hive.metastore.api.ColumnStatistics): 4; ColumnStatisticsData (org.apache.hadoop.hive.metastore.api.ColumnStatisticsData): 4; ColumnStatisticsObj (org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj): 4; MetaException (org.apache.hadoop.hive.metastore.api.MetaException): 4; HashMap (java.util.HashMap): 3; DecimalColumnStatsData (org.apache.hadoop.hive.metastore.api.DecimalColumnStatsData): 2; DoubleColumnStatsData (org.apache.hadoop.hive.metastore.api.DoubleColumnStatsData): 2; LongColumnStatsData (org.apache.hadoop.hive.metastore.api.LongColumnStatsData): 2; StringColumnStatsData (org.apache.hadoop.hive.metastore.api.StringColumnStatsData): 2; ColumnStatisticsData._Fields (org.apache.hadoop.hive.metastore.api.ColumnStatisticsData._Fields): 1; Decimal (org.apache.hadoop.hive.metastore.api.Decimal): 1