Search in sources :

Example 66 with ColumnStatisticsData._Fields

use of org.apache.hadoop.hive.metastore.api.ColumnStatisticsData._Fields in project hive by apache.

the class StringColumnStatsAggregator method aggregate.

/**
 * Aggregates the per-partition string column statistics in {@code css} into a single
 * {@link ColumnStatisticsObj} for the column {@code colName}.
 *
 * <p>Max column length and average column length are combined with {@code Math.max}
 * (so the aggregated average is the largest per-partition average), and null counts
 * are summed. The number of distinct values is estimated by merging the partitions'
 * bit vectors only when {@code partNames} and {@code css} have the same size and every
 * entry carries a non-empty bit vector (and {@code numBitVectors} is positive);
 * otherwise the maximum per-partition NDV is used.
 *
 * @param colName   name of the column being aggregated
 * @param partNames names of the partitions that were requested
 * @param css       per-partition statistics; each entry must hold exactly one stats object
 * @return the aggregated statistics object
 * @throws MetaException if {@code css} is null or empty, or if an inspected entry does not
 *                       contain exactly one column statistics object
 */
@Override
public ColumnStatisticsObj aggregate(String colName, List<String> partNames, List<ColumnStatistics> css) throws MetaException {
    // Without at least one entry there is nothing to derive the result object from; fail with
    // a descriptive checked exception instead of a NullPointerException further down.
    if (css == null || css.isEmpty()) {
        throw new MetaException("No column statistics were provided to aggregate for column " + colName);
    }
    ColumnStatisticsObj statsObj = null;
    // check if all the ColumnStatisticsObjs contain stats and all the ndv are
    // bitvectors. Only when both of the conditions are true, we merge bit
    // vectors. Otherwise, just use the maximum function.
    boolean doAllPartitionContainStats = partNames.size() == css.size();
    boolean isNDVBitVectorSet = true;
    for (ColumnStatistics cs : css) {
        if (cs.getStatsObjSize() != 1) {
            throw new MetaException("The number of columns should be exactly one in aggrStats, but found " + cs.getStatsObjSize());
        }
        ColumnStatisticsObj cso = cs.getStatsObjIterator().next();
        if (statsObj == null) {
            // The first entry determines the column type and stats kind of the result.
            statsObj = ColumnStatsAggregatorFactory.newColumnStaticsObj(colName, cso.getColType(), cso.getStatsData().getSetField());
        }
        if (numBitVectors <= 0 || !cso.getStatsData().getStringStats().isSetBitVectors() || cso.getStatsData().getStringStats().getBitVectors().length() == 0) {
            isNDVBitVectorSet = false;
            break;
        }
    }
    // A non-null estimator means "merge bit vectors"; null means "fall back to max NDV".
    NumDistinctValueEstimator ndvEstimator = null;
    if (doAllPartitionContainStats && isNDVBitVectorSet) {
        ndvEstimator = new NumDistinctValueEstimator(numBitVectors);
    }
    StringColumnStatsData aggregateData = null;
    for (ColumnStatistics cs : css) {
        StringColumnStatsData newData = cs.getStatsObjIterator().next().getStatsData().getStringStats();
        if (ndvEstimator != null) {
            ndvEstimator.mergeEstimators(new NumDistinctValueEstimator(newData.getBitVectors(), ndvEstimator.getnumBitVectors()));
        }
        if (aggregateData == null) {
            // Seed the aggregate with a copy so the caller's first entry is not mutated.
            aggregateData = newData.deepCopy();
        } else {
            aggregateData.setMaxColLen(Math.max(aggregateData.getMaxColLen(), newData.getMaxColLen()));
            aggregateData.setAvgColLen(Math.max(aggregateData.getAvgColLen(), newData.getAvgColLen()));
            aggregateData.setNumNulls(aggregateData.getNumNulls() + newData.getNumNulls());
            if (ndvEstimator == null) {
                aggregateData.setNumDVs(Math.max(aggregateData.getNumDVs(), newData.getNumDVs()));
            }
        }
    }
    if (ndvEstimator != null) {
        // Override the copied NDV with the estimate obtained from the merged bit vectors.
        aggregateData.setNumDVs(ndvEstimator.estimateNumDistinctValues());
    }
    ColumnStatisticsData columnStatisticsData = new ColumnStatisticsData();
    columnStatisticsData.setStringStats(aggregateData);
    statsObj.setStatsData(columnStatisticsData);
    return statsObj;
}
Also used : ColumnStatistics(org.apache.hadoop.hive.metastore.api.ColumnStatistics) ColumnStatisticsObj(org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj) StringColumnStatsData(org.apache.hadoop.hive.metastore.api.StringColumnStatsData) NumDistinctValueEstimator(org.apache.hadoop.hive.metastore.NumDistinctValueEstimator) ColumnStatisticsData(org.apache.hadoop.hive.metastore.api.ColumnStatisticsData) MetaException(org.apache.hadoop.hive.metastore.api.MetaException)

Example 67 with ColumnStatisticsData._Fields

use of org.apache.hadoop.hive.metastore.api.ColumnStatisticsData._Fields in project hive by apache.

the class ColumnStatsMergerFactory method getColumnStatsMerger.

/**
 * Chooses the {@link ColumnStatsMerger} that matches the statistics kind held by the two
 * given objects and, when both sides report the same positive bit-vector count, equips the
 * merger with a {@link NumDistinctValueEstimator} of that size.
 *
 * <p>Bit-vector counts are obtained through {@code countNumBitVectors} for long, double,
 * string and decimal statistics; boolean and binary statistics never get an estimator.
 *
 * @param statsObjNew the newer statistics object
 * @param statsObjOld the older statistics object
 * @return a merger for the shared statistics kind
 * @throws IllegalArgumentException if the two objects hold different statistics kinds, or if
 *                                  no statistics kind is set
 * @throws RuntimeException         if the shared statistics kind has no known merger
 */
public static ColumnStatsMerger getColumnStatsMerger(ColumnStatisticsObj statsObjNew, ColumnStatisticsObj statsObjOld) {
    _Fields typeNew = statsObjNew.getStatsData().getSetField();
    _Fields typeOld = statsObjOld.getStatsData().getSetField();
    // make sure that they have the same type; switching on a null enum would otherwise
    // surface as an uninformative NullPointerException.
    if (typeNew == null || typeNew != typeOld) {
        throw new IllegalArgumentException("The column statistics types must be set and must match: [" + typeNew + "::" + typeOld + "]");
    }
    ColumnStatsMerger agg;
    int numBitVectors = 0;
    switch(typeNew) {
        case BOOLEAN_STATS:
            agg = new BooleanColumnStatsMerger();
            break;
        case LONG_STATS:
            {
                agg = new LongColumnStatsMerger();
                int nbvNew = countNumBitVectors(statsObjNew.getStatsData().getLongStats().getBitVectors());
                int nbvOld = countNumBitVectors(statsObjOld.getStatsData().getLongStats().getBitVectors());
                // Bit vectors can only be merged when both sides use the same count.
                numBitVectors = nbvNew == nbvOld ? nbvNew : 0;
                break;
            }
        case DOUBLE_STATS:
            {
                agg = new DoubleColumnStatsMerger();
                int nbvNew = countNumBitVectors(statsObjNew.getStatsData().getDoubleStats().getBitVectors());
                int nbvOld = countNumBitVectors(statsObjOld.getStatsData().getDoubleStats().getBitVectors());
                numBitVectors = nbvNew == nbvOld ? nbvNew : 0;
                break;
            }
        case STRING_STATS:
            {
                agg = new StringColumnStatsMerger();
                int nbvNew = countNumBitVectors(statsObjNew.getStatsData().getStringStats().getBitVectors());
                int nbvOld = countNumBitVectors(statsObjOld.getStatsData().getStringStats().getBitVectors());
                numBitVectors = nbvNew == nbvOld ? nbvNew : 0;
                break;
            }
        case BINARY_STATS:
            agg = new BinaryColumnStatsMerger();
            break;
        case DECIMAL_STATS:
            {
                agg = new DecimalColumnStatsMerger();
                int nbvNew = countNumBitVectors(statsObjNew.getStatsData().getDecimalStats().getBitVectors());
                int nbvOld = countNumBitVectors(statsObjOld.getStatsData().getDecimalStats().getBitVectors());
                numBitVectors = nbvNew == nbvOld ? nbvNew : 0;
                break;
            }
        default:
            throw new RuntimeException("Woh, bad.  Unknown stats type " + typeNew.toString());
    }
    if (numBitVectors > 0) {
        agg.ndvEstimator = new NumDistinctValueEstimator(numBitVectors);
    }
    return agg;
}
Also used : ColumnStatisticsData._Fields(org.apache.hadoop.hive.metastore.api.ColumnStatisticsData._Fields) NumDistinctValueEstimator(org.apache.hadoop.hive.metastore.NumDistinctValueEstimator)

Example 68 with ColumnStatisticsData._Fields

use of org.apache.hadoop.hive.metastore.api.ColumnStatisticsData._Fields in project hive by apache.

the class TestHBaseStoreCached method booleanTableStatistics.

// Due to the way our mock stuff works, we can only insert one column at a time, so we'll test
// each stat type separately.  We'll test them together in the integration tests.
/**
 * Writes table-level boolean column statistics to the store, reads them back, and checks
 * that the descriptor and the boolean counters returned by the store match what was written.
 */
@Test
public void booleanTableStatistics() throws Exception {
    long now = System.currentTimeMillis();
    String dbname = "default";
    String tableName = "statstable";
    String boolcol = "boolcol";
    int startTime = (int) (System.currentTimeMillis() / 1000);
    List<FieldSchema> cols = new ArrayList<FieldSchema>();
    cols.add(new FieldSchema(boolcol, "boolean", "nocomment"));
    SerDeInfo serde = new SerDeInfo("serde", "seriallib", null);
    StorageDescriptor sd = new StorageDescriptor(cols, "file:/tmp", "input", "output", false, 0, serde, null, null, emptyParameters);
    Table table = new Table(tableName, dbname, "me", startTime, startTime, 0, sd, null, emptyParameters, null, null, null);
    store.createTable(table);
    long trues = 37;
    long falses = 12;
    long booleanNulls = 2;
    // Build the statistics that will be written to the store.
    ColumnStatistics stats = new ColumnStatistics();
    ColumnStatisticsDesc desc = new ColumnStatisticsDesc();
    desc.setLastAnalyzed(now);
    desc.setDbName(dbname);
    desc.setTableName(tableName);
    desc.setIsTblLevel(true);
    stats.setStatsDesc(desc);
    ColumnStatisticsObj obj = new ColumnStatisticsObj();
    obj.setColName(boolcol);
    obj.setColType("boolean");
    ColumnStatisticsData data = new ColumnStatisticsData();
    BooleanColumnStatsData boolData = new BooleanColumnStatsData();
    boolData.setNumTrues(trues);
    boolData.setNumFalses(falses);
    boolData.setNumNulls(booleanNulls);
    data.setBooleanStats(boolData);
    obj.setStatsData(data);
    stats.addToStatsObj(obj);
    store.updateTableColumnStatistics(stats);
    // Read the statistics back and verify the descriptor.
    stats = store.getTableColumnStatistics(dbname, tableName, Arrays.asList(boolcol));
    Assert.assertEquals(now, stats.getStatsDesc().getLastAnalyzed());
    Assert.assertEquals(dbname, stats.getStatsDesc().getDbName());
    Assert.assertEquals(tableName, stats.getStatsDesc().getTableName());
    Assert.assertTrue(stats.getStatsDesc().isIsTblLevel());
    Assert.assertEquals(1, stats.getStatsObjSize());
    // Inspect the object returned by the store, not the locally built input, so that the
    // assertions below actually exercise the round trip.
    ColumnStatisticsObj objFromStore = stats.getStatsObj().get(0);
    ColumnStatisticsData colData = objFromStore.getStatsData();
    Assert.assertEquals(ColumnStatisticsData._Fields.BOOLEAN_STATS, colData.getSetField());
    boolData = colData.getBooleanStats();
    Assert.assertEquals(trues, boolData.getNumTrues());
    Assert.assertEquals(falses, boolData.getNumFalses());
    Assert.assertEquals(booleanNulls, boolData.getNumNulls());
}
Also used : ColumnStatistics(org.apache.hadoop.hive.metastore.api.ColumnStatistics) BooleanColumnStatsData(org.apache.hadoop.hive.metastore.api.BooleanColumnStatsData) Table(org.apache.hadoop.hive.metastore.api.Table) FieldSchema(org.apache.hadoop.hive.metastore.api.FieldSchema) SerDeInfo(org.apache.hadoop.hive.metastore.api.SerDeInfo) ArrayList(java.util.ArrayList) StorageDescriptor(org.apache.hadoop.hive.metastore.api.StorageDescriptor) ColumnStatisticsObj(org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj) ColumnStatisticsDesc(org.apache.hadoop.hive.metastore.api.ColumnStatisticsDesc) ColumnStatisticsData(org.apache.hadoop.hive.metastore.api.ColumnStatisticsData) Test(org.junit.Test)

Example 69 with ColumnStatisticsData._Fields

use of org.apache.hadoop.hive.metastore.api.ColumnStatisticsData._Fields in project hive by apache.

the class TestHBaseStoreBitVector method mockStringStats.

/**
 * Builds a string-column statistics object for {@code STRING_COL} whose numeric values are
 * offset by {@code i}, with a fixed bit-vector string attached.
 *
 * @param i offset applied to the max length, average length, null count and NDV
 * @return the populated statistics object
 */
private static ColumnStatisticsObj mockStringStats(int i) {
    // Values derived from the index so that each generated object differs.
    long distinctValues = 906 + i;
    long nullCount = 987 + 10 * i;
    double averageLength = 32.3 + i;
    long maximumLength = 1234 + 10 * i;

    StringColumnStatsData stringStats = new StringColumnStatsData();
    stringStats.setBitVectors("{0, 1, 2, 3, 4, 5, 6, 7, 8}{0, 1, 3, 4, 5, 6, 7, 8}");
    stringStats.setNumDVs(distinctValues);
    stringStats.setNumNulls(nullCount);
    stringStats.setAvgColLen(averageLength);
    stringStats.setMaxColLen(maximumLength);

    // Wrap the string statistics in the union type expected by ColumnStatisticsObj.
    ColumnStatisticsData wrapper = new ColumnStatisticsData();
    wrapper.setStringStats(stringStats);

    ColumnStatisticsObj result = new ColumnStatisticsObj();
    result.setColName(STRING_COL);
    result.setColType(STRING_TYPE);
    result.setStatsData(wrapper);
    return result;
}
Also used : ColumnStatisticsObj(org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj) StringColumnStatsData(org.apache.hadoop.hive.metastore.api.StringColumnStatsData) ColumnStatisticsData(org.apache.hadoop.hive.metastore.api.ColumnStatisticsData)

Example 70 with ColumnStatisticsData._Fields

use of org.apache.hadoop.hive.metastore.api.ColumnStatisticsData._Fields in project hive by apache.

the class TestHBaseStoreBitVector method longPartitionStatistics.

/**
 * Stores partition-level long column statistics for the partition {PART_KEY, INT_VAL},
 * fetches them back from the store, and compares descriptor and long statistics field by field.
 */
@Test
public void longPartitionStatistics() throws Exception {
    createMockTableAndPartition(INT_TYPE, INT_VAL);
    // NOTE(review): the previous comment here said the table must NOT be created before
    // statistics are set, yet the table and partition are created above — confirm intent.

    // Take one of the pre-created long statistics objects and remember its expected payload.
    ColumnStatisticsObj expectedObj = longColStatsObjs.get(0);
    LongColumnStatsData expectedLong = expectedObj.getStatsData().getLongStats();

    // Assemble partition-level statistics around a default descriptor and persist them.
    ColumnStatisticsDesc partDesc = getMockPartColStatsDesc(PART_KEY, INT_VAL);
    ColumnStatistics toStore = new ColumnStatistics();
    toStore.setStatsDesc(partDesc);
    toStore.addToStatsObj(expectedObj);
    List<String> partitionValues = new ArrayList<String>();
    partitionValues.add(INT_VAL);
    store.updatePartitionColumnStatistics(toStore, partitionValues);

    // Query the store for the same partition and column.
    List<String> requestedParts = new ArrayList<String>();
    requestedParts.add(partDesc.getPartName());
    List<String> requestedCols = new ArrayList<String>();
    requestedCols.add(expectedObj.getColName());
    List<ColumnStatistics> fetchedList = store.getPartitionColumnStatistics(DB, TBL, requestedParts, requestedCols);

    // Descriptor checks.
    Assert.assertEquals(1, fetchedList.size());
    ColumnStatistics fetched = fetchedList.get(0);
    ColumnStatisticsDesc fetchedDesc = fetched.getStatsDesc();
    Assert.assertEquals(partDesc.getLastAnalyzed(), fetchedDesc.getLastAnalyzed());
    Assert.assertEquals(DB, fetchedDesc.getDbName());
    Assert.assertEquals(TBL, fetchedDesc.getTableName());
    Assert.assertFalse(fetchedDesc.isIsTblLevel());

    // Exactly one statistics object is expected, and it must carry long statistics.
    Assert.assertEquals(1, fetched.getStatsObjSize());
    ColumnStatisticsData fetchedData = fetched.getStatsObj().get(0).getStatsData();
    Assert.assertEquals(ColumnStatisticsData._Fields.LONG_STATS, fetchedData.getSetField());

    // Field-by-field comparison of the long statistics.
    LongColumnStatsData actualLong = fetchedData.getLongStats();
    Assert.assertEquals(expectedLong.getHighValue(), actualLong.getHighValue());
    Assert.assertEquals(expectedLong.getLowValue(), actualLong.getLowValue());
    Assert.assertEquals(expectedLong.getNumNulls(), actualLong.getNumNulls());
    Assert.assertEquals(expectedLong.getNumDVs(), actualLong.getNumDVs());
    Assert.assertEquals(expectedLong.getBitVectors(), actualLong.getBitVectors());
}
Also used : ColumnStatistics(org.apache.hadoop.hive.metastore.api.ColumnStatistics) ColumnStatisticsObj(org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj) ColumnStatisticsDesc(org.apache.hadoop.hive.metastore.api.ColumnStatisticsDesc) ArrayList(java.util.ArrayList) LongColumnStatsData(org.apache.hadoop.hive.metastore.api.LongColumnStatsData) ColumnStatisticsData(org.apache.hadoop.hive.metastore.api.ColumnStatisticsData) Test(org.junit.Test)

Aggregations

ColumnStatisticsData (org.apache.hadoop.hive.metastore.api.ColumnStatisticsData)108 ColumnStatisticsObj (org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj)95 ColumnStatistics (org.apache.hadoop.hive.metastore.api.ColumnStatistics)62 ColumnStatisticsDesc (org.apache.hadoop.hive.metastore.api.ColumnStatisticsDesc)56 Test (org.junit.Test)53 ArrayList (java.util.ArrayList)47 LongColumnStatsData (org.apache.hadoop.hive.metastore.api.LongColumnStatsData)36 FieldSchema (org.apache.hadoop.hive.metastore.api.FieldSchema)34 Table (org.apache.hadoop.hive.metastore.api.Table)33 StorageDescriptor (org.apache.hadoop.hive.metastore.api.StorageDescriptor)32 BooleanColumnStatsData (org.apache.hadoop.hive.metastore.api.BooleanColumnStatsData)31 SerDeInfo (org.apache.hadoop.hive.metastore.api.SerDeInfo)31 Partition (org.apache.hadoop.hive.metastore.api.Partition)30 AggrStats (org.apache.hadoop.hive.metastore.api.AggrStats)29 DoubleColumnStatsData (org.apache.hadoop.hive.metastore.api.DoubleColumnStatsData)28 StringColumnStatsData (org.apache.hadoop.hive.metastore.api.StringColumnStatsData)26 DecimalColumnStatsData (org.apache.hadoop.hive.metastore.api.DecimalColumnStatsData)24 BinaryColumnStatsData (org.apache.hadoop.hive.metastore.api.BinaryColumnStatsData)23 HashMap (java.util.HashMap)22 List (java.util.List)19