Example 61 with ColumnStatisticsData

Use of org.apache.hadoop.hive.metastore.api.ColumnStatisticsData in project hive by apache.

From the class LongColumnStatsAggregator, method extrapolate:

@Override
public void extrapolate(ColumnStatisticsData extrapolateData, int numParts, int numPartsWithStats, Map<String, Double> adjustedIndexMap, Map<String, ColumnStatisticsData> adjustedStatsMap, double densityAvg) {
    int rightBorderInd = numParts;
    LongColumnStatsData extrapolateLongData = new LongColumnStatsData();
    Map<String, LongColumnStatsData> extractedAdjustedStatsMap = new HashMap<>();
    for (Map.Entry<String, ColumnStatisticsData> entry : adjustedStatsMap.entrySet()) {
        extractedAdjustedStatsMap.put(entry.getKey(), entry.getValue().getLongStats());
    }
    List<Map.Entry<String, LongColumnStatsData>> list = new LinkedList<Map.Entry<String, LongColumnStatsData>>(extractedAdjustedStatsMap.entrySet());
    // get the lowValue
    Collections.sort(list, new Comparator<Map.Entry<String, LongColumnStatsData>>() {

        public int compare(Map.Entry<String, LongColumnStatsData> o1, Map.Entry<String, LongColumnStatsData> o2) {
            return Long.compare(o1.getValue().getLowValue(), o2.getValue().getLowValue());
        }
    });
    double minInd = adjustedIndexMap.get(list.get(0).getKey());
    double maxInd = adjustedIndexMap.get(list.get(list.size() - 1).getKey());
    long lowValue = 0;
    long min = list.get(0).getValue().getLowValue();
    long max = list.get(list.size() - 1).getValue().getLowValue();
    if (minInd == maxInd) {
        lowValue = min;
    } else if (minInd < maxInd) {
        // left border is the min
        lowValue = (long) (max - (max - min) * maxInd / (maxInd - minInd));
    } else {
        // right border is the min
        lowValue = (long) (max - (max - min) * (rightBorderInd - maxInd) / (minInd - maxInd));
    }
    // get the highValue
    Collections.sort(list, new Comparator<Map.Entry<String, LongColumnStatsData>>() {

        public int compare(Map.Entry<String, LongColumnStatsData> o1, Map.Entry<String, LongColumnStatsData> o2) {
            return Long.compare(o1.getValue().getHighValue(), o2.getValue().getHighValue());
        }
    });
    minInd = adjustedIndexMap.get(list.get(0).getKey());
    maxInd = adjustedIndexMap.get(list.get(list.size() - 1).getKey());
    long highValue = 0;
    min = list.get(0).getValue().getHighValue();
    max = list.get(list.size() - 1).getValue().getHighValue();
    if (minInd == maxInd) {
        highValue = min;
    } else if (minInd < maxInd) {
        // right border is the max
        highValue = (long) (min + (max - min) * (rightBorderInd - minInd) / (maxInd - minInd));
    } else {
        // left border is the max
        highValue = (long) (min + (max - min) * minInd / (minInd - maxInd));
    }
    // get the #nulls
    long numNulls = 0;
    for (Map.Entry<String, LongColumnStatsData> entry : extractedAdjustedStatsMap.entrySet()) {
        numNulls += entry.getValue().getNumNulls();
    }
    // we scale up numNulls based on the number of partitions
    numNulls = numNulls * numParts / numPartsWithStats;
    // get the ndv
    long ndv = 0;
    Collections.sort(list, new Comparator<Map.Entry<String, LongColumnStatsData>>() {

        public int compare(Map.Entry<String, LongColumnStatsData> o1, Map.Entry<String, LongColumnStatsData> o2) {
            return Long.compare(o1.getValue().getNumDVs(), o2.getValue().getNumDVs());
        }
    });
    // the aggregate NDV is at least the largest per-partition NDV...
    long lowerBound = list.get(list.size() - 1).getValue().getNumDVs();
    // ...and at most the sum of the per-partition NDVs
    long higherBound = 0;
    for (Map.Entry<String, LongColumnStatsData> entry : list) {
        higherBound += entry.getValue().getNumDVs();
    }
    if (useDensityFunctionForNDVEstimation && densityAvg != 0.0) {
        ndv = (long) ((highValue - lowValue) / densityAvg);
        if (ndv < lowerBound) {
            ndv = lowerBound;
        } else if (ndv > higherBound) {
            ndv = higherBound;
        }
    } else {
        minInd = adjustedIndexMap.get(list.get(0).getKey());
        maxInd = adjustedIndexMap.get(list.get(list.size() - 1).getKey());
        min = list.get(0).getValue().getNumDVs();
        max = list.get(list.size() - 1).getValue().getNumDVs();
        if (minInd == maxInd) {
            ndv = min;
        } else if (minInd < maxInd) {
            // right border is the max
            ndv = (long) (min + (max - min) * (rightBorderInd - minInd) / (maxInd - minInd));
        } else {
            // left border is the max
            ndv = (long) (min + (max - min) * minInd / (minInd - maxInd));
        }
    }
    extrapolateLongData.setLowValue(lowValue);
    extrapolateLongData.setHighValue(highValue);
    extrapolateLongData.setNumNulls(numNulls);
    extrapolateLongData.setNumDVs(ndv);
    extrapolateData.setLongStats(extrapolateLongData);
}
Also used: HashMap(java.util.HashMap) LongColumnStatsData(org.apache.hadoop.hive.metastore.api.LongColumnStatsData) LinkedList(java.util.LinkedList) Map(java.util.Map) ColumnStatisticsData(org.apache.hadoop.hive.metastore.api.ColumnStatisticsData)
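
The low and high projections above are plain linear extrapolation toward the partition borders. Below is a minimal standalone sketch of the same arithmetic, assuming partitions are indexed left to right; ExtrapolationSketch and extrapolateHigh are illustrative names, not part of Hive.

// Minimal sketch of the linear extrapolation used for highValue above.
// Only the arithmetic mirrors the aggregator; all names are illustrative.
public final class ExtrapolationSketch {

    // Given the observed min/max of a statistic and the (adjusted) partition
    // indexes where they occur, project the value out to the right border.
    static long extrapolateHigh(long min, long max,
                                double minInd, double maxInd,
                                int rightBorderInd) {
        if (minInd == maxInd) {
            // min and max sit at the same index: nothing to extrapolate from
            return min;
        } else if (minInd < maxInd) {
            // values grow left to right: project past the right border
            return (long) (min + (max - min) * (rightBorderInd - minInd) / (maxInd - minInd));
        } else {
            // values grow right to left: project past the left border
            return (long) (min + (max - min) * minInd / (minInd - maxInd));
        }
    }

    public static void main(String[] args) {
        // min=100 at index 1, max=400 at index 4, 6 partitions in total
        System.out.println(extrapolateHigh(100L, 400L, 1.0, 4.0, 6)); // prints 600
    }
}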

Example 62 with ColumnStatisticsData

Use of org.apache.hadoop.hive.metastore.api.ColumnStatisticsData in project hive by apache.

From the class StringColumnStatsAggregator, method aggregate:

@Override
public ColumnStatisticsObj aggregate(String colName, List<String> partNames, List<ColumnStatistics> css) throws MetaException {
    ColumnStatisticsObj statsObj = null;
    // Check whether all ColumnStatisticsObjs contain stats and every NDV
    // estimate carries a bit vector. Only when both conditions hold do we
    // merge the bit vectors; otherwise we fall back to taking maxima.
    boolean doAllPartitionContainStats = partNames.size() == css.size();
    boolean isNDVBitVectorSet = true;
    String colType = null;
    for (ColumnStatistics cs : css) {
        if (cs.getStatsObjSize() != 1) {
            throw new MetaException("The number of columns should be exactly one in aggrStats, but found " + cs.getStatsObjSize());
        }
        ColumnStatisticsObj cso = cs.getStatsObjIterator().next();
        if (statsObj == null) {
            colType = cso.getColType();
            statsObj = ColumnStatsAggregatorFactory.newColumnStaticsObj(colName, colType, cso.getStatsData().getSetField());
        }
        if (numBitVectors <= 0 || !cso.getStatsData().getStringStats().isSetBitVectors() || cso.getStatsData().getStringStats().getBitVectors().length() == 0) {
            isNDVBitVectorSet = false;
            break;
        }
    }
    ColumnStatisticsData columnStatisticsData = new ColumnStatisticsData();
    if (doAllPartitionContainStats && isNDVBitVectorSet) {
        StringColumnStatsData aggregateData = null;
        NumDistinctValueEstimator ndvEstimator = new NumDistinctValueEstimator(numBitVectors);
        for (ColumnStatistics cs : css) {
            ColumnStatisticsObj cso = cs.getStatsObjIterator().next();
            StringColumnStatsData newData = cso.getStatsData().getStringStats();
            ndvEstimator.mergeEstimators(new NumDistinctValueEstimator(newData.getBitVectors(), ndvEstimator.getnumBitVectors()));
            if (aggregateData == null) {
                aggregateData = newData.deepCopy();
            } else {
                aggregateData.setMaxColLen(Math.max(aggregateData.getMaxColLen(), newData.getMaxColLen()));
                aggregateData.setAvgColLen(Math.max(aggregateData.getAvgColLen(), newData.getAvgColLen()));
                aggregateData.setNumNulls(aggregateData.getNumNulls() + newData.getNumNulls());
            }
        }
        aggregateData.setNumDVs(ndvEstimator.estimateNumDistinctValues());
        columnStatisticsData.setStringStats(aggregateData);
    } else {
        StringColumnStatsData aggregateData = null;
        for (ColumnStatistics cs : css) {
            ColumnStatisticsObj cso = cs.getStatsObjIterator().next();
            StringColumnStatsData newData = cso.getStatsData().getStringStats();
            if (aggregateData == null) {
                aggregateData = newData.deepCopy();
            } else {
                aggregateData.setMaxColLen(Math.max(aggregateData.getMaxColLen(), newData.getMaxColLen()));
                aggregateData.setAvgColLen(Math.max(aggregateData.getAvgColLen(), newData.getAvgColLen()));
                aggregateData.setNumNulls(aggregateData.getNumNulls() + newData.getNumNulls());
                aggregateData.setNumDVs(Math.max(aggregateData.getNumDVs(), newData.getNumDVs()));
            }
        }
        columnStatisticsData.setStringStats(aggregateData);
    }
    statsObj.setStatsData(columnStatisticsData);
    return statsObj;
}
Also used: ColumnStatistics(org.apache.hadoop.hive.metastore.api.ColumnStatistics) ColumnStatisticsObj(org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj) StringColumnStatsData(org.apache.hadoop.hive.metastore.api.StringColumnStatsData) NumDistinctValueEstimator(org.apache.hadoop.hive.metastore.NumDistinctValueEstimator) ColumnStatisticsData(org.apache.hadoop.hive.metastore.api.ColumnStatisticsData) MetaException(org.apache.hadoop.hive.metastore.api.MetaException)
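
When any partition lacks a bit vector, the fallback branch above can only take the maximum per-partition NDV, which underestimates whenever partitions hold disjoint values. The following hedged sketch contrasts the two strategies using only the estimator calls that appear in the method above; NdvSketch and its two helpers are illustrative, not a Hive API.

import java.util.List;
import org.apache.hadoop.hive.metastore.NumDistinctValueEstimator;
import org.apache.hadoop.hive.metastore.api.StringColumnStatsData;

// Illustrative helper class; not part of the Hive metastore.
final class NdvSketch {

    // Merge the FM-sketch bit vectors when every partition provides one;
    // the merged sketch approximates the NDV of the union of all partitions.
    static long ndvFromBitVectors(List<StringColumnStatsData> parts, int numBitVectors) {
        NumDistinctValueEstimator merged = new NumDistinctValueEstimator(numBitVectors);
        for (StringColumnStatsData p : parts) {
            merged.mergeEstimators(
                new NumDistinctValueEstimator(p.getBitVectors(), merged.getnumBitVectors()));
        }
        return merged.estimateNumDistinctValues();
    }

    // Fallback: the union NDV is at least the largest per-partition NDV,
    // so max() is a safe lower bound when sketches are missing.
    static long ndvFromMax(List<StringColumnStatsData> parts) {
        long ndv = 0;
        for (StringColumnStatsData p : parts) {
            ndv = Math.max(ndv, p.getNumDVs());
        }
        return ndv;
    }
}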

Example 63 with ColumnStatisticsData

Use of org.apache.hadoop.hive.metastore.api.ColumnStatisticsData in project hive by apache.

From the class TestHBaseStoreCached, method booleanTableStatistics:

// Due to the way our mock implementation works, we can only insert one column at a time, so
// we'll test each stat type separately.  We'll test them together in the integration tests.
@Test
public void booleanTableStatistics() throws Exception {
    long now = System.currentTimeMillis();
    String dbname = "default";
    String tableName = "statstable";
    String boolcol = "boolcol";
    int startTime = (int) (System.currentTimeMillis() / 1000);
    List<FieldSchema> cols = new ArrayList<FieldSchema>();
    cols.add(new FieldSchema(boolcol, "boolean", "nocomment"));
    SerDeInfo serde = new SerDeInfo("serde", "seriallib", null);
    StorageDescriptor sd = new StorageDescriptor(cols, "file:/tmp", "input", "output", false, 0, serde, null, null, emptyParameters);
    Table table = new Table(tableName, dbname, "me", startTime, startTime, 0, sd, null, emptyParameters, null, null, null);
    store.createTable(table);
    long trues = 37;
    long falses = 12;
    long booleanNulls = 2;
    ColumnStatistics stats = new ColumnStatistics();
    ColumnStatisticsDesc desc = new ColumnStatisticsDesc();
    desc.setLastAnalyzed(now);
    desc.setDbName(dbname);
    desc.setTableName(tableName);
    desc.setIsTblLevel(true);
    stats.setStatsDesc(desc);
    ColumnStatisticsObj obj = new ColumnStatisticsObj();
    obj.setColName(boolcol);
    obj.setColType("boolean");
    ColumnStatisticsData data = new ColumnStatisticsData();
    BooleanColumnStatsData boolData = new BooleanColumnStatsData();
    boolData.setNumTrues(trues);
    boolData.setNumFalses(falses);
    boolData.setNumNulls(booleanNulls);
    data.setBooleanStats(boolData);
    obj.setStatsData(data);
    stats.addToStatsObj(obj);
    store.updateTableColumnStatistics(stats);
    stats = store.getTableColumnStatistics(dbname, tableName, Arrays.asList(boolcol));
    Assert.assertEquals(now, stats.getStatsDesc().getLastAnalyzed());
    Assert.assertEquals(dbname, stats.getStatsDesc().getDbName());
    Assert.assertEquals(tableName, stats.getStatsDesc().getTableName());
    Assert.assertTrue(stats.getStatsDesc().isIsTblLevel());
    Assert.assertEquals(1, stats.getStatsObjSize());
    // read the stats back from the freshly retrieved object rather than the one we inserted
    ColumnStatisticsData colData = stats.getStatsObj().get(0).getStatsData();
    Assert.assertEquals(ColumnStatisticsData._Fields.BOOLEAN_STATS, colData.getSetField());
    boolData = colData.getBooleanStats();
    Assert.assertEquals(trues, boolData.getNumTrues());
    Assert.assertEquals(falses, boolData.getNumFalses());
    Assert.assertEquals(booleanNulls, boolData.getNumNulls());
}
Also used: ColumnStatistics(org.apache.hadoop.hive.metastore.api.ColumnStatistics) BooleanColumnStatsData(org.apache.hadoop.hive.metastore.api.BooleanColumnStatsData) Table(org.apache.hadoop.hive.metastore.api.Table) FieldSchema(org.apache.hadoop.hive.metastore.api.FieldSchema) SerDeInfo(org.apache.hadoop.hive.metastore.api.SerDeInfo) ArrayList(java.util.ArrayList) StorageDescriptor(org.apache.hadoop.hive.metastore.api.StorageDescriptor) ColumnStatisticsObj(org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj) ColumnStatisticsDesc(org.apache.hadoop.hive.metastore.api.ColumnStatisticsDesc) ColumnStatisticsData(org.apache.hadoop.hive.metastore.api.ColumnStatisticsData) Test(org.junit.Test)
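
ColumnStatisticsData is a Thrift union, which is why the test asserts on getSetField() before calling getBooleanStats(). A minimal sketch of dispatching on the union tag follows; StatsDispatchSketch is an illustrative helper, and only accessors already shown on this page are assumed.

import org.apache.hadoop.hive.metastore.api.ColumnStatisticsData;

// Illustrative helper: branch on the union tag before touching a typed accessor.
final class StatsDispatchSketch {

    static String describe(ColumnStatisticsData data) {
        switch (data.getSetField()) {
            case BOOLEAN_STATS:
                return "trues=" + data.getBooleanStats().getNumTrues()
                        + " falses=" + data.getBooleanStats().getNumFalses();
            case LONG_STATS:
                return "ndv=" + data.getLongStats().getNumDVs();
            case STRING_STATS:
                return "maxLen=" + data.getStringStats().getMaxColLen();
            default:
                return "unhandled stats type: " + data.getSetField();
        }
    }
}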

Example 64 with ColumnStatisticsData

Use of org.apache.hadoop.hive.metastore.api.ColumnStatisticsData in project hive by apache.

From the class TestHBaseStoreBitVector, method mockStringStats:

private static ColumnStatisticsObj mockStringStats(int i) {
    long maxLen = 1234 + 10 * i;
    double avgLen = 32.3 + i;
    long nulls = 987 + 10 * i;
    long dVs = 906 + i;
    String bitVectors = "{0, 1, 2, 3, 4, 5, 6, 7, 8}{0, 1, 3, 4, 5, 6, 7, 8}";
    ColumnStatisticsObj colStatsObj = new ColumnStatisticsObj();
    colStatsObj.setColName(STRING_COL);
    colStatsObj.setColType(STRING_TYPE);
    ColumnStatisticsData data = new ColumnStatisticsData();
    StringColumnStatsData stringData = new StringColumnStatsData();
    stringData.setMaxColLen(maxLen);
    stringData.setAvgColLen(avgLen);
    stringData.setNumNulls(nulls);
    stringData.setNumDVs(dVs);
    stringData.setBitVectors(bitVectors);
    data.setStringStats(stringData);
    colStatsObj.setStatsData(data);
    return colStatsObj;
}
Also used: ColumnStatisticsObj(org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj) StringColumnStatsData(org.apache.hadoop.hive.metastore.api.StringColumnStatsData) ColumnStatisticsData(org.apache.hadoop.hive.metastore.api.ColumnStatisticsData)
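
The same builder pattern yields mocks for the other stat types. Here is a hedged long-column analogue mirroring mockStringStats; MockLongStatsSketch, LONG_COL, LONG_TYPE, and every sample value are assumptions for illustration, not taken from the Hive tests.

import org.apache.hadoop.hive.metastore.api.ColumnStatisticsData;
import org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj;
import org.apache.hadoop.hive.metastore.api.LongColumnStatsData;

// Mirrors mockStringStats above for a bigint column; all names and values
// are illustrative.
final class MockLongStatsSketch {

    static final String LONG_COL = "longcol";
    static final String LONG_TYPE = "bigint";

    static ColumnStatisticsObj mockLongStats(int i) {
        ColumnStatisticsObj colStatsObj = new ColumnStatisticsObj();
        colStatsObj.setColName(LONG_COL);
        colStatsObj.setColType(LONG_TYPE);
        LongColumnStatsData longData = new LongColumnStatsData();
        // vary the values per call, like the string mock above
        longData.setLowValue(-100 + i);
        longData.setHighValue(100000 + i);
        longData.setNumNulls(50 + 10 * i);
        longData.setNumDVs(823 + i);
        longData.setBitVectors("{0, 1, 2}{1, 2, 3}");
        ColumnStatisticsData data = new ColumnStatisticsData();
        data.setLongStats(longData);
        colStatsObj.setStatsData(data);
        return colStatsObj;
    }
}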

Example 65 with ColumnStatisticsData

Use of org.apache.hadoop.hive.metastore.api.ColumnStatisticsData in project hive by apache.

From the class TestHBaseStoreBitVector, method longPartitionStatistics:

@Test
public void longPartitionStatistics() throws Exception {
    createMockTableAndPartition(INT_TYPE, INT_VAL);
    // Add partition stats for: LONG_COL and partition: {PART_KEY, INT_VAL} to DB
    // Because of the way our mock implementation works, we must not create the
    // table before we set statistics on it.
    ColumnStatistics stats = new ColumnStatistics();
    // Get a default ColumnStatisticsDesc for partition level stats
    ColumnStatisticsDesc desc = getMockPartColStatsDesc(PART_KEY, INT_VAL);
    stats.setStatsDesc(desc);
    // Get one of the pre-created ColumnStatisticsObj
    ColumnStatisticsObj obj = longColStatsObjs.get(0);
    LongColumnStatsData longData = obj.getStatsData().getLongStats();
    // Add to DB
    stats.addToStatsObj(obj);
    List<String> parVals = new ArrayList<String>();
    parVals.add(INT_VAL);
    store.updatePartitionColumnStatistics(stats, parVals);
    // Get from DB
    List<String> partNames = new ArrayList<String>();
    partNames.add(desc.getPartName());
    List<String> colNames = new ArrayList<String>();
    colNames.add(obj.getColName());
    List<ColumnStatistics> statsFromDB = store.getPartitionColumnStatistics(DB, TBL, partNames, colNames);
    // Compare ColumnStatisticsDesc
    Assert.assertEquals(1, statsFromDB.size());
    Assert.assertEquals(desc.getLastAnalyzed(), statsFromDB.get(0).getStatsDesc().getLastAnalyzed());
    Assert.assertEquals(DB, statsFromDB.get(0).getStatsDesc().getDbName());
    Assert.assertEquals(TBL, statsFromDB.get(0).getStatsDesc().getTableName());
    Assert.assertFalse(statsFromDB.get(0).getStatsDesc().isIsTblLevel());
    // Compare ColumnStatisticsObj
    Assert.assertEquals(1, statsFromDB.get(0).getStatsObjSize());
    ColumnStatisticsObj objFromDB = statsFromDB.get(0).getStatsObj().get(0);
    ColumnStatisticsData dataFromDB = objFromDB.getStatsData();
    // Compare ColumnStatisticsData
    Assert.assertEquals(ColumnStatisticsData._Fields.LONG_STATS, dataFromDB.getSetField());
    // Compare LongColumnStatsData
    LongColumnStatsData longDataFromDB = dataFromDB.getLongStats();
    Assert.assertEquals(longData.getHighValue(), longDataFromDB.getHighValue());
    Assert.assertEquals(longData.getLowValue(), longDataFromDB.getLowValue());
    Assert.assertEquals(longData.getNumNulls(), longDataFromDB.getNumNulls());
    Assert.assertEquals(longData.getNumDVs(), longDataFromDB.getNumDVs());
    Assert.assertEquals(longData.getBitVectors(), longDataFromDB.getBitVectors());
}
Also used: ColumnStatistics(org.apache.hadoop.hive.metastore.api.ColumnStatistics) ColumnStatisticsObj(org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj) ColumnStatisticsDesc(org.apache.hadoop.hive.metastore.api.ColumnStatisticsDesc) ArrayList(java.util.ArrayList) LongColumnStatsData(org.apache.hadoop.hive.metastore.api.LongColumnStatsData) ColumnStatisticsData(org.apache.hadoop.hive.metastore.api.ColumnStatisticsData) Test(org.junit.Test)
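
getMockPartColStatsDesc is referenced above but not shown in this excerpt. A plausible sketch of such a partition-level descriptor builder follows, assembled only from setters used elsewhere on this page; the helper name and field values are assumptions, and the real test helper may differ.

import org.apache.hadoop.hive.metastore.api.ColumnStatisticsDesc;

// Hypothetical reconstruction of the helper referenced above.
final class PartDescSketch {

    static ColumnStatisticsDesc mockPartColStatsDesc(String db, String tbl,
                                                     String partKey, String partVal) {
        ColumnStatisticsDesc desc = new ColumnStatisticsDesc();
        desc.setDbName(db);
        desc.setTableName(tbl);
        // partition-level, not table-level, stats
        desc.setIsTblLevel(false);
        // Hive partition name format, e.g. "pk=val"
        desc.setPartName(partKey + "=" + partVal);
        desc.setLastAnalyzed(System.currentTimeMillis());
        return desc;
    }
}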

Aggregations

ColumnStatisticsData (org.apache.hadoop.hive.metastore.api.ColumnStatisticsData): 108 usages
ColumnStatisticsObj (org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj): 95 usages
ColumnStatistics (org.apache.hadoop.hive.metastore.api.ColumnStatistics): 62 usages
ColumnStatisticsDesc (org.apache.hadoop.hive.metastore.api.ColumnStatisticsDesc): 56 usages
Test (org.junit.Test): 53 usages
ArrayList (java.util.ArrayList): 47 usages
LongColumnStatsData (org.apache.hadoop.hive.metastore.api.LongColumnStatsData): 35 usages
FieldSchema (org.apache.hadoop.hive.metastore.api.FieldSchema): 34 usages
Table (org.apache.hadoop.hive.metastore.api.Table): 33 usages
StorageDescriptor (org.apache.hadoop.hive.metastore.api.StorageDescriptor): 32 usages
SerDeInfo (org.apache.hadoop.hive.metastore.api.SerDeInfo): 31 usages
BooleanColumnStatsData (org.apache.hadoop.hive.metastore.api.BooleanColumnStatsData): 30 usages
Partition (org.apache.hadoop.hive.metastore.api.Partition): 30 usages
AggrStats (org.apache.hadoop.hive.metastore.api.AggrStats): 29 usages
DoubleColumnStatsData (org.apache.hadoop.hive.metastore.api.DoubleColumnStatsData): 27 usages
StringColumnStatsData (org.apache.hadoop.hive.metastore.api.StringColumnStatsData): 25 usages
DecimalColumnStatsData (org.apache.hadoop.hive.metastore.api.DecimalColumnStatsData): 23 usages
BinaryColumnStatsData (org.apache.hadoop.hive.metastore.api.BinaryColumnStatsData): 22 usages
HashMap (java.util.HashMap): 20 usages
List (java.util.List): 18 usages