
Example 26 with LongColumnStatsData

use of org.apache.hadoop.hive.metastore.api.LongColumnStatsData in project hive by apache.

the class HBaseUtils method protoBufStatsForOneColumn.

private static HbaseMetastoreProto.ColumnStats protoBufStatsForOneColumn(ColumnStatistics partitionColumnStats, ColumnStatisticsObj colStats) throws IOException {
    HbaseMetastoreProto.ColumnStats.Builder builder = HbaseMetastoreProto.ColumnStats.newBuilder();
    if (partitionColumnStats != null) {
        builder.setLastAnalyzed(partitionColumnStats.getStatsDesc().getLastAnalyzed());
    }
    assert colStats.getColType() != null;
    builder.setColumnType(colStats.getColType());
    assert colStats.getColName() != null;
    builder.setColumnName(colStats.getColName());
    ColumnStatisticsData colData = colStats.getStatsData();
    switch(colData.getSetField()) {
        case BOOLEAN_STATS:
            BooleanColumnStatsData boolData = colData.getBooleanStats();
            builder.setNumNulls(boolData.getNumNulls());
            builder.setBoolStats(HbaseMetastoreProto.ColumnStats.BooleanStats.newBuilder().setNumTrues(boolData.getNumTrues()).setNumFalses(boolData.getNumFalses()).build());
            break;
        case LONG_STATS:
            LongColumnStatsData longData = colData.getLongStats();
            builder.setNumNulls(longData.getNumNulls());
            builder.setNumDistinctValues(longData.getNumDVs());
            if (longData.isSetBitVectors()) {
                builder.setBitVectors(longData.getBitVectors());
            }
            builder.setLongStats(HbaseMetastoreProto.ColumnStats.LongStats.newBuilder().setLowValue(longData.getLowValue()).setHighValue(longData.getHighValue()).build());
            break;
        case DOUBLE_STATS:
            DoubleColumnStatsData doubleData = colData.getDoubleStats();
            builder.setNumNulls(doubleData.getNumNulls());
            builder.setNumDistinctValues(doubleData.getNumDVs());
            if (doubleData.isSetBitVectors()) {
                builder.setBitVectors(doubleData.getBitVectors());
            }
            builder.setDoubleStats(HbaseMetastoreProto.ColumnStats.DoubleStats.newBuilder().setLowValue(doubleData.getLowValue()).setHighValue(doubleData.getHighValue()).build());
            break;
        case STRING_STATS:
            StringColumnStatsData stringData = colData.getStringStats();
            builder.setNumNulls(stringData.getNumNulls());
            builder.setNumDistinctValues(stringData.getNumDVs());
            if (stringData.isSetBitVectors()) {
                builder.setBitVectors(stringData.getBitVectors());
            }
            builder.setStringStats(HbaseMetastoreProto.ColumnStats.StringStats.newBuilder().setMaxColLength(stringData.getMaxColLen()).setAvgColLength(stringData.getAvgColLen()).build());
            break;
        case BINARY_STATS:
            BinaryColumnStatsData binaryData = colData.getBinaryStats();
            builder.setNumNulls(binaryData.getNumNulls());
            builder.setBinaryStats(HbaseMetastoreProto.ColumnStats.StringStats.newBuilder().setMaxColLength(binaryData.getMaxColLen()).setAvgColLength(binaryData.getAvgColLen()).build());
            break;
        case DECIMAL_STATS:
            DecimalColumnStatsData decimalData = colData.getDecimalStats();
            builder.setNumNulls(decimalData.getNumNulls());
            builder.setNumDistinctValues(decimalData.getNumDVs());
            if (decimalData.isSetBitVectors()) {
                builder.setBitVectors(decimalData.getBitVectors());
            }
            if (decimalData.getLowValue() != null && decimalData.getHighValue() != null) {
                builder.setDecimalStats(HbaseMetastoreProto.ColumnStats.DecimalStats.newBuilder()
                    .setLowValue(HbaseMetastoreProto.ColumnStats.DecimalStats.Decimal.newBuilder()
                        .setUnscaled(ByteString.copyFrom(decimalData.getLowValue().getUnscaled()))
                        .setScale(decimalData.getLowValue().getScale())
                        .build())
                    .setHighValue(HbaseMetastoreProto.ColumnStats.DecimalStats.Decimal.newBuilder()
                        .setUnscaled(ByteString.copyFrom(decimalData.getHighValue().getUnscaled()))
                        .setScale(decimalData.getHighValue().getScale())
                        .build())
                    .build());
            } else {
                builder.setDecimalStats(HbaseMetastoreProto.ColumnStats.DecimalStats.newBuilder().clear().build());
            }
            break;
        default:
            throw new RuntimeException("Woh, bad.  Unknown stats type!");
    }
    return builder.build();
}
Also used: BooleanColumnStatsData (org.apache.hadoop.hive.metastore.api.BooleanColumnStatsData), DoubleColumnStatsData (org.apache.hadoop.hive.metastore.api.DoubleColumnStatsData), DecimalColumnStatsData (org.apache.hadoop.hive.metastore.api.DecimalColumnStatsData), StringColumnStatsData (org.apache.hadoop.hive.metastore.api.StringColumnStatsData), LongColumnStatsData (org.apache.hadoop.hive.metastore.api.LongColumnStatsData), ColumnStatisticsData (org.apache.hadoop.hive.metastore.api.ColumnStatisticsData), BinaryColumnStatsData (org.apache.hadoop.hive.metastore.api.BinaryColumnStatsData)
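
For orientation, here is a minimal sketch, not taken from the Hive sources, of how a caller might assemble the ColumnStatisticsObj that protoBufStatsForOneColumn serializes. The Thrift-generated setters (setLongStats, setNumNulls, and so on) exist as shown; the column name, type, and values are invented for illustration.

import org.apache.hadoop.hive.metastore.api.ColumnStatisticsData;
import org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj;
import org.apache.hadoop.hive.metastore.api.LongColumnStatsData;

public class LongStatsInputSketch {
    public static ColumnStatisticsObj buildLongStatsObj() {
        // Populate the long-typed member of the ColumnStatisticsData union.
        LongColumnStatsData longStats = new LongColumnStatsData();
        longStats.setLowValue(1L);
        longStats.setHighValue(100L);
        longStats.setNumNulls(3L);
        longStats.setNumDVs(42L);
        // bitVectors is optional; protoBufStatsForOneColumn copies it only when set.

        ColumnStatisticsData data = new ColumnStatisticsData();
        data.setLongStats(longStats);
        // Column name and type must be non-null, as the asserts above require.
        return new ColumnStatisticsObj("id", "bigint", data);
    }
}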

Example 27 with LongColumnStatsData

use of org.apache.hadoop.hive.metastore.api.LongColumnStatsData in project hive by apache.

the class LongColumnStatsAggregator method extrapolate.

@Override
public void extrapolate(ColumnStatisticsData extrapolateData, int numParts, int numPartsWithStats, Map<String, Double> adjustedIndexMap, Map<String, ColumnStatisticsData> adjustedStatsMap, double densityAvg) {
    int rightBorderInd = numParts;
    LongColumnStatsData extrapolateLongData = new LongColumnStatsData();
    Map<String, LongColumnStatsData> extractedAdjustedStatsMap = new HashMap<>();
    for (Map.Entry<String, ColumnStatisticsData> entry : adjustedStatsMap.entrySet()) {
        extractedAdjustedStatsMap.put(entry.getKey(), entry.getValue().getLongStats());
    }
    List<Map.Entry<String, LongColumnStatsData>> list = new LinkedList<Map.Entry<String, LongColumnStatsData>>(extractedAdjustedStatsMap.entrySet());
    // get the lowValue
    Collections.sort(list, new Comparator<Map.Entry<String, LongColumnStatsData>>() {

        public int compare(Map.Entry<String, LongColumnStatsData> o1, Map.Entry<String, LongColumnStatsData> o2) {
            // Long.compare returns 0 on ties, keeping the comparator contract.
            return Long.compare(o1.getValue().getLowValue(), o2.getValue().getLowValue());
        }
    });
    double minInd = adjustedIndexMap.get(list.get(0).getKey());
    double maxInd = adjustedIndexMap.get(list.get(list.size() - 1).getKey());
    long lowValue = 0;
    long min = list.get(0).getValue().getLowValue();
    long max = list.get(list.size() - 1).getValue().getLowValue();
    if (minInd == maxInd) {
        lowValue = min;
    } else if (minInd < maxInd) {
        // left border is the min
        lowValue = (long) (max - (max - min) * maxInd / (maxInd - minInd));
    } else {
        // right border is the min
        lowValue = (long) (max - (max - min) * (rightBorderInd - maxInd) / (minInd - maxInd));
    }
    // get the highValue
    Collections.sort(list, new Comparator<Map.Entry<String, LongColumnStatsData>>() {

        public int compare(Map.Entry<String, LongColumnStatsData> o1, Map.Entry<String, LongColumnStatsData> o2) {
            // Long.compare returns 0 on ties, keeping the comparator contract.
            return Long.compare(o1.getValue().getHighValue(), o2.getValue().getHighValue());
        }
    });
    minInd = adjustedIndexMap.get(list.get(0).getKey());
    maxInd = adjustedIndexMap.get(list.get(list.size() - 1).getKey());
    long highValue = 0;
    min = list.get(0).getValue().getHighValue();
    max = list.get(list.size() - 1).getValue().getHighValue();
    if (minInd == maxInd) {
        highValue = min;
    } else if (minInd < maxInd) {
        // right border is the max
        highValue = (long) (min + (max - min) * (rightBorderInd - minInd) / (maxInd - minInd));
    } else {
        // left border is the max
        highValue = (long) (min + (max - min) * minInd / (minInd - maxInd));
    }
    // get the #nulls
    long numNulls = 0;
    for (Map.Entry<String, LongColumnStatsData> entry : extractedAdjustedStatsMap.entrySet()) {
        numNulls += entry.getValue().getNumNulls();
    }
    // scale up numNulls from the partitions that have stats to all numParts partitions
    numNulls = numNulls * numParts / numPartsWithStats;
    // get the ndv
    long ndv = 0;
    Collections.sort(list, new Comparator<Map.Entry<String, LongColumnStatsData>>() {

        public int compare(Map.Entry<String, LongColumnStatsData> o1, Map.Entry<String, LongColumnStatsData> o2) {
            // Long.compare returns 0 on ties, keeping the comparator contract.
            return Long.compare(o1.getValue().getNumDVs(), o2.getValue().getNumDVs());
        }
    });
    // The aggregate NDV can be no lower than the largest per-partition NDV...
    long lowerBound = list.get(list.size() - 1).getValue().getNumDVs();
    // ...and no higher than the sum of all per-partition NDVs.
    long higherBound = 0;
    for (Map.Entry<String, LongColumnStatsData> entry : list) {
        higherBound += entry.getValue().getNumDVs();
    }
    if (useDensityFunctionForNDVEstimation && densityAvg != 0.0) {
        ndv = (long) ((highValue - lowValue) / densityAvg);
        if (ndv < lowerBound) {
            ndv = lowerBound;
        } else if (ndv > higherBound) {
            ndv = higherBound;
        }
    } else {
        minInd = adjustedIndexMap.get(list.get(0).getKey());
        maxInd = adjustedIndexMap.get(list.get(list.size() - 1).getKey());
        min = list.get(0).getValue().getNumDVs();
        max = list.get(list.size() - 1).getValue().getNumDVs();
        if (minInd == maxInd) {
            ndv = min;
        } else if (minInd < maxInd) {
            // right border is the max
            ndv = (long) (min + (max - min) * (rightBorderInd - minInd) / (maxInd - minInd));
        } else {
            // left border is the max
            ndv = (long) (min + (max - min) * minInd / (minInd - maxInd));
        }
    }
    extrapolateLongData.setLowValue(lowValue);
    extrapolateLongData.setHighValue(highValue);
    extrapolateLongData.setNumNulls(numNulls);
    extrapolateLongData.setNumDVs(ndv);
    extrapolateData.setLongStats(extrapolateLongData);
}
Also used: HashMap (java.util.HashMap), LongColumnStatsData (org.apache.hadoop.hive.metastore.api.LongColumnStatsData), LinkedList (java.util.LinkedList), Map (java.util.Map), ColumnStatisticsData (org.apache.hadoop.hive.metastore.api.ColumnStatisticsData)
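
To make the low-value extrapolation concrete, here is a small worked example with invented numbers (they come from no Hive test); it reproduces the minInd < maxInd branch of extrapolate above.

public class ExtrapolateLowValueSketch {
    public static void main(String[] args) {
        // Hypothetical inputs: numParts = 4, so rightBorderInd = 4 (only used in the
        // minInd > maxInd branch). Two partitions carry stats: the one with the
        // smallest lowValue sits at adjusted index 1.0, the largest at index 3.0.
        int rightBorderInd = 4;
        double minInd = 1.0, maxInd = 3.0;
        long min = 10L, max = 30L;
        // Since minInd < maxInd, the minimum is extrapolated to the left border (index 0):
        long lowValue = (long) (max - (max - min) * maxInd / (maxInd - minInd));
        // lowValue = 30 - 20 * 3.0 / 2.0 = 0: the line through (1.0, 10) and
        // (3.0, 30), evaluated at index 0.
        System.out.println(lowValue); // prints 0
    }
}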

Example 28 with LongColumnStatsData

use of org.apache.hadoop.hive.metastore.api.LongColumnStatsData in project hive by apache.

the class LongColumnStatsMerger method merge.

@Override
public void merge(ColumnStatisticsObj aggregateColStats, ColumnStatisticsObj newColStats) {
    LongColumnStatsData aggregateData = aggregateColStats.getStatsData().getLongStats();
    LongColumnStatsData newData = newColStats.getStatsData().getLongStats();
    aggregateData.setLowValue(Math.min(aggregateData.getLowValue(), newData.getLowValue()));
    aggregateData.setHighValue(Math.max(aggregateData.getHighValue(), newData.getHighValue()));
    aggregateData.setNumNulls(aggregateData.getNumNulls() + newData.getNumNulls());
    if (ndvEstimator == null || !newData.isSetBitVectors() || newData.getBitVectors().length() == 0) {
        aggregateData.setNumDVs(Math.max(aggregateData.getNumDVs(), newData.getNumDVs()));
    } else {
        ndvEstimator.mergeEstimators(new NumDistinctValueEstimator(aggregateData.getBitVectors(), ndvEstimator.getnumBitVectors()));
        ndvEstimator.mergeEstimators(new NumDistinctValueEstimator(newData.getBitVectors(), ndvEstimator.getnumBitVectors()));
        long ndv = ndvEstimator.estimateNumDistinctValues();
        LOG.debug("Use bitvector to merge column " + aggregateColStats.getColName() + "'s ndvs of " + aggregateData.getNumDVs() + " and " + newData.getNumDVs() + " to be " + ndv);
        aggregateData.setNumDVs(ndv);
        aggregateData.setBitVectors(ndvEstimator.serialize().toString());
    }
}
Also used: LongColumnStatsData (org.apache.hadoop.hive.metastore.api.LongColumnStatsData), NumDistinctValueEstimator (org.apache.hadoop.hive.metastore.NumDistinctValueEstimator)
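
As a quick illustration of the fallback branch (no estimator or no bit vectors), here is a self-contained sketch with invented values; the field-wise rules it applies (min of lows, max of highs, sum of nulls, max of NDVs) are exactly the ones in merge above.

import org.apache.hadoop.hive.metastore.api.LongColumnStatsData;

public class LongStatsMergeFallbackSketch {
    public static void main(String[] args) {
        // The aggregate so far, and an incoming partition's stats (values invented).
        LongColumnStatsData aggregateData = new LongColumnStatsData();
        aggregateData.setLowValue(0L);
        aggregateData.setHighValue(90L);
        aggregateData.setNumNulls(4L);
        aggregateData.setNumDVs(50L);

        LongColumnStatsData newData = new LongColumnStatsData();
        newData.setLowValue(-5L);
        newData.setHighValue(80L);
        newData.setNumNulls(1L);
        newData.setNumDVs(70L);

        // Without bit vectors the merge is purely field-wise:
        aggregateData.setLowValue(Math.min(aggregateData.getLowValue(), newData.getLowValue()));    // -5
        aggregateData.setHighValue(Math.max(aggregateData.getHighValue(), newData.getHighValue())); // 90
        aggregateData.setNumNulls(aggregateData.getNumNulls() + newData.getNumNulls());             // 5
        aggregateData.setNumDVs(Math.max(aggregateData.getNumDVs(), newData.getNumDVs()));          // 70
    }
}

Taking the max of the NDVs is deliberately conservative: per-partition NDVs cannot simply be summed without double counting, which is exactly why the bit-vector path exists.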

Example 29 with LongColumnStatsData

use of org.apache.hadoop.hive.metastore.api.LongColumnStatsData in project hive by apache.

the class TestHBaseStoreBitVector method longPartitionStatistics.

@Test
public void longPartitionStatistics() throws Exception {
    createMockTableAndPartition(INT_TYPE, INT_VAL);
    // Add partition stats for: LONG_COL and partition: {PART_KEY, INT_VAL} to DB
    // Because of the way our mock implementation works we actually need to not create the table
    // before we set statistics on it.
    ColumnStatistics stats = new ColumnStatistics();
    // Get a default ColumnStatisticsDesc for partition level stats
    ColumnStatisticsDesc desc = getMockPartColStatsDesc(PART_KEY, INT_VAL);
    stats.setStatsDesc(desc);
    // Get one of the pre-created ColumnStatisticsObj
    ColumnStatisticsObj obj = longColStatsObjs.get(0);
    LongColumnStatsData longData = obj.getStatsData().getLongStats();
    // Add to DB
    stats.addToStatsObj(obj);
    List<String> parVals = new ArrayList<String>();
    parVals.add(INT_VAL);
    store.updatePartitionColumnStatistics(stats, parVals);
    // Get from DB
    List<String> partNames = new ArrayList<String>();
    partNames.add(desc.getPartName());
    List<String> colNames = new ArrayList<String>();
    colNames.add(obj.getColName());
    List<ColumnStatistics> statsFromDB = store.getPartitionColumnStatistics(DB, TBL, partNames, colNames);
    // Compare ColumnStatisticsDesc
    Assert.assertEquals(1, statsFromDB.size());
    Assert.assertEquals(desc.getLastAnalyzed(), statsFromDB.get(0).getStatsDesc().getLastAnalyzed());
    Assert.assertEquals(DB, statsFromDB.get(0).getStatsDesc().getDbName());
    Assert.assertEquals(TBL, statsFromDB.get(0).getStatsDesc().getTableName());
    Assert.assertFalse(statsFromDB.get(0).getStatsDesc().isIsTblLevel());
    // Compare ColumnStatisticsObj
    Assert.assertEquals(1, statsFromDB.get(0).getStatsObjSize());
    ColumnStatisticsObj objFromDB = statsFromDB.get(0).getStatsObj().get(0);
    ColumnStatisticsData dataFromDB = objFromDB.getStatsData();
    // Compare ColumnStatisticsData
    Assert.assertEquals(ColumnStatisticsData._Fields.LONG_STATS, dataFromDB.getSetField());
    // Compare LongColumnStatsData
    LongColumnStatsData longDataFromDB = dataFromDB.getLongStats();
    Assert.assertEquals(longData.getHighValue(), longDataFromDB.getHighValue());
    Assert.assertEquals(longData.getLowValue(), longDataFromDB.getLowValue());
    Assert.assertEquals(longData.getNumNulls(), longDataFromDB.getNumNulls());
    Assert.assertEquals(longData.getNumDVs(), longDataFromDB.getNumDVs());
    Assert.assertEquals(longData.getBitVectors(), longDataFromDB.getBitVectors());
}
Also used: ColumnStatistics (org.apache.hadoop.hive.metastore.api.ColumnStatistics), ColumnStatisticsObj (org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj), ColumnStatisticsDesc (org.apache.hadoop.hive.metastore.api.ColumnStatisticsDesc), ArrayList (java.util.ArrayList), LongColumnStatsData (org.apache.hadoop.hive.metastore.api.LongColumnStatsData), ColumnStatisticsData (org.apache.hadoop.hive.metastore.api.ColumnStatisticsData), Test (org.junit.Test)
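
The fixtures this test leans on (longColStatsObjs, getMockPartColStatsDesc, and the DB/TBL/PART_KEY constants) live in the test class and are not shown here. As a rough sketch, one such fixture object could be assembled as below; the values are invented, and the real bit-vector string is produced by serializing a NumDistinctValueEstimator, whose format is internal.

import org.apache.hadoop.hive.metastore.api.ColumnStatisticsData;
import org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj;
import org.apache.hadoop.hive.metastore.api.LongColumnStatsData;

public class LongColStatsFixtureSketch {
    public static ColumnStatisticsObj buildFixture() {
        LongColumnStatsData longData = new LongColumnStatsData();
        longData.setLowValue(-100L);
        longData.setHighValue(100L);
        longData.setNumNulls(1L);
        longData.setNumDVs(150L);
        // Placeholder only; the real test serializes an estimator into this field.
        longData.setBitVectors("opaque-estimator-state");

        ColumnStatisticsData data = new ColumnStatisticsData();
        data.setLongStats(longData);
        // "long_col" and "bigint" stand in for the test's own column constants.
        return new ColumnStatisticsObj("long_col", "bigint", data);
    }
}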

Example 30 with LongColumnStatsData

use of org.apache.hadoop.hive.metastore.api.LongColumnStatsData in project hive by apache.

the class MetaDataFormatUtils method formatWithIndentation.

private static void formatWithIndentation(String colName, String colType, String colComment, StringBuilder tableInfo, List<ColumnStatisticsObj> colStats) {
    tableInfo.append(String.format("%-" + ALIGNMENT + "s", colName)).append(FIELD_DELIM);
    tableInfo.append(String.format("%-" + ALIGNMENT + "s", colType)).append(FIELD_DELIM);
    if (colStats != null) {
        ColumnStatisticsObj cso = getColumnStatisticsObject(colName, colType, colStats);
        if (cso != null) {
            ColumnStatisticsData csd = cso.getStatsData();
            if (csd.isSetBinaryStats()) {
                BinaryColumnStatsData bcsd = csd.getBinaryStats();
                appendColumnStats(tableInfo, "", "", bcsd.getNumNulls(), "", bcsd.getAvgColLen(), bcsd.getMaxColLen(), "", "");
            } else if (csd.isSetStringStats()) {
                StringColumnStatsData scsd = csd.getStringStats();
                appendColumnStats(tableInfo, "", "", scsd.getNumNulls(), scsd.getNumDVs(), scsd.getAvgColLen(), scsd.getMaxColLen(), "", "");
            } else if (csd.isSetBooleanStats()) {
                BooleanColumnStatsData bcsd = csd.getBooleanStats();
                appendColumnStats(tableInfo, "", "", bcsd.getNumNulls(), "", "", "", bcsd.getNumTrues(), bcsd.getNumFalses());
            } else if (csd.isSetDecimalStats()) {
                DecimalColumnStatsData dcsd = csd.getDecimalStats();
                appendColumnStats(tableInfo, convertToString(dcsd.getLowValue()), convertToString(dcsd.getHighValue()), dcsd.getNumNulls(), dcsd.getNumDVs(), "", "", "", "");
            } else if (csd.isSetDoubleStats()) {
                DoubleColumnStatsData dcsd = csd.getDoubleStats();
                appendColumnStats(tableInfo, dcsd.getLowValue(), dcsd.getHighValue(), dcsd.getNumNulls(), dcsd.getNumDVs(), "", "", "", "");
            } else if (csd.isSetLongStats()) {
                LongColumnStatsData lcsd = csd.getLongStats();
                appendColumnStats(tableInfo, lcsd.getLowValue(), lcsd.getHighValue(), lcsd.getNumNulls(), lcsd.getNumDVs(), "", "", "", "");
            } else if (csd.isSetDateStats()) {
                DateColumnStatsData dcsd = csd.getDateStats();
                appendColumnStats(tableInfo, convertToString(dcsd.getLowValue()), convertToString(dcsd.getHighValue()), dcsd.getNumNulls(), dcsd.getNumDVs(), "", "", "", "");
            }
        } else {
            appendColumnStats(tableInfo, "", "", "", "", "", "", "", "");
        }
    }
    int colNameLength = ALIGNMENT > colName.length() ? ALIGNMENT : colName.length();
    int colTypeLength = ALIGNMENT > colType.length() ? ALIGNMENT : colType.length();
    indentMultilineValue(colComment, tableInfo, new int[] { colNameLength, colTypeLength }, false);
}
Also used: BooleanColumnStatsData (org.apache.hadoop.hive.metastore.api.BooleanColumnStatsData), ColumnStatisticsObj (org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj), DecimalColumnStatsData (org.apache.hadoop.hive.metastore.api.DecimalColumnStatsData), DoubleColumnStatsData (org.apache.hadoop.hive.metastore.api.DoubleColumnStatsData), DateColumnStatsData (org.apache.hadoop.hive.metastore.api.DateColumnStatsData), StringColumnStatsData (org.apache.hadoop.hive.metastore.api.StringColumnStatsData), LongColumnStatsData (org.apache.hadoop.hive.metastore.api.LongColumnStatsData), ColumnStatisticsData (org.apache.hadoop.hive.metastore.api.ColumnStatisticsData), BinaryColumnStatsData (org.apache.hadoop.hive.metastore.api.BinaryColumnStatsData)
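
The column alignment above is plain String.format padding. A standalone illustration follows; ALIGNMENT and FIELD_DELIM are assumed values here, while the real constants are defined in MetaDataFormatUtils.

public class PaddingSketch {
    // Assumed for illustration; MetaDataFormatUtils defines the real constants.
    private static final int ALIGNMENT = 20;
    private static final String FIELD_DELIM = "\t";

    public static void main(String[] args) {
        StringBuilder tableInfo = new StringBuilder();
        // "%-20s" left-justifies each cell and pads it to ALIGNMENT characters.
        tableInfo.append(String.format("%-" + ALIGNMENT + "s", "id")).append(FIELD_DELIM);
        tableInfo.append(String.format("%-" + ALIGNMENT + "s", "bigint")).append(FIELD_DELIM);
        System.out.println(tableInfo);
    }
}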

Aggregations

LongColumnStatsData (org.apache.hadoop.hive.metastore.api.LongColumnStatsData): 54 usages
ColumnStatisticsData (org.apache.hadoop.hive.metastore.api.ColumnStatisticsData): 39 usages
ColumnStatisticsObj (org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj): 35 usages
StringColumnStatsData (org.apache.hadoop.hive.metastore.api.StringColumnStatsData): 23 usages
BinaryColumnStatsData (org.apache.hadoop.hive.metastore.api.BinaryColumnStatsData): 22 usages
BooleanColumnStatsData (org.apache.hadoop.hive.metastore.api.BooleanColumnStatsData): 22 usages
DecimalColumnStatsData (org.apache.hadoop.hive.metastore.api.DecimalColumnStatsData): 22 usages
DoubleColumnStatsData (org.apache.hadoop.hive.metastore.api.DoubleColumnStatsData): 22 usages
ColumnStatistics (org.apache.hadoop.hive.metastore.api.ColumnStatistics): 20 usages
ArrayList (java.util.ArrayList): 19 usages
ColumnStatisticsDesc (org.apache.hadoop.hive.metastore.api.ColumnStatisticsDesc): 19 usages
Test (org.junit.Test): 19 usages
DateColumnStatsData (org.apache.hadoop.hive.metastore.api.DateColumnStatsData): 15 usages
Table (org.apache.hadoop.hive.metastore.api.Table): 15 usages
FieldSchema (org.apache.hadoop.hive.metastore.api.FieldSchema): 14 usages
Partition (org.apache.hadoop.hive.metastore.api.Partition): 14 usages
StorageDescriptor (org.apache.hadoop.hive.metastore.api.StorageDescriptor): 14 usages
SerDeInfo (org.apache.hadoop.hive.metastore.api.SerDeInfo): 13 usages
AggrStats (org.apache.hadoop.hive.metastore.api.AggrStats): 12 usages
List (java.util.List): 11 usages