Search in sources :

Example 31 with DoubleColumnStatsData

Use of org.apache.hadoop.hive.metastore.api.DoubleColumnStatsData in the Apache Hive project.

Class DoubleColumnStatsAggregator, method extrapolate.

/**
 * Extrapolates double-column statistics to {@code numParts} partitions from the per-key stats in
 * {@code adjustedStatsMap} and stores the result in {@code extrapolateData}.
 *
 * <p>Low and high values are linearly extrapolated to the left border (index 0) or the right
 * border (index {@code numParts}), using the adjusted indexes of the entries that hold the
 * smallest and largest value. The null count is the sum over all entries scaled by
 * {@code numParts / numPartsWithStats}. The NDV is either derived from {@code densityAvg} and
 * clamped between the largest single NDV and the sum of all NDVs, or linearly extrapolated the
 * same way as the high value.
 *
 * @param extrapolateData receives the extrapolated double stats
 * @param numParts used as the right border index and as the null-count scale numerator
 * @param numPartsWithStats divisor used when scaling the null count
 * @param adjustedIndexMap adjusted index for each key of {@code adjustedStatsMap}
 * @param adjustedStatsMap per-key stats; the double stats of each value are read
 * @param densityAvg density used for the NDV estimate; not used when it is 0.0 or when
 *        density-based estimation is disabled
 */
@Override
public void extrapolate(ColumnStatisticsData extrapolateData, int numParts, int numPartsWithStats, Map<String, Double> adjustedIndexMap, Map<String, ColumnStatisticsData> adjustedStatsMap, double densityAvg) {
    final int rightEdge = numParts;

    // Unwrap the double-specific stats from every generic stats holder.
    Map<String, DoubleColumnStatsData> doubleStatsByKey = new HashMap<>();
    for (Map.Entry<String, ColumnStatisticsData> source : adjustedStatsMap.entrySet()) {
        doubleStatsByKey.put(source.getKey(), source.getValue().getDoubleStats());
    }
    List<Map.Entry<String, DoubleColumnStatsData>> ordered = new LinkedList<>(doubleStatsByKey.entrySet());

    // ---- low value: order by low value, then extrapolate towards the border where it is smallest
    Comparator<Map.Entry<String, DoubleColumnStatsData>> byLowValue = new Comparator<Map.Entry<String, DoubleColumnStatsData>>() {

        @Override
        public int compare(Map.Entry<String, DoubleColumnStatsData> left, Map.Entry<String, DoubleColumnStatsData> right) {
            return Double.compare(left.getValue().getLowValue(), right.getValue().getLowValue());
        }
    };
    Collections.sort(ordered, byLowValue);
    Map.Entry<String, DoubleColumnStatsData> lowFirst = ordered.get(0);
    Map.Entry<String, DoubleColumnStatsData> lowLast = ordered.get(ordered.size() - 1);
    double lowFirstIdx = adjustedIndexMap.get(lowFirst.getKey());
    double lowLastIdx = adjustedIndexMap.get(lowLast.getKey());
    double smallestLow = lowFirst.getValue().getLowValue();
    double largestLow = lowLast.getValue().getLowValue();
    final double lowValue;
    if (lowFirstIdx == lowLastIdx) {
        lowValue = smallestLow;
    } else if (lowFirstIdx < lowLastIdx) {
        // the minimum sits at the left border (index 0)
        lowValue = (largestLow - (largestLow - smallestLow) * lowLastIdx / (lowLastIdx - lowFirstIdx));
    } else {
        // the minimum sits at the right border
        lowValue = (largestLow - (largestLow - smallestLow) * (rightEdge - lowLastIdx) / (lowFirstIdx - lowLastIdx));
    }

    // ---- high value: re-order by high value, then extrapolate towards the border where it is largest
    Comparator<Map.Entry<String, DoubleColumnStatsData>> byHighValue = new Comparator<Map.Entry<String, DoubleColumnStatsData>>() {

        @Override
        public int compare(Map.Entry<String, DoubleColumnStatsData> left, Map.Entry<String, DoubleColumnStatsData> right) {
            return Double.compare(left.getValue().getHighValue(), right.getValue().getHighValue());
        }
    };
    Collections.sort(ordered, byHighValue);
    Map.Entry<String, DoubleColumnStatsData> highFirst = ordered.get(0);
    Map.Entry<String, DoubleColumnStatsData> highLast = ordered.get(ordered.size() - 1);
    double highFirstIdx = adjustedIndexMap.get(highFirst.getKey());
    double highLastIdx = adjustedIndexMap.get(highLast.getKey());
    double smallestHigh = highFirst.getValue().getHighValue();
    double largestHigh = highLast.getValue().getHighValue();
    final double highValue;
    if (highFirstIdx == highLastIdx) {
        highValue = smallestHigh;
    } else if (highFirstIdx < highLastIdx) {
        // the maximum sits at the right border
        highValue = (smallestHigh + (largestHigh - smallestHigh) * (rightEdge - highFirstIdx) / (highLastIdx - highFirstIdx));
    } else {
        // the maximum sits at the left border (index 0)
        highValue = (smallestHigh + (largestHigh - smallestHigh) * highFirstIdx / (highFirstIdx - highLastIdx));
    }

    // ---- null count: total over all entries, scaled up by numParts / numPartsWithStats
    long nullTotal = 0;
    for (DoubleColumnStatsData stats : doubleStatsByKey.values()) {
        nullTotal += stats.getNumNulls();
    }
    final long numNulls = nullTotal * numParts / numPartsWithStats;

    // ---- number of distinct values
    Comparator<Map.Entry<String, DoubleColumnStatsData>> byNumDVs = new Comparator<Map.Entry<String, DoubleColumnStatsData>>() {

        @Override
        public int compare(Map.Entry<String, DoubleColumnStatsData> left, Map.Entry<String, DoubleColumnStatsData> right) {
            return Long.compare(left.getValue().getNumDVs(), right.getValue().getNumDVs());
        }
    };
    Collections.sort(ordered, byNumDVs);
    // The estimate may not drop below the largest single NDV nor exceed the sum of all NDVs.
    long lowerBound = ordered.get(ordered.size() - 1).getValue().getNumDVs();
    long higherBound = 0;
    for (Map.Entry<String, DoubleColumnStatsData> item : ordered) {
        higherBound += item.getValue().getNumDVs();
    }
    final long ndv;
    if (useDensityFunctionForNDVEstimation && densityAvg != 0.0) {
        long densityEstimate = (long) ((highValue - lowValue) / densityAvg);
        if (densityEstimate < lowerBound) {
            ndv = lowerBound;
        } else if (densityEstimate > higherBound) {
            ndv = higherBound;
        } else {
            ndv = densityEstimate;
        }
    } else {
        double ndvFirstIdx = adjustedIndexMap.get(ordered.get(0).getKey());
        double ndvLastIdx = adjustedIndexMap.get(ordered.get(ordered.size() - 1).getKey());
        long smallestNdv = ordered.get(0).getValue().getNumDVs();
        long largestNdv = ordered.get(ordered.size() - 1).getValue().getNumDVs();
        if (ndvFirstIdx == ndvLastIdx) {
            ndv = smallestNdv;
        } else if (ndvFirstIdx < ndvLastIdx) {
            // the maximum sits at the right border
            ndv = (long) (smallestNdv + (largestNdv - smallestNdv) * (rightEdge - ndvFirstIdx) / (ndvLastIdx - ndvFirstIdx));
        } else {
            // the maximum sits at the left border (index 0)
            ndv = (long) (smallestNdv + (largestNdv - smallestNdv) * ndvFirstIdx / (ndvFirstIdx - ndvLastIdx));
        }
    }

    // Publish the extrapolated figures.
    DoubleColumnStatsDataInspector extrapolated = new DoubleColumnStatsDataInspector();
    extrapolated.setLowValue(lowValue);
    extrapolated.setHighValue(highValue);
    extrapolated.setNumNulls(numNulls);
    extrapolated.setNumDVs(ndv);
    extrapolateData.setDoubleStats(extrapolated);
}
Also used : HashMap(java.util.HashMap) LinkedList(java.util.LinkedList) DoubleColumnStatsData(org.apache.hadoop.hive.metastore.api.DoubleColumnStatsData) DoubleColumnStatsDataInspector(org.apache.hadoop.hive.metastore.columnstats.cache.DoubleColumnStatsDataInspector) Map(java.util.Map) ColumnStatisticsData(org.apache.hadoop.hive.metastore.api.ColumnStatisticsData)

Example 32 with DoubleColumnStatsData

Use of org.apache.hadoop.hive.metastore.api.DoubleColumnStatsData in the Apache Hive project.

Class StatObjectConverter, method convertToMTableColumnStatistics.

// JDO
/**
 * Builds an {@link MTableColumnStatistics} from a stats descriptor and a stats object.
 *
 * <p>The database name, table name and last-analyzed time come from {@code statsDesc}; the column
 * name and type come from {@code statsObj}. The first stats variant that is set on the stats data
 * (checked in the order boolean, long, double, decimal, string, binary, date) is copied over;
 * any individual field that is not set is passed on as {@code null}. Decimal bounds go through
 * {@code createJdoDecimalString} and date bounds are stored as days since epoch. If no variant
 * is set, only the descriptive fields are populated.
 *
 * @param table the table the statistics are attached to
 * @param statsDesc descriptor supplying database name, table name and last-analyzed time
 * @param statsObj stats object supplying column name, column type and the stats data
 * @return the populated model object
 * @throws InvalidObjectException if {@code statsObj} or {@code statsDesc} is {@code null}
 */
public static MTableColumnStatistics convertToMTableColumnStatistics(MTable table, ColumnStatisticsDesc statsDesc, ColumnStatisticsObj statsObj) throws NoSuchObjectException, MetaException, InvalidObjectException {
    if (statsDesc == null || statsObj == null) {
        throw new InvalidObjectException("Invalid column stats object");
    }

    // Descriptive fields shared by every stats variant.
    MTableColumnStatistics target = new MTableColumnStatistics();
    target.setTable(table);
    target.setDbName(statsDesc.getDbName());
    target.setTableName(statsDesc.getTableName());
    target.setLastAnalyzed(statsDesc.getLastAnalyzed());
    target.setColName(statsObj.getColName());
    target.setColType(statsObj.getColType());

    // Exactly one variant is copied: the first one found set, in the order below.
    if (statsObj.getStatsData().isSetBooleanStats()) {
        BooleanColumnStatsData bools = statsObj.getStatsData().getBooleanStats();
        target.setBooleanStats(
            bools.isSetNumTrues() ? bools.getNumTrues() : null,
            bools.isSetNumFalses() ? bools.getNumFalses() : null,
            bools.isSetNumNulls() ? bools.getNumNulls() : null);
        return target;
    }

    if (statsObj.getStatsData().isSetLongStats()) {
        LongColumnStatsData longs = statsObj.getStatsData().getLongStats();
        target.setLongStats(
            longs.isSetNumNulls() ? longs.getNumNulls() : null,
            longs.isSetNumDVs() ? longs.getNumDVs() : null,
            longs.isSetBitVectors() ? longs.getBitVectors() : null,
            longs.isSetLowValue() ? longs.getLowValue() : null,
            longs.isSetHighValue() ? longs.getHighValue() : null);
        return target;
    }

    if (statsObj.getStatsData().isSetDoubleStats()) {
        DoubleColumnStatsData doubles = statsObj.getStatsData().getDoubleStats();
        target.setDoubleStats(
            doubles.isSetNumNulls() ? doubles.getNumNulls() : null,
            doubles.isSetNumDVs() ? doubles.getNumDVs() : null,
            doubles.isSetBitVectors() ? doubles.getBitVectors() : null,
            doubles.isSetLowValue() ? doubles.getLowValue() : null,
            doubles.isSetHighValue() ? doubles.getHighValue() : null);
        return target;
    }

    if (statsObj.getStatsData().isSetDecimalStats()) {
        DecimalColumnStatsData decimals = statsObj.getStatsData().getDecimalStats();
        // Decimal bounds are stored in their string form.
        String lowText = decimals.isSetLowValue() ? createJdoDecimalString(decimals.getLowValue()) : null;
        String highText = decimals.isSetHighValue() ? createJdoDecimalString(decimals.getHighValue()) : null;
        target.setDecimalStats(
            decimals.isSetNumNulls() ? decimals.getNumNulls() : null,
            decimals.isSetNumDVs() ? decimals.getNumDVs() : null,
            decimals.isSetBitVectors() ? decimals.getBitVectors() : null,
            lowText,
            highText);
        return target;
    }

    if (statsObj.getStatsData().isSetStringStats()) {
        StringColumnStatsData strings = statsObj.getStatsData().getStringStats();
        target.setStringStats(
            strings.isSetNumNulls() ? strings.getNumNulls() : null,
            strings.isSetNumDVs() ? strings.getNumDVs() : null,
            strings.isSetBitVectors() ? strings.getBitVectors() : null,
            strings.isSetMaxColLen() ? strings.getMaxColLen() : null,
            strings.isSetAvgColLen() ? strings.getAvgColLen() : null);
        return target;
    }

    if (statsObj.getStatsData().isSetBinaryStats()) {
        BinaryColumnStatsData binaries = statsObj.getStatsData().getBinaryStats();
        target.setBinaryStats(
            binaries.isSetNumNulls() ? binaries.getNumNulls() : null,
            binaries.isSetMaxColLen() ? binaries.getMaxColLen() : null,
            binaries.isSetAvgColLen() ? binaries.getAvgColLen() : null);
        return target;
    }

    if (statsObj.getStatsData().isSetDateStats()) {
        DateColumnStatsData dates = statsObj.getStatsData().getDateStats();
        // Date bounds are stored as days since epoch.
        target.setDateStats(
            dates.isSetNumNulls() ? dates.getNumNulls() : null,
            dates.isSetNumDVs() ? dates.getNumDVs() : null,
            dates.isSetBitVectors() ? dates.getBitVectors() : null,
            dates.isSetLowValue() ? dates.getLowValue().getDaysSinceEpoch() : null,
            dates.isSetHighValue() ? dates.getHighValue().getDaysSinceEpoch() : null);
        return target;
    }

    // No recognised stats variant was set.
    return target;
}
Also used : BooleanColumnStatsData(org.apache.hadoop.hive.metastore.api.BooleanColumnStatsData) DoubleColumnStatsData(org.apache.hadoop.hive.metastore.api.DoubleColumnStatsData) DecimalColumnStatsData(org.apache.hadoop.hive.metastore.api.DecimalColumnStatsData) DateColumnStatsData(org.apache.hadoop.hive.metastore.api.DateColumnStatsData) MTableColumnStatistics(org.apache.hadoop.hive.metastore.model.MTableColumnStatistics) InvalidObjectException(org.apache.hadoop.hive.metastore.api.InvalidObjectException) StringColumnStatsData(org.apache.hadoop.hive.metastore.api.StringColumnStatsData) LongColumnStatsData(org.apache.hadoop.hive.metastore.api.LongColumnStatsData) BinaryColumnStatsData(org.apache.hadoop.hive.metastore.api.BinaryColumnStatsData)

Aggregations

DoubleColumnStatsData (org.apache.hadoop.hive.metastore.api.DoubleColumnStatsData)32 ColumnStatisticsData (org.apache.hadoop.hive.metastore.api.ColumnStatisticsData)27 ColumnStatisticsObj (org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj)22 StringColumnStatsData (org.apache.hadoop.hive.metastore.api.StringColumnStatsData)16 BinaryColumnStatsData (org.apache.hadoop.hive.metastore.api.BinaryColumnStatsData)15 BooleanColumnStatsData (org.apache.hadoop.hive.metastore.api.BooleanColumnStatsData)15 DecimalColumnStatsData (org.apache.hadoop.hive.metastore.api.DecimalColumnStatsData)15 LongColumnStatsData (org.apache.hadoop.hive.metastore.api.LongColumnStatsData)15 ColumnStatistics (org.apache.hadoop.hive.metastore.api.ColumnStatistics)12 ColumnStatisticsDesc (org.apache.hadoop.hive.metastore.api.ColumnStatisticsDesc)11 Test (org.junit.Test)11 ArrayList (java.util.ArrayList)10 DateColumnStatsData (org.apache.hadoop.hive.metastore.api.DateColumnStatsData)10 FieldSchema (org.apache.hadoop.hive.metastore.api.FieldSchema)6 SerDeInfo (org.apache.hadoop.hive.metastore.api.SerDeInfo)6 StorageDescriptor (org.apache.hadoop.hive.metastore.api.StorageDescriptor)6 Table (org.apache.hadoop.hive.metastore.api.Table)6 AggrStats (org.apache.hadoop.hive.metastore.api.AggrStats)5 Partition (org.apache.hadoop.hive.metastore.api.Partition)5 HashMap (java.util.HashMap)4