Search in sources :

Example 36 with DoubleColumnStatsData

use of org.apache.hadoop.hive.metastore.api.DoubleColumnStatsData in project hive by apache.

the class MetaDataFormatUtils method extractColumnValues.

public static String[] extractColumnValues(FieldSchema col, boolean isColStatsAvailable, ColumnStatisticsObj columnStatisticsObj) {
    List<String> ret = new ArrayList<>();
    ret.add(col.getName());
    ret.add(col.getType());
    if (isColStatsAvailable) {
        if (columnStatisticsObj != null) {
            ColumnStatisticsData csd = columnStatisticsObj.getStatsData();
            // @formatter:off
            if (csd.isSetBinaryStats()) {
                BinaryColumnStatsData bcsd = csd.getBinaryStats();
                ret.addAll(Lists.newArrayList("", "", "" + bcsd.getNumNulls(), "", "" + bcsd.getAvgColLen(), "" + bcsd.getMaxColLen(), "", "", convertToString(bcsd.getBitVectors())));
            } else if (csd.isSetStringStats()) {
                StringColumnStatsData scsd = csd.getStringStats();
                ret.addAll(Lists.newArrayList("", "", "" + scsd.getNumNulls(), "" + scsd.getNumDVs(), "" + scsd.getAvgColLen(), "" + scsd.getMaxColLen(), "", "", convertToString(scsd.getBitVectors())));
            } else if (csd.isSetBooleanStats()) {
                BooleanColumnStatsData bcsd = csd.getBooleanStats();
                ret.addAll(Lists.newArrayList("", "", "" + bcsd.getNumNulls(), "", "", "", "" + bcsd.getNumTrues(), "" + bcsd.getNumFalses(), convertToString(bcsd.getBitVectors())));
            } else if (csd.isSetDecimalStats()) {
                DecimalColumnStatsData dcsd = csd.getDecimalStats();
                ret.addAll(Lists.newArrayList(convertToString(dcsd.getLowValue()), convertToString(dcsd.getHighValue()), "" + dcsd.getNumNulls(), "" + dcsd.getNumDVs(), "", "", "", "", convertToString(dcsd.getBitVectors())));
            } else if (csd.isSetDoubleStats()) {
                DoubleColumnStatsData dcsd = csd.getDoubleStats();
                ret.addAll(Lists.newArrayList("" + dcsd.getLowValue(), "" + dcsd.getHighValue(), "" + dcsd.getNumNulls(), "" + dcsd.getNumDVs(), "", "", "", "", convertToString(dcsd.getBitVectors())));
            } else if (csd.isSetLongStats()) {
                LongColumnStatsData lcsd = csd.getLongStats();
                ret.addAll(Lists.newArrayList("" + lcsd.getLowValue(), "" + lcsd.getHighValue(), "" + lcsd.getNumNulls(), "" + lcsd.getNumDVs(), "", "", "", "", convertToString(lcsd.getBitVectors())));
            } else if (csd.isSetDateStats()) {
                DateColumnStatsData dcsd = csd.getDateStats();
                ret.addAll(Lists.newArrayList(convertToString(dcsd.getLowValue()), convertToString(dcsd.getHighValue()), "" + dcsd.getNumNulls(), "" + dcsd.getNumDVs(), "", "", "", "", convertToString(dcsd.getBitVectors())));
            }
        // @formatter:on
        } else {
            ret.addAll(Lists.newArrayList("", "", "", "", "", "", "", "", ""));
        }
    }
    ret.add(getComment(col));
    return ret.toArray(new String[] {});
}
Also used : BooleanColumnStatsData(org.apache.hadoop.hive.metastore.api.BooleanColumnStatsData) DecimalColumnStatsData(org.apache.hadoop.hive.metastore.api.DecimalColumnStatsData) DoubleColumnStatsData(org.apache.hadoop.hive.metastore.api.DoubleColumnStatsData) DateColumnStatsData(org.apache.hadoop.hive.metastore.api.DateColumnStatsData) ArrayList(java.util.ArrayList) StringColumnStatsData(org.apache.hadoop.hive.metastore.api.StringColumnStatsData) LongColumnStatsData(org.apache.hadoop.hive.metastore.api.LongColumnStatsData) ColumnStatisticsData(org.apache.hadoop.hive.metastore.api.ColumnStatisticsData) BinaryColumnStatsData(org.apache.hadoop.hive.metastore.api.BinaryColumnStatsData)

Example 37 with DoubleColumnStatsData

use of org.apache.hadoop.hive.metastore.api.DoubleColumnStatsData in project hive by apache.

the class StatObjectConverter method convertToMTableColumnStatistics.

// JDO
public static MTableColumnStatistics convertToMTableColumnStatistics(MTable table, ColumnStatisticsDesc statsDesc, ColumnStatisticsObj statsObj) throws NoSuchObjectException, MetaException, InvalidObjectException {
    if (statsObj == null || statsDesc == null) {
        throw new InvalidObjectException("Invalid column stats object");
    }
    MTableColumnStatistics mColStats = new MTableColumnStatistics();
    mColStats.setTable(table);
    mColStats.setDbName(statsDesc.getDbName());
    mColStats.setTableName(statsDesc.getTableName());
    mColStats.setLastAnalyzed(statsDesc.getLastAnalyzed());
    mColStats.setColName(statsObj.getColName());
    mColStats.setColType(statsObj.getColType());
    if (statsObj.getStatsData().isSetBooleanStats()) {
        BooleanColumnStatsData boolStats = statsObj.getStatsData().getBooleanStats();
        mColStats.setBooleanStats(boolStats.isSetNumTrues() ? boolStats.getNumTrues() : null, boolStats.isSetNumFalses() ? boolStats.getNumFalses() : null, boolStats.isSetNumNulls() ? boolStats.getNumNulls() : null);
    } else if (statsObj.getStatsData().isSetLongStats()) {
        LongColumnStatsData longStats = statsObj.getStatsData().getLongStats();
        mColStats.setLongStats(longStats.isSetNumNulls() ? longStats.getNumNulls() : null, longStats.isSetNumDVs() ? longStats.getNumDVs() : null, longStats.isSetBitVectors() ? longStats.getBitVectors() : null, longStats.isSetLowValue() ? longStats.getLowValue() : null, longStats.isSetHighValue() ? longStats.getHighValue() : null);
    } else if (statsObj.getStatsData().isSetDoubleStats()) {
        DoubleColumnStatsData doubleStats = statsObj.getStatsData().getDoubleStats();
        mColStats.setDoubleStats(doubleStats.isSetNumNulls() ? doubleStats.getNumNulls() : null, doubleStats.isSetNumDVs() ? doubleStats.getNumDVs() : null, doubleStats.isSetBitVectors() ? doubleStats.getBitVectors() : null, doubleStats.isSetLowValue() ? doubleStats.getLowValue() : null, doubleStats.isSetHighValue() ? doubleStats.getHighValue() : null);
    } else if (statsObj.getStatsData().isSetDecimalStats()) {
        DecimalColumnStatsData decimalStats = statsObj.getStatsData().getDecimalStats();
        String low = decimalStats.isSetLowValue() ? createJdoDecimalString(decimalStats.getLowValue()) : null;
        String high = decimalStats.isSetHighValue() ? createJdoDecimalString(decimalStats.getHighValue()) : null;
        mColStats.setDecimalStats(decimalStats.isSetNumNulls() ? decimalStats.getNumNulls() : null, decimalStats.isSetNumDVs() ? decimalStats.getNumDVs() : null, decimalStats.isSetBitVectors() ? decimalStats.getBitVectors() : null, low, high);
    } else if (statsObj.getStatsData().isSetStringStats()) {
        StringColumnStatsData stringStats = statsObj.getStatsData().getStringStats();
        mColStats.setStringStats(stringStats.isSetNumNulls() ? stringStats.getNumNulls() : null, stringStats.isSetNumDVs() ? stringStats.getNumDVs() : null, stringStats.isSetBitVectors() ? stringStats.getBitVectors() : null, stringStats.isSetMaxColLen() ? stringStats.getMaxColLen() : null, stringStats.isSetAvgColLen() ? stringStats.getAvgColLen() : null);
    } else if (statsObj.getStatsData().isSetBinaryStats()) {
        BinaryColumnStatsData binaryStats = statsObj.getStatsData().getBinaryStats();
        mColStats.setBinaryStats(binaryStats.isSetNumNulls() ? binaryStats.getNumNulls() : null, binaryStats.isSetMaxColLen() ? binaryStats.getMaxColLen() : null, binaryStats.isSetAvgColLen() ? binaryStats.getAvgColLen() : null);
    } else if (statsObj.getStatsData().isSetDateStats()) {
        DateColumnStatsData dateStats = statsObj.getStatsData().getDateStats();
        mColStats.setDateStats(dateStats.isSetNumNulls() ? dateStats.getNumNulls() : null, dateStats.isSetNumDVs() ? dateStats.getNumDVs() : null, dateStats.isSetBitVectors() ? dateStats.getBitVectors() : null, dateStats.isSetLowValue() ? dateStats.getLowValue().getDaysSinceEpoch() : null, dateStats.isSetHighValue() ? dateStats.getHighValue().getDaysSinceEpoch() : null);
    }
    return mColStats;
}
Also used : BooleanColumnStatsData(org.apache.hadoop.hive.metastore.api.BooleanColumnStatsData) DoubleColumnStatsData(org.apache.hadoop.hive.metastore.api.DoubleColumnStatsData) DecimalColumnStatsData(org.apache.hadoop.hive.metastore.api.DecimalColumnStatsData) DateColumnStatsData(org.apache.hadoop.hive.metastore.api.DateColumnStatsData) MTableColumnStatistics(org.apache.hadoop.hive.metastore.model.MTableColumnStatistics) InvalidObjectException(org.apache.hadoop.hive.metastore.api.InvalidObjectException) StringColumnStatsData(org.apache.hadoop.hive.metastore.api.StringColumnStatsData) LongColumnStatsData(org.apache.hadoop.hive.metastore.api.LongColumnStatsData) BinaryColumnStatsData(org.apache.hadoop.hive.metastore.api.BinaryColumnStatsData)

Example 38 with DoubleColumnStatsData

use of org.apache.hadoop.hive.metastore.api.DoubleColumnStatsData in project hive by apache.

the class ShowUtils method extractColumnValues.

public static String[] extractColumnValues(FieldSchema column, boolean isColumnStatsAvailable, ColumnStatisticsObj columnStatisticsObj) {
    List<String> values = new ArrayList<>();
    values.add(column.getName());
    values.add(column.getType());
    if (isColumnStatsAvailable) {
        if (columnStatisticsObj != null) {
            ColumnStatisticsData statsData = columnStatisticsObj.getStatsData();
            if (statsData.isSetBinaryStats()) {
                BinaryColumnStatsData binaryStats = statsData.getBinaryStats();
                values.addAll(Lists.newArrayList("", "", "" + binaryStats.getNumNulls(), "", "" + binaryStats.getAvgColLen(), "" + binaryStats.getMaxColLen(), "", "", convertToString(binaryStats.getBitVectors())));
            } else if (statsData.isSetStringStats()) {
                StringColumnStatsData stringStats = statsData.getStringStats();
                values.addAll(Lists.newArrayList("", "", "" + stringStats.getNumNulls(), "" + stringStats.getNumDVs(), "" + stringStats.getAvgColLen(), "" + stringStats.getMaxColLen(), "", "", convertToString(stringStats.getBitVectors())));
            } else if (statsData.isSetBooleanStats()) {
                BooleanColumnStatsData booleanStats = statsData.getBooleanStats();
                values.addAll(Lists.newArrayList("", "", "" + booleanStats.getNumNulls(), "", "", "", "" + booleanStats.getNumTrues(), "" + booleanStats.getNumFalses(), convertToString(booleanStats.getBitVectors())));
            } else if (statsData.isSetDecimalStats()) {
                DecimalColumnStatsData decimalStats = statsData.getDecimalStats();
                values.addAll(Lists.newArrayList(convertToString(decimalStats.getLowValue()), convertToString(decimalStats.getHighValue()), "" + decimalStats.getNumNulls(), "" + decimalStats.getNumDVs(), "", "", "", "", convertToString(decimalStats.getBitVectors())));
            } else if (statsData.isSetDoubleStats()) {
                DoubleColumnStatsData doubleStats = statsData.getDoubleStats();
                values.addAll(Lists.newArrayList("" + doubleStats.getLowValue(), "" + doubleStats.getHighValue(), "" + doubleStats.getNumNulls(), "" + doubleStats.getNumDVs(), "", "", "", "", convertToString(doubleStats.getBitVectors())));
            } else if (statsData.isSetLongStats()) {
                LongColumnStatsData longStats = statsData.getLongStats();
                values.addAll(Lists.newArrayList("" + longStats.getLowValue(), "" + longStats.getHighValue(), "" + longStats.getNumNulls(), "" + longStats.getNumDVs(), "", "", "", "", convertToString(longStats.getBitVectors())));
            } else if (statsData.isSetDateStats()) {
                DateColumnStatsData dateStats = statsData.getDateStats();
                values.addAll(Lists.newArrayList(convertToString(dateStats.getLowValue()), convertToString(dateStats.getHighValue()), "" + dateStats.getNumNulls(), "" + dateStats.getNumDVs(), "", "", "", "", convertToString(dateStats.getBitVectors())));
            } else if (statsData.isSetTimestampStats()) {
                TimestampColumnStatsData timestampStats = statsData.getTimestampStats();
                values.addAll(Lists.newArrayList(convertToString(timestampStats.getLowValue()), convertToString(timestampStats.getHighValue()), "" + timestampStats.getNumNulls(), "" + timestampStats.getNumDVs(), "", "", "", "", convertToString(timestampStats.getBitVectors())));
            }
        } else {
            values.addAll(Lists.newArrayList("", "", "", "", "", "", "", "", ""));
        }
    }
    values.add(column.getComment() != null ? column.getComment() : "");
    return values.toArray(new String[0]);
}
Also used : BooleanColumnStatsData(org.apache.hadoop.hive.metastore.api.BooleanColumnStatsData) DecimalColumnStatsData(org.apache.hadoop.hive.metastore.api.DecimalColumnStatsData) DoubleColumnStatsData(org.apache.hadoop.hive.metastore.api.DoubleColumnStatsData) DateColumnStatsData(org.apache.hadoop.hive.metastore.api.DateColumnStatsData) ArrayList(java.util.ArrayList) StringColumnStatsData(org.apache.hadoop.hive.metastore.api.StringColumnStatsData) LongColumnStatsData(org.apache.hadoop.hive.metastore.api.LongColumnStatsData) TimestampColumnStatsData(org.apache.hadoop.hive.metastore.api.TimestampColumnStatsData) ColumnStatisticsData(org.apache.hadoop.hive.metastore.api.ColumnStatisticsData) BinaryColumnStatsData(org.apache.hadoop.hive.metastore.api.BinaryColumnStatsData)

Example 39 with DoubleColumnStatsData

use of org.apache.hadoop.hive.metastore.api.DoubleColumnStatsData in project hive by apache.

the class DoubleColumnStatsAggregator method extrapolate.

@Override
public void extrapolate(ColumnStatisticsData extrapolateData, int numParts, int numPartsWithStats, Map<String, Double> adjustedIndexMap, Map<String, ColumnStatisticsData> adjustedStatsMap, double densityAvg) {
    int rightBorderInd = numParts;
    DoubleColumnStatsDataInspector extrapolateDoubleData = new DoubleColumnStatsDataInspector();
    Map<String, DoubleColumnStatsData> extractedAdjustedStatsMap = new HashMap<>();
    for (Map.Entry<String, ColumnStatisticsData> entry : adjustedStatsMap.entrySet()) {
        extractedAdjustedStatsMap.put(entry.getKey(), entry.getValue().getDoubleStats());
    }
    List<Map.Entry<String, DoubleColumnStatsData>> list = new LinkedList<>(extractedAdjustedStatsMap.entrySet());
    // get the lowValue
    Collections.sort(list, new Comparator<Map.Entry<String, DoubleColumnStatsData>>() {

        @Override
        public int compare(Map.Entry<String, DoubleColumnStatsData> o1, Map.Entry<String, DoubleColumnStatsData> o2) {
            return Double.compare(o1.getValue().getLowValue(), o2.getValue().getLowValue());
        }
    });
    double minInd = adjustedIndexMap.get(list.get(0).getKey());
    double maxInd = adjustedIndexMap.get(list.get(list.size() - 1).getKey());
    double lowValue = 0;
    double min = list.get(0).getValue().getLowValue();
    double max = list.get(list.size() - 1).getValue().getLowValue();
    if (minInd == maxInd) {
        lowValue = min;
    } else if (minInd < maxInd) {
        // left border is the min
        lowValue = (max - (max - min) * maxInd / (maxInd - minInd));
    } else {
        // right border is the min
        lowValue = (max - (max - min) * (rightBorderInd - maxInd) / (minInd - maxInd));
    }
    // get the highValue
    Collections.sort(list, new Comparator<Map.Entry<String, DoubleColumnStatsData>>() {

        @Override
        public int compare(Map.Entry<String, DoubleColumnStatsData> o1, Map.Entry<String, DoubleColumnStatsData> o2) {
            return Double.compare(o1.getValue().getHighValue(), o2.getValue().getHighValue());
        }
    });
    minInd = adjustedIndexMap.get(list.get(0).getKey());
    maxInd = adjustedIndexMap.get(list.get(list.size() - 1).getKey());
    double highValue = 0;
    min = list.get(0).getValue().getHighValue();
    max = list.get(list.size() - 1).getValue().getHighValue();
    if (minInd == maxInd) {
        highValue = min;
    } else if (minInd < maxInd) {
        // right border is the max
        highValue = (min + (max - min) * (rightBorderInd - minInd) / (maxInd - minInd));
    } else {
        // left border is the max
        highValue = (min + (max - min) * minInd / (minInd - maxInd));
    }
    // get the #nulls
    long numNulls = 0;
    for (Map.Entry<String, DoubleColumnStatsData> entry : extractedAdjustedStatsMap.entrySet()) {
        numNulls += entry.getValue().getNumNulls();
    }
    // we scale up sumNulls based on the number of partitions
    numNulls = numNulls * numParts / numPartsWithStats;
    // get the ndv
    long ndv = 0;
    long ndvMin = 0;
    long ndvMax = 0;
    Collections.sort(list, new Comparator<Map.Entry<String, DoubleColumnStatsData>>() {

        @Override
        public int compare(Map.Entry<String, DoubleColumnStatsData> o1, Map.Entry<String, DoubleColumnStatsData> o2) {
            return Long.compare(o1.getValue().getNumDVs(), o2.getValue().getNumDVs());
        }
    });
    long lowerBound = list.get(list.size() - 1).getValue().getNumDVs();
    long higherBound = 0;
    for (Map.Entry<String, DoubleColumnStatsData> entry : list) {
        higherBound += entry.getValue().getNumDVs();
    }
    if (useDensityFunctionForNDVEstimation && densityAvg != 0.0) {
        ndv = (long) ((highValue - lowValue) / densityAvg);
        if (ndv < lowerBound) {
            ndv = lowerBound;
        } else if (ndv > higherBound) {
            ndv = higherBound;
        }
    } else {
        minInd = adjustedIndexMap.get(list.get(0).getKey());
        maxInd = adjustedIndexMap.get(list.get(list.size() - 1).getKey());
        ndvMin = list.get(0).getValue().getNumDVs();
        ndvMax = list.get(list.size() - 1).getValue().getNumDVs();
        if (minInd == maxInd) {
            ndv = ndvMin;
        } else if (minInd < maxInd) {
            // right border is the max
            ndv = (long) (ndvMin + (ndvMax - ndvMin) * (rightBorderInd - minInd) / (maxInd - minInd));
        } else {
            // left border is the max
            ndv = (long) (ndvMin + (ndvMax - ndvMin) * minInd / (minInd - maxInd));
        }
    }
    extrapolateDoubleData.setLowValue(lowValue);
    extrapolateDoubleData.setHighValue(highValue);
    extrapolateDoubleData.setNumNulls(numNulls);
    extrapolateDoubleData.setNumDVs(ndv);
    extrapolateData.setDoubleStats(extrapolateDoubleData);
}
Also used : HashMap(java.util.HashMap) LinkedList(java.util.LinkedList) DoubleColumnStatsData(org.apache.hadoop.hive.metastore.api.DoubleColumnStatsData) DoubleColumnStatsDataInspector(org.apache.hadoop.hive.metastore.columnstats.cache.DoubleColumnStatsDataInspector) HashMap(java.util.HashMap) Map(java.util.Map) ColumnStatisticsData(org.apache.hadoop.hive.metastore.api.ColumnStatisticsData)

Example 40 with DoubleColumnStatsData

use of org.apache.hadoop.hive.metastore.api.DoubleColumnStatsData in project hive by apache.

the class DoubleColumnStatsAggregator method aggregate.

@Override
public ColumnStatisticsObj aggregate(List<ColStatsObjWithSourceInfo> colStatsWithSourceInfo, List<String> partNames, boolean areAllPartsFound) throws MetaException {
    ColumnStatisticsObj statsObj = null;
    String colType = null;
    String colName = null;
    // check if all the ColumnStatisticsObjs contain stats and all the ndv are
    // bitvectors
    boolean doAllPartitionContainStats = partNames.size() == colStatsWithSourceInfo.size();
    NumDistinctValueEstimator ndvEstimator = null;
    for (ColStatsObjWithSourceInfo csp : colStatsWithSourceInfo) {
        ColumnStatisticsObj cso = csp.getColStatsObj();
        if (statsObj == null) {
            colName = cso.getColName();
            colType = cso.getColType();
            statsObj = ColumnStatsAggregatorFactory.newColumnStaticsObj(colName, colType, cso.getStatsData().getSetField());
            LOG.trace("doAllPartitionContainStats for column: {} is: {}", colName, doAllPartitionContainStats);
        }
        DoubleColumnStatsDataInspector doubleColumnStatsData = doubleInspectorFromStats(cso);
        if (doubleColumnStatsData.getNdvEstimator() == null) {
            ndvEstimator = null;
            break;
        } else {
            // check if all of the bit vectors can merge
            NumDistinctValueEstimator estimator = doubleColumnStatsData.getNdvEstimator();
            if (ndvEstimator == null) {
                ndvEstimator = estimator;
            } else {
                if (ndvEstimator.canMerge(estimator)) {
                    continue;
                } else {
                    ndvEstimator = null;
                    break;
                }
            }
        }
    }
    if (ndvEstimator != null) {
        ndvEstimator = NumDistinctValueEstimatorFactory.getEmptyNumDistinctValueEstimator(ndvEstimator);
    }
    LOG.debug("all of the bit vectors can merge for " + colName + " is " + (ndvEstimator != null));
    ColumnStatisticsData columnStatisticsData = new ColumnStatisticsData();
    if (doAllPartitionContainStats || colStatsWithSourceInfo.size() < 2) {
        DoubleColumnStatsDataInspector aggregateData = null;
        long lowerBound = 0;
        long higherBound = 0;
        double densityAvgSum = 0.0;
        for (ColStatsObjWithSourceInfo csp : colStatsWithSourceInfo) {
            ColumnStatisticsObj cso = csp.getColStatsObj();
            DoubleColumnStatsDataInspector newData = doubleInspectorFromStats(cso);
            lowerBound = Math.max(lowerBound, newData.getNumDVs());
            higherBound += newData.getNumDVs();
            densityAvgSum += (newData.getHighValue() - newData.getLowValue()) / newData.getNumDVs();
            if (ndvEstimator != null) {
                ndvEstimator.mergeEstimators(newData.getNdvEstimator());
            }
            if (aggregateData == null) {
                aggregateData = newData.deepCopy();
            } else {
                DoubleColumnStatsMerger merger = new DoubleColumnStatsMerger();
                merger.setLowValue(aggregateData, newData);
                merger.setHighValue(aggregateData, newData);
                aggregateData.setNumNulls(aggregateData.getNumNulls() + newData.getNumNulls());
                aggregateData.setNumDVs(Math.max(aggregateData.getNumDVs(), newData.getNumDVs()));
            }
        }
        if (ndvEstimator != null) {
            // if all the ColumnStatisticsObjs contain bitvectors, we do not need to
            // use uniform distribution assumption because we can merge bitvectors
            // to get a good estimation.
            aggregateData.setNumDVs(ndvEstimator.estimateNumDistinctValues());
        } else {
            long estimation;
            if (useDensityFunctionForNDVEstimation) {
                // We have estimation, lowerbound and higherbound. We use estimation
                // if it is between lowerbound and higherbound.
                double densityAvg = densityAvgSum / partNames.size();
                estimation = (long) ((aggregateData.getHighValue() - aggregateData.getLowValue()) / densityAvg);
                if (estimation < lowerBound) {
                    estimation = lowerBound;
                } else if (estimation > higherBound) {
                    estimation = higherBound;
                }
            } else {
                estimation = (long) (lowerBound + (higherBound - lowerBound) * ndvTuner);
            }
            aggregateData.setNumDVs(estimation);
        }
        columnStatisticsData.setDoubleStats(aggregateData);
    } else {
        // we need extrapolation
        LOG.debug("start extrapolation for " + colName);
        Map<String, Integer> indexMap = new HashMap<>();
        for (int index = 0; index < partNames.size(); index++) {
            indexMap.put(partNames.get(index), index);
        }
        Map<String, Double> adjustedIndexMap = new HashMap<>();
        Map<String, ColumnStatisticsData> adjustedStatsMap = new HashMap<>();
        // while we scan the css, we also get the densityAvg, lowerbound and
        // higerbound when useDensityFunctionForNDVEstimation is true.
        double densityAvgSum = 0.0;
        if (ndvEstimator == null) {
            // the traditional extrapolation methods.
            for (ColStatsObjWithSourceInfo csp : colStatsWithSourceInfo) {
                ColumnStatisticsObj cso = csp.getColStatsObj();
                String partName = csp.getPartName();
                DoubleColumnStatsData newData = cso.getStatsData().getDoubleStats();
                if (useDensityFunctionForNDVEstimation && newData.isSetLowValue() && newData.isSetHighValue()) {
                    densityAvgSum += (newData.getHighValue() - newData.getLowValue()) / newData.getNumDVs();
                }
                adjustedIndexMap.put(partName, (double) indexMap.get(partName));
                adjustedStatsMap.put(partName, cso.getStatsData());
            }
        } else {
            // we first merge all the adjacent bitvectors that we could merge and
            // derive new partition names and index.
            StringBuilder pseudoPartName = new StringBuilder();
            double pseudoIndexSum = 0;
            int length = 0;
            int curIndex = -1;
            DoubleColumnStatsData aggregateData = null;
            for (ColStatsObjWithSourceInfo csp : colStatsWithSourceInfo) {
                ColumnStatisticsObj cso = csp.getColStatsObj();
                String partName = csp.getPartName();
                DoubleColumnStatsDataInspector newData = doubleInspectorFromStats(cso);
                // already checked it before.
                if (indexMap.get(partName) != curIndex) {
                    // There is bitvector, but it is not adjacent to the previous ones.
                    if (length > 0) {
                        // we have to set ndv
                        adjustedIndexMap.put(pseudoPartName.toString(), pseudoIndexSum / length);
                        aggregateData.setNumDVs(ndvEstimator.estimateNumDistinctValues());
                        ColumnStatisticsData csd = new ColumnStatisticsData();
                        csd.setDoubleStats(aggregateData);
                        adjustedStatsMap.put(pseudoPartName.toString(), csd);
                        if (useDensityFunctionForNDVEstimation) {
                            densityAvgSum += (aggregateData.getHighValue() - aggregateData.getLowValue()) / aggregateData.getNumDVs();
                        }
                        // reset everything
                        pseudoPartName = new StringBuilder();
                        pseudoIndexSum = 0;
                        length = 0;
                        ndvEstimator = NumDistinctValueEstimatorFactory.getEmptyNumDistinctValueEstimator(ndvEstimator);
                    }
                    aggregateData = null;
                }
                curIndex = indexMap.get(partName);
                pseudoPartName.append(partName);
                pseudoIndexSum += curIndex;
                length++;
                curIndex++;
                if (aggregateData == null) {
                    aggregateData = newData.deepCopy();
                } else {
                    aggregateData.setLowValue(Math.min(aggregateData.getLowValue(), newData.getLowValue()));
                    aggregateData.setHighValue(Math.max(aggregateData.getHighValue(), newData.getHighValue()));
                    aggregateData.setNumNulls(aggregateData.getNumNulls() + newData.getNumNulls());
                }
                ndvEstimator.mergeEstimators(newData.getNdvEstimator());
            }
            if (length > 0) {
                // we have to set ndv
                adjustedIndexMap.put(pseudoPartName.toString(), pseudoIndexSum / length);
                aggregateData.setNumDVs(ndvEstimator.estimateNumDistinctValues());
                ColumnStatisticsData csd = new ColumnStatisticsData();
                csd.setDoubleStats(aggregateData);
                adjustedStatsMap.put(pseudoPartName.toString(), csd);
                if (useDensityFunctionForNDVEstimation) {
                    densityAvgSum += (aggregateData.getHighValue() - aggregateData.getLowValue()) / aggregateData.getNumDVs();
                }
            }
        }
        extrapolate(columnStatisticsData, partNames.size(), colStatsWithSourceInfo.size(), adjustedIndexMap, adjustedStatsMap, densityAvgSum / adjustedStatsMap.size());
    }
    LOG.debug("Ndv estimatation for {} is {}. # of partitions requested: {}. # of partitions found: {}", colName, columnStatisticsData.getDoubleStats().getNumDVs(), partNames.size(), colStatsWithSourceInfo.size());
    statsObj.setStatsData(columnStatisticsData);
    return statsObj;
}
Also used : ColStatsObjWithSourceInfo(org.apache.hadoop.hive.metastore.utils.MetaStoreServerUtils.ColStatsObjWithSourceInfo) HashMap(java.util.HashMap) NumDistinctValueEstimator(org.apache.hadoop.hive.common.ndv.NumDistinctValueEstimator) ColumnStatisticsObj(org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj) DoubleColumnStatsData(org.apache.hadoop.hive.metastore.api.DoubleColumnStatsData) DoubleColumnStatsDataInspector(org.apache.hadoop.hive.metastore.columnstats.cache.DoubleColumnStatsDataInspector) DoubleColumnStatsMerger(org.apache.hadoop.hive.metastore.columnstats.merge.DoubleColumnStatsMerger) ColumnStatisticsData(org.apache.hadoop.hive.metastore.api.ColumnStatisticsData)

Aggregations

DoubleColumnStatsData (org.apache.hadoop.hive.metastore.api.DoubleColumnStatsData)45 ColumnStatisticsData (org.apache.hadoop.hive.metastore.api.ColumnStatisticsData)30 ColumnStatisticsObj (org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj)27 StringColumnStatsData (org.apache.hadoop.hive.metastore.api.StringColumnStatsData)23 BinaryColumnStatsData (org.apache.hadoop.hive.metastore.api.BinaryColumnStatsData)22 BooleanColumnStatsData (org.apache.hadoop.hive.metastore.api.BooleanColumnStatsData)22 DecimalColumnStatsData (org.apache.hadoop.hive.metastore.api.DecimalColumnStatsData)22 LongColumnStatsData (org.apache.hadoop.hive.metastore.api.LongColumnStatsData)22 DateColumnStatsData (org.apache.hadoop.hive.metastore.api.DateColumnStatsData)15 ColumnStatistics (org.apache.hadoop.hive.metastore.api.ColumnStatistics)12 Test (org.junit.Test)12 ArrayList (java.util.ArrayList)11 ColumnStatisticsDesc (org.apache.hadoop.hive.metastore.api.ColumnStatisticsDesc)11 FieldSchema (org.apache.hadoop.hive.metastore.api.FieldSchema)6 SerDeInfo (org.apache.hadoop.hive.metastore.api.SerDeInfo)6 StorageDescriptor (org.apache.hadoop.hive.metastore.api.StorageDescriptor)6 Table (org.apache.hadoop.hive.metastore.api.Table)6 AggrStats (org.apache.hadoop.hive.metastore.api.AggrStats)5 Partition (org.apache.hadoop.hive.metastore.api.Partition)5 HashMap (java.util.HashMap)4