Search in sources :

Example 61 with ColumnStatisticsData._Fields

use of org.apache.hadoop.hive.metastore.api.ColumnStatisticsData._Fields in project hive by apache.

the class HBaseUtils method protoBufStatsForOneColumn.

/**
 * Converts the statistics of a single column into their HBase metastore
 * protobuf representation.
 *
 * @param partitionColumnStats enclosing statistics object; when non-null, the
 *        last-analyzed value of its descriptor is copied into the result, when
 *        null that field is left unset
 * @param colStats statistics of the column; its column type and column name are
 *        asserted to be non-null
 * @return the populated protobuf message
 * @throws IOException declared by the signature
 * @throws RuntimeException if the statistics payload is of a kind that is not
 *         handled by the switch below
 */
private static HbaseMetastoreProto.ColumnStats protoBufStatsForOneColumn(ColumnStatistics partitionColumnStats, ColumnStatisticsObj colStats) throws IOException {
    HbaseMetastoreProto.ColumnStats.Builder builder = HbaseMetastoreProto.ColumnStats.newBuilder();
    if (partitionColumnStats != null) {
        builder.setLastAnalyzed(partitionColumnStats.getStatsDesc().getLastAnalyzed());
    }
    assert colStats.getColType() != null;
    builder.setColumnType(colStats.getColType());
    assert colStats.getColName() != null;
    builder.setColumnName(colStats.getColName());

    ColumnStatisticsData colData = colStats.getStatsData();
    switch (colData.getSetField()) {
        case BOOLEAN_STATS: {
            BooleanColumnStatsData boolData = colData.getBooleanStats();
            builder.setNumNulls(boolData.getNumNulls());
            builder.setBoolStats(HbaseMetastoreProto.ColumnStats.BooleanStats.newBuilder()
                    .setNumTrues(boolData.getNumTrues())
                    .setNumFalses(boolData.getNumFalses())
                    .build());
            break;
        }
        case LONG_STATS: {
            LongColumnStatsData longData = colData.getLongStats();
            builder.setNumNulls(longData.getNumNulls());
            builder.setNumDistinctValues(longData.getNumDVs());
            if (longData.isSetBitVectors()) {
                builder.setBitVectors(longData.getBitVectors());
            }
            builder.setLongStats(HbaseMetastoreProto.ColumnStats.LongStats.newBuilder()
                    .setLowValue(longData.getLowValue())
                    .setHighValue(longData.getHighValue())
                    .build());
            break;
        }
        case DOUBLE_STATS: {
            DoubleColumnStatsData doubleData = colData.getDoubleStats();
            builder.setNumNulls(doubleData.getNumNulls());
            builder.setNumDistinctValues(doubleData.getNumDVs());
            if (doubleData.isSetBitVectors()) {
                builder.setBitVectors(doubleData.getBitVectors());
            }
            builder.setDoubleStats(HbaseMetastoreProto.ColumnStats.DoubleStats.newBuilder()
                    .setLowValue(doubleData.getLowValue())
                    .setHighValue(doubleData.getHighValue())
                    .build());
            break;
        }
        case STRING_STATS: {
            StringColumnStatsData stringData = colData.getStringStats();
            builder.setNumNulls(stringData.getNumNulls());
            builder.setNumDistinctValues(stringData.getNumDVs());
            if (stringData.isSetBitVectors()) {
                builder.setBitVectors(stringData.getBitVectors());
            }
            builder.setStringStats(HbaseMetastoreProto.ColumnStats.StringStats.newBuilder()
                    .setMaxColLength(stringData.getMaxColLen())
                    .setAvgColLength(stringData.getAvgColLen())
                    .build());
            break;
        }
        case BINARY_STATS: {
            BinaryColumnStatsData binaryData = colData.getBinaryStats();
            builder.setNumNulls(binaryData.getNumNulls());
            // Binary statistics are stored using the StringStats message.
            builder.setBinaryStats(HbaseMetastoreProto.ColumnStats.StringStats.newBuilder()
                    .setMaxColLength(binaryData.getMaxColLen())
                    .setAvgColLength(binaryData.getAvgColLen())
                    .build());
            break;
        }
        case DECIMAL_STATS: {
            DecimalColumnStatsData decimalData = colData.getDecimalStats();
            builder.setNumNulls(decimalData.getNumNulls());
            builder.setNumDistinctValues(decimalData.getNumDVs());
            if (decimalData.isSetBitVectors()) {
                builder.setBitVectors(decimalData.getBitVectors());
            }
            if (decimalData.getLowValue() != null && decimalData.getHighValue() != null) {
                // Build the nested DecimalStats message itself. Previously the
                // trailing build() was applied to the outer ColumnStats builder,
                // producing a complete message that was immediately discarded.
                builder.setDecimalStats(HbaseMetastoreProto.ColumnStats.DecimalStats.newBuilder()
                        .setLowValue(HbaseMetastoreProto.ColumnStats.DecimalStats.Decimal.newBuilder()
                                .setUnscaled(ByteString.copyFrom(decimalData.getLowValue().getUnscaled()))
                                .setScale(decimalData.getLowValue().getScale())
                                .build())
                        .setHighValue(HbaseMetastoreProto.ColumnStats.DecimalStats.Decimal.newBuilder()
                                .setUnscaled(ByteString.copyFrom(decimalData.getHighValue().getUnscaled()))
                                .setScale(decimalData.getHighValue().getScale())
                                .build())
                        .build());
            } else {
                // At least one bound is missing: store an empty decimal-stats message.
                builder.setDecimalStats(HbaseMetastoreProto.ColumnStats.DecimalStats.newBuilder().clear().build());
            }
            break;
        }
        default:
            throw new RuntimeException("Woh, bad.  Unknown stats type!");
    }
    return builder.build();
}
Also used : BooleanColumnStatsData(org.apache.hadoop.hive.metastore.api.BooleanColumnStatsData) DoubleColumnStatsData(org.apache.hadoop.hive.metastore.api.DoubleColumnStatsData) DecimalColumnStatsData(org.apache.hadoop.hive.metastore.api.DecimalColumnStatsData) StringColumnStatsData(org.apache.hadoop.hive.metastore.api.StringColumnStatsData) LongColumnStatsData(org.apache.hadoop.hive.metastore.api.LongColumnStatsData) ColumnStatisticsData(org.apache.hadoop.hive.metastore.api.ColumnStatisticsData) BinaryColumnStatsData(org.apache.hadoop.hive.metastore.api.BinaryColumnStatsData)

Example 62 with ColumnStatisticsData._Fields

use of org.apache.hadoop.hive.metastore.api.ColumnStatisticsData._Fields in project hive by apache.

the class DecimalColumnStatsAggregator method aggregate.

/**
 * Aggregates per-partition decimal column statistics into one
 * {@link ColumnStatisticsObj} for the given column.
 *
 * <p>When statistics exist for every requested partition (or fewer than two
 * statistics objects were supplied) the values are merged directly: lowest low
 * value, highest high value, summed null counts, and an NDV taken from merged
 * bit vectors, from the density estimate, or from the per-partition maximum.
 * Otherwise the available statistics are prepared and passed to
 * {@code extrapolate}.
 *
 * @param colName name of the column being aggregated
 * @param partNames names of all requested partitions; their position in this
 *        list is used as the partition index during extrapolation
 * @param css the statistics that are available, one single-column entry per
 *        partition
 * @return the aggregated statistics object
 * @throws MetaException if a scanned entry of {@code css} does not contain
 *         exactly one column
 */
@Override
public ColumnStatisticsObj aggregate(String colName, List<String> partNames, List<ColumnStatistics> css) throws MetaException {
    ColumnStatisticsObj statsObj = null;
    // Statistics exist for every requested partition when both lists have the same size.
    boolean doAllPartitionContainStats = partNames.size() == css.size();
    // Stays true only if bit vectors are enabled and every scanned entry carries a non-empty one.
    boolean isNDVBitVectorSet = true;
    String colType = null;
    for (ColumnStatistics cs : css) {
        if (cs.getStatsObjSize() != 1) {
            throw new MetaException("The number of columns should be exactly one in aggrStats, but found " + cs.getStatsObjSize());
        }
        ColumnStatisticsObj cso = cs.getStatsObjIterator().next();
        if (statsObj == null) {
            // The first entry determines the type of the result object.
            colType = cso.getColType();
            statsObj = ColumnStatsAggregatorFactory.newColumnStaticsObj(colName, colType, cso.getStatsData().getSetField());
        }
        if (numBitVectors <= 0 || !cso.getStatsData().getDecimalStats().isSetBitVectors() || cso.getStatsData().getDecimalStats().getBitVectors().length() == 0) {
            // One unusable bit vector rules out merging. The scan stops here, so
            // later entries are not validated by the size check above.
            isNDVBitVectorSet = false;
            break;
        }
    }
    ColumnStatisticsData columnStatisticsData = new ColumnStatisticsData();
    if (doAllPartitionContainStats || css.size() < 2) {
        DecimalColumnStatsData aggregateData = null;
        long lowerBound = 0;
        long higherBound = 0;
        double densityAvgSum = 0.0;
        NumDistinctValueEstimator ndvEstimator = null;
        if (isNDVBitVectorSet) {
            ndvEstimator = new NumDistinctValueEstimator(numBitVectors);
        }
        for (ColumnStatistics cs : css) {
            ColumnStatisticsObj cso = cs.getStatsObjIterator().next();
            DecimalColumnStatsData newData = cso.getStatsData().getDecimalStats();
            if (useDensityFunctionForNDVEstimation) {
                // lowerBound: largest single-partition NDV; higherBound: sum of all NDVs.
                lowerBound = Math.max(lowerBound, newData.getNumDVs());
                higherBound += newData.getNumDVs();
                densityAvgSum += (HBaseUtils.getDoubleValue(newData.getHighValue()) - HBaseUtils.getDoubleValue(newData.getLowValue())) / newData.getNumDVs();
            }
            if (isNDVBitVectorSet) {
                ndvEstimator.mergeEstimators(new NumDistinctValueEstimator(newData.getBitVectors(), ndvEstimator.getnumBitVectors()));
            }
            if (aggregateData == null) {
                aggregateData = newData.deepCopy();
            } else {
                // Keep the current bound when it already wins the comparison;
                // otherwise take the new partition's bound.
                if (!(HBaseUtils.getDoubleValue(aggregateData.getLowValue()) < HBaseUtils.getDoubleValue(newData.getLowValue()))) {
                    aggregateData.setLowValue(newData.getLowValue());
                }
                if (!(HBaseUtils.getDoubleValue(aggregateData.getHighValue()) > HBaseUtils.getDoubleValue(newData.getHighValue()))) {
                    aggregateData.setHighValue(newData.getHighValue());
                }
                aggregateData.setNumNulls(aggregateData.getNumNulls() + newData.getNumNulls());
                aggregateData.setNumDVs(Math.max(aggregateData.getNumDVs(), newData.getNumDVs()));
            }
        }
        if (isNDVBitVectorSet) {
            // if all the ColumnStatisticsObjs contain bitvectors, we do not need to
            // use uniform distribution assumption because we can merge bitvectors
            // to get a good estimation.
            aggregateData.setNumDVs(ndvEstimator.estimateNumDistinctValues());
        } else if (useDensityFunctionForNDVEstimation) {
            // We have estimation, lowerbound and higherbound. We use estimation
            // if it is between lowerbound and higherbound.
            double densityAvg = densityAvgSum / partNames.size();
            long estimation = (long) ((HBaseUtils.getDoubleValue(aggregateData.getHighValue()) - HBaseUtils.getDoubleValue(aggregateData.getLowValue())) / densityAvg);
            if (estimation < lowerBound) {
                aggregateData.setNumDVs(lowerBound);
            } else if (estimation > higherBound) {
                aggregateData.setNumDVs(higherBound);
            } else {
                aggregateData.setNumDVs(estimation);
            }
        }
        // Without useDensityFunctionForNDVEstimation, the NDV is the max over all
        // partitions, which the merge loop above has already computed.
        columnStatisticsData.setDecimalStats(aggregateData);
    } else {
        // we need extrapolation
        Map<String, Integer> indexMap = new HashMap<String, Integer>();
        for (int index = 0; index < partNames.size(); index++) {
            indexMap.put(partNames.get(index), index);
        }
        Map<String, Double> adjustedIndexMap = new HashMap<String, Double>();
        Map<String, ColumnStatisticsData> adjustedStatsMap = new HashMap<String, ColumnStatisticsData>();
        // while we scan the css, we also accumulate the density sum that is
        // used when useDensityFunctionForNDVEstimation is true.
        double densityAvgSum = 0.0;
        if (!isNDVBitVectorSet) {
            // the traditional extrapolation methods.
            for (ColumnStatistics cs : css) {
                String partName = cs.getStatsDesc().getPartName();
                ColumnStatisticsObj cso = cs.getStatsObjIterator().next();
                DecimalColumnStatsData newData = cso.getStatsData().getDecimalStats();
                if (useDensityFunctionForNDVEstimation) {
                    densityAvgSum += (HBaseUtils.getDoubleValue(newData.getHighValue()) - HBaseUtils.getDoubleValue(newData.getLowValue())) / newData.getNumDVs();
                }
                adjustedIndexMap.put(partName, (double) indexMap.get(partName));
                adjustedStatsMap.put(partName, cso.getStatsData());
            }
        } else {
            // we first merge all the adjacent bitvectors that we could merge and
            // derive new partition names and index.
            NumDistinctValueEstimator ndvEstimator = new NumDistinctValueEstimator(numBitVectors);
            StringBuilder pseudoPartName = new StringBuilder();
            double pseudoIndexSum = 0;
            int length = 0;
            // Index the next partition must have to extend the current group.
            int curIndex = -1;
            DecimalColumnStatsData aggregateData = null;
            for (ColumnStatistics cs : css) {
                String partName = cs.getStatsDesc().getPartName();
                ColumnStatisticsObj cso = cs.getStatsObjIterator().next();
                DecimalColumnStatsData newData = cso.getStatsData().getDecimalStats();
                // The presence of a bit vector was already checked in the first scan.
                if (indexMap.get(partName) != curIndex) {
                    // There is bitvector, but it is not adjacent to the previous ones.
                    if (length > 0) {
                        // we have to set ndv
                        adjustedIndexMap.put(pseudoPartName.toString(), pseudoIndexSum / length);
                        aggregateData.setNumDVs(ndvEstimator.estimateNumDistinctValues());
                        ColumnStatisticsData csd = new ColumnStatisticsData();
                        csd.setDecimalStats(aggregateData);
                        adjustedStatsMap.put(pseudoPartName.toString(), csd);
                        if (useDensityFunctionForNDVEstimation) {
                            densityAvgSum += (HBaseUtils.getDoubleValue(aggregateData.getHighValue()) - HBaseUtils.getDoubleValue(aggregateData.getLowValue())) / aggregateData.getNumDVs();
                        }
                        // reset everything
                        pseudoPartName = new StringBuilder();
                        pseudoIndexSum = 0;
                        length = 0;
                        // Start the next group with an empty estimator; otherwise its
                        // NDV would also count the values of all previous groups.
                        ndvEstimator = new NumDistinctValueEstimator(numBitVectors);
                    }
                    aggregateData = null;
                }
                curIndex = indexMap.get(partName);
                pseudoPartName.append(partName);
                pseudoIndexSum += curIndex;
                length++;
                curIndex++;
                if (aggregateData == null) {
                    aggregateData = newData.deepCopy();
                } else {
                    // Keep the current bound when it already wins the comparison;
                    // otherwise take the new partition's bound.
                    if (!(HBaseUtils.getDoubleValue(aggregateData.getLowValue()) < HBaseUtils.getDoubleValue(newData.getLowValue()))) {
                        aggregateData.setLowValue(newData.getLowValue());
                    }
                    if (!(HBaseUtils.getDoubleValue(aggregateData.getHighValue()) > HBaseUtils.getDoubleValue(newData.getHighValue()))) {
                        aggregateData.setHighValue(newData.getHighValue());
                    }
                    aggregateData.setNumNulls(aggregateData.getNumNulls() + newData.getNumNulls());
                }
                ndvEstimator.mergeEstimators(new NumDistinctValueEstimator(newData.getBitVectors(), ndvEstimator.getnumBitVectors()));
            }
            if (length > 0) {
                // Flush the last group: we have to set ndv
                adjustedIndexMap.put(pseudoPartName.toString(), pseudoIndexSum / length);
                aggregateData.setNumDVs(ndvEstimator.estimateNumDistinctValues());
                ColumnStatisticsData csd = new ColumnStatisticsData();
                csd.setDecimalStats(aggregateData);
                adjustedStatsMap.put(pseudoPartName.toString(), csd);
                if (useDensityFunctionForNDVEstimation) {
                    densityAvgSum += (HBaseUtils.getDoubleValue(aggregateData.getHighValue()) - HBaseUtils.getDoubleValue(aggregateData.getLowValue())) / aggregateData.getNumDVs();
                }
            }
        }
        extrapolate(columnStatisticsData, partNames.size(), css.size(), adjustedIndexMap, adjustedStatsMap, densityAvgSum / adjustedStatsMap.size());
    }
    statsObj.setStatsData(columnStatisticsData);
    return statsObj;
}
Also used : ColumnStatistics(org.apache.hadoop.hive.metastore.api.ColumnStatistics) HashMap(java.util.HashMap) NumDistinctValueEstimator(org.apache.hadoop.hive.metastore.NumDistinctValueEstimator) ColumnStatisticsObj(org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj) DecimalColumnStatsData(org.apache.hadoop.hive.metastore.api.DecimalColumnStatsData) ColumnStatisticsData(org.apache.hadoop.hive.metastore.api.ColumnStatisticsData) MetaException(org.apache.hadoop.hive.metastore.api.MetaException)

Example 63 with ColumnStatisticsData._Fields

use of org.apache.hadoop.hive.metastore.api.ColumnStatisticsData._Fields in project hive by apache.

the class DoubleColumnStatsAggregator method extrapolate.

/**
 * Extrapolates double column statistics for all partitions from the subset of
 * partitions that have statistics, and stores the result in
 * {@code extrapolateData}.
 *
 * <p>The low value, high value and (when the density function is not used) the
 * NDV are extrapolated linearly between the partitions holding the smallest
 * and largest observed value, out to the left border (index 0) or the right
 * border (index {@code numParts}). The null count is scaled by
 * {@code numParts / numPartsWithStats}.
 *
 * @param extrapolateData receives the extrapolated double statistics
 * @param numParts total number of partitions; also used as the right border index
 * @param numPartsWithStats number of partitions that have statistics
 * @param adjustedIndexMap partition (or merged pseudo-partition) name to index
 * @param adjustedStatsMap partition (or merged pseudo-partition) name to
 *        statistics; must not be empty
 * @param densityAvg average density used for the NDV estimate when
 *        {@code useDensityFunctionForNDVEstimation} is set and this value is non-zero
 */
@Override
public void extrapolate(ColumnStatisticsData extrapolateData, int numParts, int numPartsWithStats, Map<String, Double> adjustedIndexMap, Map<String, ColumnStatisticsData> adjustedStatsMap, double densityAvg) {
    int rightBorderInd = numParts;
    DoubleColumnStatsData extrapolateDoubleData = new DoubleColumnStatsData();
    Map<String, DoubleColumnStatsData> extractedAdjustedStatsMap = new HashMap<>();
    for (Map.Entry<String, ColumnStatisticsData> entry : adjustedStatsMap.entrySet()) {
        extractedAdjustedStatsMap.put(entry.getKey(), entry.getValue().getDoubleStats());
    }
    List<Map.Entry<String, DoubleColumnStatsData>> list = new LinkedList<Map.Entry<String, DoubleColumnStatsData>>(extractedAdjustedStatsMap.entrySet());
    // get the lowValue
    // The comparators use Double.compare/Long.compare so that equal values
    // compare as 0; a comparator that never returns 0 breaks the Comparator
    // contract and can make Collections.sort throw IllegalArgumentException.
    Collections.sort(list, new Comparator<Map.Entry<String, DoubleColumnStatsData>>() {

        @Override
        public int compare(Map.Entry<String, DoubleColumnStatsData> o1, Map.Entry<String, DoubleColumnStatsData> o2) {
            return Double.compare(o1.getValue().getLowValue(), o2.getValue().getLowValue());
        }
    });
    double minInd = adjustedIndexMap.get(list.get(0).getKey());
    double maxInd = adjustedIndexMap.get(list.get(list.size() - 1).getKey());
    double lowValue = 0;
    double min = list.get(0).getValue().getLowValue();
    double max = list.get(list.size() - 1).getValue().getLowValue();
    if (minInd == maxInd) {
        lowValue = min;
    } else if (minInd < maxInd) {
        // left border is the min
        lowValue = (max - (max - min) * maxInd / (maxInd - minInd));
    } else {
        // right border is the min
        lowValue = (max - (max - min) * (rightBorderInd - maxInd) / (minInd - maxInd));
    }
    // get the highValue
    Collections.sort(list, new Comparator<Map.Entry<String, DoubleColumnStatsData>>() {

        @Override
        public int compare(Map.Entry<String, DoubleColumnStatsData> o1, Map.Entry<String, DoubleColumnStatsData> o2) {
            return Double.compare(o1.getValue().getHighValue(), o2.getValue().getHighValue());
        }
    });
    minInd = adjustedIndexMap.get(list.get(0).getKey());
    maxInd = adjustedIndexMap.get(list.get(list.size() - 1).getKey());
    double highValue = 0;
    min = list.get(0).getValue().getHighValue();
    max = list.get(list.size() - 1).getValue().getHighValue();
    if (minInd == maxInd) {
        highValue = min;
    } else if (minInd < maxInd) {
        // right border is the max
        highValue = (min + (max - min) * (rightBorderInd - minInd) / (maxInd - minInd));
    } else {
        // left border is the max
        highValue = (min + (max - min) * minInd / (minInd - maxInd));
    }
    // get the #nulls
    long numNulls = 0;
    for (Map.Entry<String, DoubleColumnStatsData> entry : extractedAdjustedStatsMap.entrySet()) {
        numNulls += entry.getValue().getNumNulls();
    }
    // we scale up numNulls based on the number of partitions
    numNulls = numNulls * numParts / numPartsWithStats;
    // get the ndv
    long ndv = 0;
    Collections.sort(list, new Comparator<Map.Entry<String, DoubleColumnStatsData>>() {

        @Override
        public int compare(Map.Entry<String, DoubleColumnStatsData> o1, Map.Entry<String, DoubleColumnStatsData> o2) {
            return Long.compare(o1.getValue().getNumDVs(), o2.getValue().getNumDVs());
        }
    });
    // lowerBound: largest single-partition NDV; higherBound: sum of all NDVs.
    long lowerBound = list.get(list.size() - 1).getValue().getNumDVs();
    long higherBound = 0;
    for (Map.Entry<String, DoubleColumnStatsData> entry : list) {
        higherBound += entry.getValue().getNumDVs();
    }
    if (useDensityFunctionForNDVEstimation && densityAvg != 0.0) {
        // Density-based estimate, clamped to [lowerBound, higherBound].
        ndv = (long) ((highValue - lowValue) / densityAvg);
        if (ndv < lowerBound) {
            ndv = lowerBound;
        } else if (ndv > higherBound) {
            ndv = higherBound;
        }
    } else {
        minInd = adjustedIndexMap.get(list.get(0).getKey());
        maxInd = adjustedIndexMap.get(list.get(list.size() - 1).getKey());
        long ndvMin = list.get(0).getValue().getNumDVs();
        long ndvMax = list.get(list.size() - 1).getValue().getNumDVs();
        if (minInd == maxInd) {
            ndv = ndvMin;
        } else if (minInd < maxInd) {
            // right border is the max
            ndv = (long) (ndvMin + (ndvMax - ndvMin) * (rightBorderInd - minInd) / (maxInd - minInd));
        } else {
            // left border is the max
            ndv = (long) (ndvMin + (ndvMax - ndvMin) * minInd / (minInd - maxInd));
        }
    }
    extrapolateDoubleData.setLowValue(lowValue);
    extrapolateDoubleData.setHighValue(highValue);
    extrapolateDoubleData.setNumNulls(numNulls);
    extrapolateDoubleData.setNumDVs(ndv);
    extrapolateData.setDoubleStats(extrapolateDoubleData);
}
Also used : HashMap(java.util.HashMap) LinkedList(java.util.LinkedList) DoubleColumnStatsData(org.apache.hadoop.hive.metastore.api.DoubleColumnStatsData) Map(java.util.Map) HashMap(java.util.HashMap) ColumnStatisticsData(org.apache.hadoop.hive.metastore.api.ColumnStatisticsData)

Example 64 with ColumnStatisticsData._Fields

use of org.apache.hadoop.hive.metastore.api.ColumnStatisticsData._Fields in project hive by apache.

the class DoubleColumnStatsAggregator method aggregate.

/**
 * Aggregates per-partition double column statistics into one
 * {@link ColumnStatisticsObj} for the given column.
 *
 * <p>When statistics exist for every requested partition (or fewer than two
 * statistics objects were supplied) the values are merged directly: minimum
 * low value, maximum high value, summed null counts, and an NDV taken from
 * merged bit vectors, from the density estimate, or from the per-partition
 * maximum. Otherwise the available statistics are prepared and passed to
 * {@code extrapolate}.
 *
 * @param colName name of the column being aggregated
 * @param partNames names of all requested partitions; their position in this
 *        list is used as the partition index during extrapolation
 * @param css the statistics that are available, one single-column entry per
 *        partition
 * @return the aggregated statistics object
 * @throws MetaException if a scanned entry of {@code css} does not contain
 *         exactly one column
 */
@Override
public ColumnStatisticsObj aggregate(String colName, List<String> partNames, List<ColumnStatistics> css) throws MetaException {
    ColumnStatisticsObj statsObj = null;
    // Statistics exist for every requested partition when both lists have the same size.
    boolean doAllPartitionContainStats = partNames.size() == css.size();
    // Stays true only if bit vectors are enabled and every scanned entry carries a non-empty one.
    boolean isNDVBitVectorSet = true;
    String colType = null;
    for (ColumnStatistics cs : css) {
        if (cs.getStatsObjSize() != 1) {
            throw new MetaException("The number of columns should be exactly one in aggrStats, but found " + cs.getStatsObjSize());
        }
        ColumnStatisticsObj cso = cs.getStatsObjIterator().next();
        if (statsObj == null) {
            // The first entry determines the type of the result object.
            colType = cso.getColType();
            statsObj = ColumnStatsAggregatorFactory.newColumnStaticsObj(colName, colType, cso.getStatsData().getSetField());
        }
        if (numBitVectors <= 0 || !cso.getStatsData().getDoubleStats().isSetBitVectors() || cso.getStatsData().getDoubleStats().getBitVectors().length() == 0) {
            // One unusable bit vector rules out merging. The scan stops here, so
            // later entries are not validated by the size check above.
            isNDVBitVectorSet = false;
            break;
        }
    }
    ColumnStatisticsData columnStatisticsData = new ColumnStatisticsData();
    if (doAllPartitionContainStats || css.size() < 2) {
        DoubleColumnStatsData aggregateData = null;
        long lowerBound = 0;
        long higherBound = 0;
        double densityAvgSum = 0.0;
        NumDistinctValueEstimator ndvEstimator = null;
        if (isNDVBitVectorSet) {
            ndvEstimator = new NumDistinctValueEstimator(numBitVectors);
        }
        for (ColumnStatistics cs : css) {
            ColumnStatisticsObj cso = cs.getStatsObjIterator().next();
            DoubleColumnStatsData newData = cso.getStatsData().getDoubleStats();
            if (useDensityFunctionForNDVEstimation) {
                // lowerBound: largest single-partition NDV; higherBound: sum of all NDVs.
                lowerBound = Math.max(lowerBound, newData.getNumDVs());
                higherBound += newData.getNumDVs();
                densityAvgSum += (newData.getHighValue() - newData.getLowValue()) / newData.getNumDVs();
            }
            if (isNDVBitVectorSet) {
                ndvEstimator.mergeEstimators(new NumDistinctValueEstimator(newData.getBitVectors(), ndvEstimator.getnumBitVectors()));
            }
            if (aggregateData == null) {
                aggregateData = newData.deepCopy();
            } else {
                aggregateData.setLowValue(Math.min(aggregateData.getLowValue(), newData.getLowValue()));
                aggregateData.setHighValue(Math.max(aggregateData.getHighValue(), newData.getHighValue()));
                aggregateData.setNumNulls(aggregateData.getNumNulls() + newData.getNumNulls());
                aggregateData.setNumDVs(Math.max(aggregateData.getNumDVs(), newData.getNumDVs()));
            }
        }
        if (isNDVBitVectorSet) {
            // if all the ColumnStatisticsObjs contain bitvectors, we do not need to
            // use uniform distribution assumption because we can merge bitvectors
            // to get a good estimation.
            aggregateData.setNumDVs(ndvEstimator.estimateNumDistinctValues());
        } else if (useDensityFunctionForNDVEstimation) {
            // We have estimation, lowerbound and higherbound. We use estimation
            // if it is between lowerbound and higherbound.
            double densityAvg = densityAvgSum / partNames.size();
            long estimation = (long) ((aggregateData.getHighValue() - aggregateData.getLowValue()) / densityAvg);
            if (estimation < lowerBound) {
                aggregateData.setNumDVs(lowerBound);
            } else if (estimation > higherBound) {
                aggregateData.setNumDVs(higherBound);
            } else {
                aggregateData.setNumDVs(estimation);
            }
        }
        // Without useDensityFunctionForNDVEstimation, the NDV is the max over all
        // partitions, which the merge loop above has already computed.
        columnStatisticsData.setDoubleStats(aggregateData);
    } else {
        // we need extrapolation
        Map<String, Integer> indexMap = new HashMap<String, Integer>();
        for (int index = 0; index < partNames.size(); index++) {
            indexMap.put(partNames.get(index), index);
        }
        Map<String, Double> adjustedIndexMap = new HashMap<String, Double>();
        Map<String, ColumnStatisticsData> adjustedStatsMap = new HashMap<String, ColumnStatisticsData>();
        // while we scan the css, we also accumulate the density sum that is
        // used when useDensityFunctionForNDVEstimation is true.
        double densityAvgSum = 0.0;
        if (!isNDVBitVectorSet) {
            // the traditional extrapolation methods.
            for (ColumnStatistics cs : css) {
                String partName = cs.getStatsDesc().getPartName();
                ColumnStatisticsObj cso = cs.getStatsObjIterator().next();
                DoubleColumnStatsData newData = cso.getStatsData().getDoubleStats();
                if (useDensityFunctionForNDVEstimation) {
                    densityAvgSum += (newData.getHighValue() - newData.getLowValue()) / newData.getNumDVs();
                }
                adjustedIndexMap.put(partName, (double) indexMap.get(partName));
                adjustedStatsMap.put(partName, cso.getStatsData());
            }
        } else {
            // we first merge all the adjacent bitvectors that we could merge and
            // derive new partition names and index.
            NumDistinctValueEstimator ndvEstimator = new NumDistinctValueEstimator(numBitVectors);
            StringBuilder pseudoPartName = new StringBuilder();
            double pseudoIndexSum = 0;
            int length = 0;
            // Index the next partition must have to extend the current group.
            int curIndex = -1;
            DoubleColumnStatsData aggregateData = null;
            for (ColumnStatistics cs : css) {
                String partName = cs.getStatsDesc().getPartName();
                ColumnStatisticsObj cso = cs.getStatsObjIterator().next();
                DoubleColumnStatsData newData = cso.getStatsData().getDoubleStats();
                // The presence of a bit vector was already checked in the first scan.
                if (indexMap.get(partName) != curIndex) {
                    // There is bitvector, but it is not adjacent to the previous ones.
                    if (length > 0) {
                        // we have to set ndv
                        adjustedIndexMap.put(pseudoPartName.toString(), pseudoIndexSum / length);
                        aggregateData.setNumDVs(ndvEstimator.estimateNumDistinctValues());
                        ColumnStatisticsData csd = new ColumnStatisticsData();
                        csd.setDoubleStats(aggregateData);
                        adjustedStatsMap.put(pseudoPartName.toString(), csd);
                        if (useDensityFunctionForNDVEstimation) {
                            densityAvgSum += (aggregateData.getHighValue() - aggregateData.getLowValue()) / aggregateData.getNumDVs();
                        }
                        // reset everything
                        pseudoPartName = new StringBuilder();
                        pseudoIndexSum = 0;
                        length = 0;
                        // Start the next group with an empty estimator; otherwise its
                        // NDV would also count the values of all previous groups.
                        ndvEstimator = new NumDistinctValueEstimator(numBitVectors);
                    }
                    aggregateData = null;
                }
                curIndex = indexMap.get(partName);
                pseudoPartName.append(partName);
                pseudoIndexSum += curIndex;
                length++;
                curIndex++;
                if (aggregateData == null) {
                    aggregateData = newData.deepCopy();
                } else {
                    aggregateData.setLowValue(Math.min(aggregateData.getLowValue(), newData.getLowValue()));
                    aggregateData.setHighValue(Math.max(aggregateData.getHighValue(), newData.getHighValue()));
                    aggregateData.setNumNulls(aggregateData.getNumNulls() + newData.getNumNulls());
                }
                ndvEstimator.mergeEstimators(new NumDistinctValueEstimator(newData.getBitVectors(), ndvEstimator.getnumBitVectors()));
            }
            if (length > 0) {
                // Flush the last group: we have to set ndv
                adjustedIndexMap.put(pseudoPartName.toString(), pseudoIndexSum / length);
                aggregateData.setNumDVs(ndvEstimator.estimateNumDistinctValues());
                ColumnStatisticsData csd = new ColumnStatisticsData();
                csd.setDoubleStats(aggregateData);
                adjustedStatsMap.put(pseudoPartName.toString(), csd);
                if (useDensityFunctionForNDVEstimation) {
                    densityAvgSum += (aggregateData.getHighValue() - aggregateData.getLowValue()) / aggregateData.getNumDVs();
                }
            }
        }
        extrapolate(columnStatisticsData, partNames.size(), css.size(), adjustedIndexMap, adjustedStatsMap, densityAvgSum / adjustedStatsMap.size());
    }
    statsObj.setStatsData(columnStatisticsData);
    return statsObj;
}
Also used : ColumnStatistics(org.apache.hadoop.hive.metastore.api.ColumnStatistics) HashMap(java.util.HashMap) NumDistinctValueEstimator(org.apache.hadoop.hive.metastore.NumDistinctValueEstimator) ColumnStatisticsObj(org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj) DoubleColumnStatsData(org.apache.hadoop.hive.metastore.api.DoubleColumnStatsData) ColumnStatisticsData(org.apache.hadoop.hive.metastore.api.ColumnStatisticsData) MetaException(org.apache.hadoop.hive.metastore.api.MetaException)

Example 65 with ColumnStatisticsData._Fields

use of org.apache.hadoop.hive.metastore.api.ColumnStatisticsData._Fields in project hive by apache.

the class LongColumnStatsAggregator method extrapolate.

/**
 * Extrapolates long-column statistics for a set of partitions from the subset that has stats,
 * and stores the result into {@code extrapolateData} via {@code setLongStats}.
 *
 * <p>Low value, high value and (when the density function is not used) NDV are each
 * linearly extrapolated: the entries are sorted by the statistic in question, the smallest
 * and largest entries are looked up in {@code adjustedIndexMap}, and the line through those
 * two points is extended to index 0 or to {@code numParts}. The null count is scaled by
 * {@code numParts / numPartsWithStats}.
 *
 * <p>The comparators use {@link Long#compare(long, long)} so that entries with equal values
 * compare as equal. A comparator that never returns 0 breaks the {@link Comparator} contract
 * and can make {@link Collections#sort} throw {@code IllegalArgumentException}.
 *
 * @param extrapolateData   output holder; its long stats are overwritten
 * @param numParts          total number of partitions; also used as the right border index
 * @param numPartsWithStats divisor used when scaling the null count; a value of 0 causes an
 *                          {@code ArithmeticException} in the integer division
 * @param adjustedIndexMap  index for every key of {@code adjustedStatsMap}; a missing key
 *                          causes a {@code NullPointerException} on unboxing
 * @param adjustedStatsMap  per-key stats whose long stats are read; must be non-empty because
 *                          the first and last sorted entries are accessed unconditionally
 * @param densityAvg        average density; only consulted when
 *                          {@code useDensityFunctionForNDVEstimation} is set and it is non-zero
 */
@Override
public void extrapolate(ColumnStatisticsData extrapolateData, int numParts, int numPartsWithStats, Map<String, Double> adjustedIndexMap, Map<String, ColumnStatisticsData> adjustedStatsMap, double densityAvg) {
    int rightBorderInd = numParts;
    LongColumnStatsData extrapolateLongData = new LongColumnStatsData();
    // Unwrap the long stats once so the comparators and loops below work on the concrete type.
    Map<String, LongColumnStatsData> extractedAdjustedStatsMap = new HashMap<>();
    for (Map.Entry<String, ColumnStatisticsData> entry : adjustedStatsMap.entrySet()) {
        extractedAdjustedStatsMap.put(entry.getKey(), entry.getValue().getLongStats());
    }
    List<Map.Entry<String, LongColumnStatsData>> list = new LinkedList<Map.Entry<String, LongColumnStatsData>>(extractedAdjustedStatsMap.entrySet());
    // get the lowValue
    Collections.sort(list, new Comparator<Map.Entry<String, LongColumnStatsData>>() {

        @Override
        public int compare(Map.Entry<String, LongColumnStatsData> o1, Map.Entry<String, LongColumnStatsData> o2) {
            // Long.compare returns 0 for ties, keeping the comparator contract intact.
            return Long.compare(o1.getValue().getLowValue(), o2.getValue().getLowValue());
        }
    });
    double minInd = adjustedIndexMap.get(list.get(0).getKey());
    double maxInd = adjustedIndexMap.get(list.get(list.size() - 1).getKey());
    long lowValue = 0;
    long min = list.get(0).getValue().getLowValue();
    long max = list.get(list.size() - 1).getValue().getLowValue();
    if (minInd == maxInd) {
        // Both extremes sit at the same index: nothing to extrapolate.
        lowValue = min;
    } else if (minInd < maxInd) {
        // left border is the min
        lowValue = (long) (max - (max - min) * maxInd / (maxInd - minInd));
    } else {
        // right border is the min
        lowValue = (long) (max - (max - min) * (rightBorderInd - maxInd) / (minInd - maxInd));
    }
    // get the highValue
    Collections.sort(list, new Comparator<Map.Entry<String, LongColumnStatsData>>() {

        @Override
        public int compare(Map.Entry<String, LongColumnStatsData> o1, Map.Entry<String, LongColumnStatsData> o2) {
            return Long.compare(o1.getValue().getHighValue(), o2.getValue().getHighValue());
        }
    });
    minInd = adjustedIndexMap.get(list.get(0).getKey());
    maxInd = adjustedIndexMap.get(list.get(list.size() - 1).getKey());
    long highValue = 0;
    min = list.get(0).getValue().getHighValue();
    max = list.get(list.size() - 1).getValue().getHighValue();
    if (minInd == maxInd) {
        highValue = min;
    } else if (minInd < maxInd) {
        // right border is the max
        highValue = (long) (min + (max - min) * (rightBorderInd - minInd) / (maxInd - minInd));
    } else {
        // left border is the max
        highValue = (long) (min + (max - min) * minInd / (minInd - maxInd));
    }
    // get the #nulls
    long numNulls = 0;
    for (Map.Entry<String, LongColumnStatsData> entry : extractedAdjustedStatsMap.entrySet()) {
        numNulls += entry.getValue().getNumNulls();
    }
    // we scale up sumNulls based on the number of partitions
    numNulls = numNulls * numParts / numPartsWithStats;
    // get the ndv
    long ndv = 0;
    Collections.sort(list, new Comparator<Map.Entry<String, LongColumnStatsData>>() {

        @Override
        public int compare(Map.Entry<String, LongColumnStatsData> o1, Map.Entry<String, LongColumnStatsData> o2) {
            return Long.compare(o1.getValue().getNumDVs(), o2.getValue().getNumDVs());
        }
    });
    // The NDV estimate is clamped between the largest single NDV and the sum of all NDVs.
    long lowerBound = list.get(list.size() - 1).getValue().getNumDVs();
    long higherBound = 0;
    for (Map.Entry<String, LongColumnStatsData> entry : list) {
        higherBound += entry.getValue().getNumDVs();
    }
    if (useDensityFunctionForNDVEstimation && densityAvg != 0.0) {
        // Density-based estimate: value range divided by the average density, then clamped.
        ndv = (long) ((highValue - lowValue) / densityAvg);
        if (ndv < lowerBound) {
            ndv = lowerBound;
        } else if (ndv > higherBound) {
            ndv = higherBound;
        }
    } else {
        // Fall back to the same linear extrapolation used for the high value.
        minInd = adjustedIndexMap.get(list.get(0).getKey());
        maxInd = adjustedIndexMap.get(list.get(list.size() - 1).getKey());
        min = list.get(0).getValue().getNumDVs();
        max = list.get(list.size() - 1).getValue().getNumDVs();
        if (minInd == maxInd) {
            ndv = min;
        } else if (minInd < maxInd) {
            // right border is the max
            ndv = (long) (min + (max - min) * (rightBorderInd - minInd) / (maxInd - minInd));
        } else {
            // left border is the max
            ndv = (long) (min + (max - min) * minInd / (minInd - maxInd));
        }
    }
    extrapolateLongData.setLowValue(lowValue);
    extrapolateLongData.setHighValue(highValue);
    extrapolateLongData.setNumNulls(numNulls);
    extrapolateLongData.setNumDVs(ndv);
    extrapolateData.setLongStats(extrapolateLongData);
}
Also used : HashMap(java.util.HashMap) LongColumnStatsData(org.apache.hadoop.hive.metastore.api.LongColumnStatsData) LinkedList(java.util.LinkedList) Map(java.util.Map) ColumnStatisticsData(org.apache.hadoop.hive.metastore.api.ColumnStatisticsData)

Aggregations

ColumnStatisticsData (org.apache.hadoop.hive.metastore.api.ColumnStatisticsData)108 ColumnStatisticsObj (org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj)95 ColumnStatistics (org.apache.hadoop.hive.metastore.api.ColumnStatistics)62 ColumnStatisticsDesc (org.apache.hadoop.hive.metastore.api.ColumnStatisticsDesc)56 Test (org.junit.Test)53 ArrayList (java.util.ArrayList)47 LongColumnStatsData (org.apache.hadoop.hive.metastore.api.LongColumnStatsData)36 FieldSchema (org.apache.hadoop.hive.metastore.api.FieldSchema)34 Table (org.apache.hadoop.hive.metastore.api.Table)33 StorageDescriptor (org.apache.hadoop.hive.metastore.api.StorageDescriptor)32 BooleanColumnStatsData (org.apache.hadoop.hive.metastore.api.BooleanColumnStatsData)31 SerDeInfo (org.apache.hadoop.hive.metastore.api.SerDeInfo)31 Partition (org.apache.hadoop.hive.metastore.api.Partition)30 AggrStats (org.apache.hadoop.hive.metastore.api.AggrStats)29 DoubleColumnStatsData (org.apache.hadoop.hive.metastore.api.DoubleColumnStatsData)28 StringColumnStatsData (org.apache.hadoop.hive.metastore.api.StringColumnStatsData)26 DecimalColumnStatsData (org.apache.hadoop.hive.metastore.api.DecimalColumnStatsData)24 BinaryColumnStatsData (org.apache.hadoop.hive.metastore.api.BinaryColumnStatsData)23 HashMap (java.util.HashMap)22 List (java.util.List)19