Use of org.apache.hadoop.hive.metastore.utils.MetaStoreServerUtils.ColStatsObjWithSourceInfo in project hive by apache.
From the class DecimalColumnStatsAggregator, method aggregate.
@Override
public ColumnStatisticsObj aggregate(List<ColStatsObjWithSourceInfo> colStatsWithSourceInfo, List<String> partNames, boolean areAllPartsFound) throws MetaException {
ColumnStatisticsObj statsObj = null;
String colType = null;
String colName = null;
// check if all the ColumnStatisticsObjs contain stats and all the ndv are
// bitvectors
boolean doAllPartitionContainStats = partNames.size() == colStatsWithSourceInfo.size();
NumDistinctValueEstimator ndvEstimator = null;
for (ColStatsObjWithSourceInfo csp : colStatsWithSourceInfo) {
ColumnStatisticsObj cso = csp.getColStatsObj();
if (statsObj == null) {
colName = cso.getColName();
colType = cso.getColType();
statsObj = ColumnStatsAggregatorFactory.newColumnStaticsObj(colName, colType, cso.getStatsData().getSetField());
LOG.trace("doAllPartitionContainStats for column: {} is: {}", colName, doAllPartitionContainStats);
}
DecimalColumnStatsDataInspector decimalColumnStatsData = decimalInspectorFromStats(cso);
if (decimalColumnStatsData.getNdvEstimator() == null) {
ndvEstimator = null;
break;
} else {
// check if all of the bit vectors can merge
NumDistinctValueEstimator estimator = decimalColumnStatsData.getNdvEstimator();
if (ndvEstimator == null) {
ndvEstimator = estimator;
} else {
if (ndvEstimator.canMerge(estimator)) {
continue;
} else {
ndvEstimator = null;
break;
}
}
}
}
if (ndvEstimator != null) {
ndvEstimator = NumDistinctValueEstimatorFactory.getEmptyNumDistinctValueEstimator(ndvEstimator);
}
LOG.debug("all of the bit vectors can merge for " + colName + " is " + (ndvEstimator != null));
ColumnStatisticsData columnStatisticsData = new ColumnStatisticsData();
if (doAllPartitionContainStats || colStatsWithSourceInfo.size() < 2) {
DecimalColumnStatsDataInspector aggregateData = null;
long lowerBound = 0;
long higherBound = 0;
double densityAvgSum = 0.0;
for (ColStatsObjWithSourceInfo csp : colStatsWithSourceInfo) {
ColumnStatisticsObj cso = csp.getColStatsObj();
DecimalColumnStatsDataInspector newData = decimalInspectorFromStats(cso);
lowerBound = Math.max(lowerBound, newData.getNumDVs());
higherBound += newData.getNumDVs();
if (newData.isSetLowValue() && newData.isSetHighValue()) {
densityAvgSum += (MetaStoreServerUtils.decimalToDouble(newData.getHighValue()) - MetaStoreServerUtils.decimalToDouble(newData.getLowValue())) / newData.getNumDVs();
}
if (ndvEstimator != null) {
ndvEstimator.mergeEstimators(newData.getNdvEstimator());
}
if (aggregateData == null) {
aggregateData = newData.deepCopy();
} else {
DecimalColumnStatsMerger merger = new DecimalColumnStatsMerger();
merger.setLowValue(aggregateData, newData);
merger.setHighValue(aggregateData, newData);
aggregateData.setNumNulls(aggregateData.getNumNulls() + newData.getNumNulls());
aggregateData.setNumDVs(Math.max(aggregateData.getNumDVs(), newData.getNumDVs()));
}
}
if (ndvEstimator != null) {
// if all the ColumnStatisticsObjs contain bitvectors, we do not need to
// use uniform distribution assumption because we can merge bitvectors
// to get a good estimation.
aggregateData.setNumDVs(ndvEstimator.estimateNumDistinctValues());
} else {
long estimation;
if (useDensityFunctionForNDVEstimation) {
// We have estimation, lowerbound and higherbound. We use estimation
// if it is between lowerbound and higherbound.
double densityAvg = densityAvgSum / partNames.size();
estimation = (long) ((MetaStoreServerUtils.decimalToDouble(aggregateData.getHighValue()) - MetaStoreServerUtils.decimalToDouble(aggregateData.getLowValue())) / densityAvg);
if (estimation < lowerBound) {
estimation = lowerBound;
} else if (estimation > higherBound) {
estimation = higherBound;
}
} else {
estimation = (long) (lowerBound + (higherBound - lowerBound) * ndvTuner);
}
aggregateData.setNumDVs(estimation);
}
columnStatisticsData.setDecimalStats(aggregateData);
} else {
// we need extrapolation
LOG.debug("start extrapolation for " + colName);
Map<String, Integer> indexMap = new HashMap<>();
for (int index = 0; index < partNames.size(); index++) {
indexMap.put(partNames.get(index), index);
}
Map<String, Double> adjustedIndexMap = new HashMap<>();
Map<String, ColumnStatisticsData> adjustedStatsMap = new HashMap<>();
// while we scan the column stats objects, we also get the densityAvg, lowerbound and
// higherbound when useDensityFunctionForNDVEstimation is true.
double densityAvgSum = 0.0;
if (ndvEstimator == null) {
// the traditional extrapolation methods.
for (ColStatsObjWithSourceInfo csp : colStatsWithSourceInfo) {
ColumnStatisticsObj cso = csp.getColStatsObj();
String partName = csp.getPartName();
DecimalColumnStatsData newData = cso.getStatsData().getDecimalStats();
if (useDensityFunctionForNDVEstimation) {
densityAvgSum += (MetaStoreServerUtils.decimalToDouble(newData.getHighValue()) - MetaStoreServerUtils.decimalToDouble(newData.getLowValue())) / newData.getNumDVs();
}
adjustedIndexMap.put(partName, (double) indexMap.get(partName));
adjustedStatsMap.put(partName, cso.getStatsData());
}
} else {
// we first merge all the adjacent bitvectors that we could merge and
// derive new partition names and index.
StringBuilder pseudoPartName = new StringBuilder();
double pseudoIndexSum = 0;
int length = 0;
int curIndex = -1;
DecimalColumnStatsDataInspector aggregateData = null;
for (ColStatsObjWithSourceInfo csp : colStatsWithSourceInfo) {
ColumnStatisticsObj cso = csp.getColStatsObj();
String partName = csp.getPartName();
DecimalColumnStatsDataInspector newData = decimalInspectorFromStats(cso);
// already checked it before.
if (indexMap.get(partName) != curIndex) {
// There is bitvector, but it is not adjacent to the previous ones.
if (length > 0) {
// we have to set ndv
adjustedIndexMap.put(pseudoPartName.toString(), pseudoIndexSum / length);
aggregateData.setNumDVs(ndvEstimator.estimateNumDistinctValues());
ColumnStatisticsData csd = new ColumnStatisticsData();
csd.setDecimalStats(aggregateData);
adjustedStatsMap.put(pseudoPartName.toString(), csd);
if (useDensityFunctionForNDVEstimation) {
densityAvgSum += (MetaStoreServerUtils.decimalToDouble(aggregateData.getHighValue()) - MetaStoreServerUtils.decimalToDouble(aggregateData.getLowValue())) / aggregateData.getNumDVs();
}
// reset everything
pseudoPartName = new StringBuilder();
pseudoIndexSum = 0;
length = 0;
ndvEstimator = NumDistinctValueEstimatorFactory.getEmptyNumDistinctValueEstimator(ndvEstimator);
}
aggregateData = null;
}
curIndex = indexMap.get(partName);
pseudoPartName.append(partName);
pseudoIndexSum += curIndex;
length++;
curIndex++;
if (aggregateData == null) {
aggregateData = newData.deepCopy();
} else {
if (MetaStoreServerUtils.decimalToDouble(aggregateData.getLowValue()) < MetaStoreServerUtils.decimalToDouble(newData.getLowValue())) {
aggregateData.setLowValue(aggregateData.getLowValue());
} else {
aggregateData.setLowValue(newData.getLowValue());
}
if (MetaStoreServerUtils.decimalToDouble(aggregateData.getHighValue()) > MetaStoreServerUtils.decimalToDouble(newData.getHighValue())) {
aggregateData.setHighValue(aggregateData.getHighValue());
} else {
aggregateData.setHighValue(newData.getHighValue());
}
aggregateData.setNumNulls(aggregateData.getNumNulls() + newData.getNumNulls());
}
ndvEstimator.mergeEstimators(newData.getNdvEstimator());
}
if (length > 0) {
// we have to set ndv
adjustedIndexMap.put(pseudoPartName.toString(), pseudoIndexSum / length);
aggregateData.setNumDVs(ndvEstimator.estimateNumDistinctValues());
ColumnStatisticsData csd = new ColumnStatisticsData();
csd.setDecimalStats(aggregateData);
adjustedStatsMap.put(pseudoPartName.toString(), csd);
if (useDensityFunctionForNDVEstimation) {
densityAvgSum += (MetaStoreServerUtils.decimalToDouble(aggregateData.getHighValue()) - MetaStoreServerUtils.decimalToDouble(aggregateData.getLowValue())) / aggregateData.getNumDVs();
}
}
}
extrapolate(columnStatisticsData, partNames.size(), colStatsWithSourceInfo.size(), adjustedIndexMap, adjustedStatsMap, densityAvgSum / adjustedStatsMap.size());
}
LOG.debug("Ndv estimatation for {} is {} # of partitions requested: {} # of partitions found: {}", colName, columnStatisticsData.getDecimalStats().getNumDVs(), partNames.size(), colStatsWithSourceInfo.size());
statsObj.setStatsData(columnStatisticsData);
return statsObj;
}
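The non-bitvector branch above derives the NDV from two formulas: a density-based estimate clamped between the largest per-partition NDV (lowerBound) and the sum of per-partition NDVs (higherBound), or, when the density function is disabled, a linear interpolation between the two bounds controlled by ndvTuner. The standalone sketch below reproduces only that arithmetic; the class name NdvEstimationSketch and all numeric values are invented for illustration.

// Hypothetical standalone sketch of the NDV estimation formulas used in the aggregator above.
// All values are invented; only the arithmetic mirrors the code.
public class NdvEstimationSketch {
  public static void main(String[] args) {
    long[] partitionNdvs = {100, 120, 90};   // per-partition numDVs
    double[] highs = {200.0, 250.0, 180.0};  // per-partition high values (as doubles)
    double[] lows = {0.0, 10.0, 5.0};        // per-partition low values (as doubles)

    long lowerBound = 0;        // max of the per-partition NDVs
    long higherBound = 0;       // sum of the per-partition NDVs
    double densityAvgSum = 0.0; // sum of (high - low) / ndv over partitions
    for (int i = 0; i < partitionNdvs.length; i++) {
      lowerBound = Math.max(lowerBound, partitionNdvs[i]);
      higherBound += partitionNdvs[i];
      densityAvgSum += (highs[i] - lows[i]) / partitionNdvs[i];
    }

    // Density-function estimate: global value range divided by the average density,
    // clamped into [lowerBound, higherBound]. The aggregator divides densityAvgSum by the
    // number of requested partitions; here every partition has stats, so the counts match.
    double densityAvg = densityAvgSum / partitionNdvs.length;
    double globalHigh = 250.0, globalLow = 0.0; // aggregated high/low values
    long densityEstimate = (long) ((globalHigh - globalLow) / densityAvg);
    densityEstimate = Math.max(lowerBound, Math.min(higherBound, densityEstimate));

    // ndvTuner estimate: linear interpolation between the bounds (0 -> lowerBound, 1 -> higherBound).
    double ndvTuner = 0.5;
    long tunerEstimate = (long) (lowerBound + (higherBound - lowerBound) * ndvTuner);

    System.out.println("density estimate = " + densityEstimate + ", tuner estimate = " + tunerEstimate);
  }
}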
Use of org.apache.hadoop.hive.metastore.utils.MetaStoreServerUtils.ColStatsObjWithSourceInfo in project hive by apache.
From the class BinaryColumnStatsAggregator, method aggregate.
@Override
public ColumnStatisticsObj aggregate(List<ColStatsObjWithSourceInfo> colStatsWithSourceInfo, List<String> partNames, boolean areAllPartsFound) throws MetaException {
ColumnStatisticsObj statsObj = null;
String colType = null;
String colName = null;
BinaryColumnStatsData aggregateData = null;
for (ColStatsObjWithSourceInfo csp : colStatsWithSourceInfo) {
ColumnStatisticsObj cso = csp.getColStatsObj();
if (statsObj == null) {
colName = cso.getColName();
colType = cso.getColType();
statsObj = ColumnStatsAggregatorFactory.newColumnStaticsObj(colName, colType, cso.getStatsData().getSetField());
}
BinaryColumnStatsData newData = cso.getStatsData().getBinaryStats();
if (aggregateData == null) {
aggregateData = newData.deepCopy();
} else {
aggregateData.setMaxColLen(Math.max(aggregateData.getMaxColLen(), newData.getMaxColLen()));
aggregateData.setAvgColLen(Math.max(aggregateData.getAvgColLen(), newData.getAvgColLen()));
aggregateData.setNumNulls(aggregateData.getNumNulls() + newData.getNumNulls());
}
}
ColumnStatisticsData columnStatisticsData = new ColumnStatisticsData();
columnStatisticsData.setBinaryStats(aggregateData);
statsObj.setStatsData(columnStatisticsData);
return statsObj;
}
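The binary aggregator above keeps the maximum maxColLen, the maximum avgColLen, and the sum of numNulls across partitions. Below is a minimal usage sketch, assuming the thrift-generated constructors take their required fields in declaration order and that the aggregator class lives in org.apache.hadoop.hive.metastore.columnstats.aggr with a default constructor; the table, partition names, and numbers are made up.

import java.util.Arrays;
import java.util.List;

import org.apache.hadoop.hive.metastore.api.BinaryColumnStatsData;
import org.apache.hadoop.hive.metastore.api.ColumnStatisticsData;
import org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj;
import org.apache.hadoop.hive.metastore.columnstats.aggr.BinaryColumnStatsAggregator;
import org.apache.hadoop.hive.metastore.utils.MetaStoreServerUtils.ColStatsObjWithSourceInfo;

public class BinaryAggregationSketch {
  // Builds one per-partition stats entry; catalog/db/table names are illustrative.
  private static ColStatsObjWithSourceInfo stats(String partName, long maxLen, double avgLen, long nulls) {
    ColumnStatisticsData data = new ColumnStatisticsData();
    data.setBinaryStats(new BinaryColumnStatsData(maxLen, avgLen, nulls));
    ColumnStatisticsObj obj = new ColumnStatisticsObj("payload", "binary", data);
    return new ColStatsObjWithSourceInfo(obj, "hive", "default", "events", partName);
  }

  public static void main(String[] args) throws Exception {
    List<String> partNames = Arrays.asList("ds=2023-01-01", "ds=2023-01-02");
    List<ColStatsObjWithSourceInfo> perPartition = Arrays.asList(
        stats(partNames.get(0), 128, 40.0, 3),
        stats(partNames.get(1), 256, 35.0, 7));

    BinaryColumnStatsAggregator aggregator = new BinaryColumnStatsAggregator();
    ColumnStatisticsObj merged = aggregator.aggregate(perPartition, partNames, true);
    // Expected: maxColLen = 256, avgColLen = 40.0 (the max is kept), numNulls = 10.
    System.out.println(merged.getStatsData().getBinaryStats());
  }
}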
Use of org.apache.hadoop.hive.metastore.utils.MetaStoreServerUtils.ColStatsObjWithSourceInfo in project hive by apache.
From the class StringColumnStatsAggregator, method aggregate.
@Override
public ColumnStatisticsObj aggregate(List<ColStatsObjWithSourceInfo> colStatsWithSourceInfo, List<String> partNames, boolean areAllPartsFound) throws MetaException {
ColumnStatisticsObj statsObj = null;
String colType = null;
String colName = null;
// check if all the ColumnStatisticsObjs contain stats and all the ndv are
// bitvectors
boolean doAllPartitionContainStats = partNames.size() == colStatsWithSourceInfo.size();
NumDistinctValueEstimator ndvEstimator = null;
for (ColStatsObjWithSourceInfo csp : colStatsWithSourceInfo) {
ColumnStatisticsObj cso = csp.getColStatsObj();
if (statsObj == null) {
colName = cso.getColName();
colType = cso.getColType();
statsObj = ColumnStatsAggregatorFactory.newColumnStaticsObj(colName, colType, cso.getStatsData().getSetField());
LOG.trace("doAllPartitionContainStats for column: {} is: {}", colName, doAllPartitionContainStats);
}
StringColumnStatsDataInspector stringColumnStatsData = stringInspectorFromStats(cso);
if (stringColumnStatsData.getNdvEstimator() == null) {
ndvEstimator = null;
break;
} else {
// check if all of the bit vectors can merge
NumDistinctValueEstimator estimator = stringColumnStatsData.getNdvEstimator();
if (ndvEstimator == null) {
ndvEstimator = estimator;
} else {
if (ndvEstimator.canMerge(estimator)) {
continue;
} else {
ndvEstimator = null;
break;
}
}
}
}
if (ndvEstimator != null) {
ndvEstimator = NumDistinctValueEstimatorFactory.getEmptyNumDistinctValueEstimator(ndvEstimator);
}
LOG.debug("all of the bit vectors can merge for " + colName + " is " + (ndvEstimator != null));
ColumnStatisticsData columnStatisticsData = new ColumnStatisticsData();
if (doAllPartitionContainStats || colStatsWithSourceInfo.size() < 2) {
StringColumnStatsDataInspector aggregateData = null;
for (ColStatsObjWithSourceInfo csp : colStatsWithSourceInfo) {
ColumnStatisticsObj cso = csp.getColStatsObj();
StringColumnStatsDataInspector newData = stringInspectorFromStats(cso);
if (ndvEstimator != null) {
ndvEstimator.mergeEstimators(newData.getNdvEstimator());
}
if (aggregateData == null) {
aggregateData = newData.deepCopy();
} else {
aggregateData.setMaxColLen(Math.max(aggregateData.getMaxColLen(), newData.getMaxColLen()));
aggregateData.setAvgColLen(Math.max(aggregateData.getAvgColLen(), newData.getAvgColLen()));
aggregateData.setNumNulls(aggregateData.getNumNulls() + newData.getNumNulls());
aggregateData.setNumDVs(Math.max(aggregateData.getNumDVs(), newData.getNumDVs()));
}
}
if (ndvEstimator != null) {
// if all the ColumnStatisticsObjs contain bitvectors, we do not need to
// use uniform distribution assumption because we can merge bitvectors
// to get a good estimation.
aggregateData.setNumDVs(ndvEstimator.estimateNumDistinctValues());
} else {
// aggregateData already has the ndv of the max of all
}
columnStatisticsData.setStringStats(aggregateData);
} else {
// we need extrapolation
LOG.debug("start extrapolation for " + colName);
Map<String, Integer> indexMap = new HashMap<>();
for (int index = 0; index < partNames.size(); index++) {
indexMap.put(partNames.get(index), index);
}
Map<String, Double> adjustedIndexMap = new HashMap<>();
Map<String, ColumnStatisticsData> adjustedStatsMap = new HashMap<>();
if (ndvEstimator == null) {
// the traditional extrapolation methods.
for (ColStatsObjWithSourceInfo csp : colStatsWithSourceInfo) {
ColumnStatisticsObj cso = csp.getColStatsObj();
String partName = csp.getPartName();
adjustedIndexMap.put(partName, (double) indexMap.get(partName));
adjustedStatsMap.put(partName, cso.getStatsData());
}
} else {
// we first merge all the adjacent bitvectors that we could merge and
// derive new partition names and index.
StringBuilder pseudoPartName = new StringBuilder();
double pseudoIndexSum = 0;
int length = 0;
int curIndex = -1;
StringColumnStatsDataInspector aggregateData = null;
for (ColStatsObjWithSourceInfo csp : colStatsWithSourceInfo) {
ColumnStatisticsObj cso = csp.getColStatsObj();
String partName = csp.getPartName();
StringColumnStatsDataInspector newData = stringInspectorFromStats(cso);
// already checked it before.
if (indexMap.get(partName) != curIndex) {
// There is bitvector, but it is not adjacent to the previous ones.
if (length > 0) {
// we have to set ndv
adjustedIndexMap.put(pseudoPartName.toString(), pseudoIndexSum / length);
aggregateData.setNumDVs(ndvEstimator.estimateNumDistinctValues());
ColumnStatisticsData csd = new ColumnStatisticsData();
csd.setStringStats(aggregateData);
adjustedStatsMap.put(pseudoPartName.toString(), csd);
// reset everything
pseudoPartName = new StringBuilder();
pseudoIndexSum = 0;
length = 0;
ndvEstimator = NumDistinctValueEstimatorFactory.getEmptyNumDistinctValueEstimator(ndvEstimator);
}
aggregateData = null;
}
curIndex = indexMap.get(partName);
pseudoPartName.append(partName);
pseudoIndexSum += curIndex;
length++;
curIndex++;
if (aggregateData == null) {
aggregateData = newData.deepCopy();
} else {
aggregateData.setAvgColLen(Math.max(aggregateData.getAvgColLen(), newData.getAvgColLen()));
aggregateData.setMaxColLen(Math.max(aggregateData.getMaxColLen(), newData.getMaxColLen()));
aggregateData.setNumNulls(aggregateData.getNumNulls() + newData.getNumNulls());
}
ndvEstimator.mergeEstimators(newData.getNdvEstimator());
}
if (length > 0) {
// we have to set ndv
adjustedIndexMap.put(pseudoPartName.toString(), pseudoIndexSum / length);
aggregateData.setNumDVs(ndvEstimator.estimateNumDistinctValues());
ColumnStatisticsData csd = new ColumnStatisticsData();
csd.setStringStats(aggregateData);
adjustedStatsMap.put(pseudoPartName.toString(), csd);
}
}
extrapolate(columnStatisticsData, partNames.size(), colStatsWithSourceInfo.size(), adjustedIndexMap, adjustedStatsMap, -1);
}
LOG.debug("Ndv estimatation for {} is {} # of partitions requested: {} # of partitions found: {}", colName, columnStatisticsData.getStringStats().getNumDVs(), partNames.size(), colStatsWithSourceInfo.size());
statsObj.setStatsData(columnStatisticsData);
return statsObj;
}
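In the extrapolation branch above, runs of adjacent partitions whose bitvectors can be merged are collapsed into a single pseudo-partition whose index is the mean of its members' indexes; a gap in the index sequence closes the current run. The sketch below isolates just that grouping step with invented partition names and indexes; the real code also merges the per-run column stats and bitvectors.

import java.util.Arrays;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;

public class AdjacentRunGroupingSketch {
  public static void main(String[] args) {
    // Suppose only these partitions (by position in the requested list) have stats.
    List<String> partNames = Arrays.asList("ds=01", "ds=02", "ds=05", "ds=06", "ds=07");
    int[] indexes = {0, 1, 4, 5, 6}; // positions in the full requested partition list

    Map<String, Double> adjustedIndexMap = new LinkedHashMap<>();
    StringBuilder pseudoPartName = new StringBuilder();
    double pseudoIndexSum = 0;
    int length = 0;
    int curIndex = -1;
    for (int i = 0; i < indexes.length; i++) {
      if (indexes[i] != curIndex && length > 0) {
        // Close the current run: its pseudo index is the mean of its members' indexes.
        adjustedIndexMap.put(pseudoPartName.toString(), pseudoIndexSum / length);
        pseudoPartName = new StringBuilder();
        pseudoIndexSum = 0;
        length = 0;
      }
      curIndex = indexes[i];
      pseudoPartName.append(partNames.get(i));
      pseudoIndexSum += curIndex;
      length++;
      curIndex++; // the next partition is adjacent only if its index equals this value
    }
    if (length > 0) {
      adjustedIndexMap.put(pseudoPartName.toString(), pseudoIndexSum / length);
    }
    // Expected: {ds=01ds=02 -> 0.5, ds=05ds=06ds=07 -> 5.0}
    System.out.println(adjustedIndexMap);
  }
}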
Use of org.apache.hadoop.hive.metastore.utils.MetaStoreServerUtils.ColStatsObjWithSourceInfo in project hive by apache.
From the class CachedStore, method mergeColStatsForPartitions.
private MergedColumnStatsForPartitions mergeColStatsForPartitions(String catName, String dbName, String tblName, List<String> partNames, List<String> colNames, SharedCache sharedCache, StatsType type, String writeIdList) throws MetaException {
final boolean useDensityFunctionForNDVEstimation = MetastoreConf.getBoolVar(getConf(), ConfVars.STATS_NDV_DENSITY_FUNCTION);
final double ndvTuner = MetastoreConf.getDoubleVar(getConf(), ConfVars.STATS_NDV_TUNER);
Map<ColumnStatsAggregator, List<ColStatsObjWithSourceInfo>> colStatsMap = new HashMap<>();
long partsFound = partNames.size();
Map<List<String>, Long> partNameToWriteId = writeIdList != null ? new HashMap<>() : null;
for (String colName : colNames) {
long partsFoundForColumn = 0;
ColumnStatsAggregator colStatsAggregator = null;
List<ColStatsObjWithSourceInfo> colStatsWithPartInfoList = new ArrayList<>();
for (String partName : partNames) {
List<String> partValue = partNameToVals(partName);
// There are three possible results from getPartitionColStatsFromCache:
// 1. The partition has valid stats and thus colStatsWriteId returned is valid non-null value
// 2. Partition stat is missing from cache and thus colStatsWriteId returned is non-null but colstat
// info in it is null. In this case we just ignore the partition from aggregate calculation to keep
// the behavior same as object store.
// 3. Partition is missing or its stat is updated by a live (not yet committed) or aborted txn. In this case,
// colStatsWriteId is null. Thus null is returned to keep the behavior same as object store.
SharedCache.ColumStatsWithWriteId colStatsWriteId = sharedCache.getPartitionColStatsFromCache(catName, dbName, tblName, partValue, colName, writeIdList);
if (colStatsWriteId == null) {
return null;
}
if (colStatsWriteId.getColumnStatisticsObj() != null) {
ColumnStatisticsObj colStatsForPart = colStatsWriteId.getColumnStatisticsObj();
if (partNameToWriteId != null) {
partNameToWriteId.put(partValue, colStatsWriteId.getWriteId());
}
ColStatsObjWithSourceInfo colStatsWithPartInfo = new ColStatsObjWithSourceInfo(colStatsForPart, catName, dbName, tblName, partName);
colStatsWithPartInfoList.add(colStatsWithPartInfo);
if (colStatsAggregator == null) {
colStatsAggregator = ColumnStatsAggregatorFactory.getColumnStatsAggregator(colStatsForPart.getStatsData().getSetField(), useDensityFunctionForNDVEstimation, ndvTuner);
}
partsFoundForColumn++;
} else {
LOG.debug("Stats not found in CachedStore for: dbName={} tblName={} partName={} colName={}", dbName, tblName, partName, colName);
}
}
if (colStatsWithPartInfoList.size() > 0) {
colStatsMap.put(colStatsAggregator, colStatsWithPartInfoList);
}
// set partsFound to the min of partsFoundForColumn over all columns, i.e. the number of partitions for
// which stats for all columns are present in the cache.
if (partsFoundForColumn < partsFound) {
partsFound = partsFoundForColumn;
}
if (colStatsMap.size() < 1) {
LOG.debug("No stats data found for: dbName={} tblName= {} partNames= {} colNames= ", dbName, tblName, partNames, colNames);
// Returning an empty object here will not trigger the lookup in the raw store and we will end up with missing stats.
return new MergedColumnStatsForPartitions(new ArrayList<ColumnStatisticsObj>(), 0);
}
}
// Note that enableBitVector does not apply here because ColumnStatisticsObj
// itself will tell whether bitvector is null or not and aggr logic can automatically apply.
List<ColumnStatisticsObj> colAggrStats = MetaStoreServerUtils.aggrPartitionStats(colStatsMap, partNames, partsFound == partNames.size(), useDensityFunctionForNDVEstimation, ndvTuner);
if (canUseEvents) {
if (type == StatsType.ALL) {
sharedCache.refreshAggregateStatsInCache(StringUtils.normalizeIdentifier(catName), StringUtils.normalizeIdentifier(dbName), StringUtils.normalizeIdentifier(tblName), new AggrStats(colAggrStats, partsFound), null, partNameToWriteId);
} else if (type == StatsType.ALLBUTDEFAULT) {
sharedCache.refreshAggregateStatsInCache(StringUtils.normalizeIdentifier(catName), StringUtils.normalizeIdentifier(dbName), StringUtils.normalizeIdentifier(tblName), null, new AggrStats(colAggrStats, partsFound), partNameToWriteId);
}
}
return new MergedColumnStatsForPartitions(colAggrStats, partsFound);
}
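The map handed to aggrPartitionStats keys each column's per-partition ColStatsObjWithSourceInfo list by the aggregator chosen for that column's stats type, and partsFound ends up as the smallest per-column hit count, so the reported coverage reflects the worst-covered column. The compressed sketch below shows only that shape; AggrPartitionStatsSketch and its parameters are illustrative, and the aggregator package name is assumed to be org.apache.hadoop.hive.metastore.columnstats.aggr.

import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj;
import org.apache.hadoop.hive.metastore.api.MetaException;
import org.apache.hadoop.hive.metastore.columnstats.aggr.ColumnStatsAggregator;
import org.apache.hadoop.hive.metastore.columnstats.aggr.ColumnStatsAggregatorFactory;
import org.apache.hadoop.hive.metastore.utils.MetaStoreServerUtils;
import org.apache.hadoop.hive.metastore.utils.MetaStoreServerUtils.ColStatsObjWithSourceInfo;

// Hypothetical helper showing the shape of the data handed to aggrPartitionStats; the
// per-column lists would be filled from the cache exactly as in the method above.
public class AggrPartitionStatsSketch {
  public static List<ColumnStatisticsObj> aggregate(
      Map<String, List<ColStatsObjWithSourceInfo>> statsPerColumn, // column name -> per-partition stats
      List<String> partNames, boolean useDensityFunctionForNDVEstimation, double ndvTuner)
      throws MetaException {
    Map<ColumnStatsAggregator, List<ColStatsObjWithSourceInfo>> colStatsMap = new HashMap<>();
    long partsFound = partNames.size();
    for (List<ColStatsObjWithSourceInfo> colStats : statsPerColumn.values()) {
      if (colStats.isEmpty()) {
        continue; // a column with no stats in any partition contributes nothing
      }
      // One aggregator per column, picked from the column's thrift stats type.
      ColumnStatsAggregator aggregator = ColumnStatsAggregatorFactory.getColumnStatsAggregator(
          colStats.get(0).getColStatsObj().getStatsData().getSetField(),
          useDensityFunctionForNDVEstimation, ndvTuner);
      colStatsMap.put(aggregator, colStats);
      // partsFound is the worst per-column coverage.
      partsFound = Math.min(partsFound, colStats.size());
    }
    return MetaStoreServerUtils.aggrPartitionStats(
        colStatsMap, partNames, partsFound == partNames.size(),
        useDensityFunctionForNDVEstimation, ndvTuner);
  }
}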
Use of org.apache.hadoop.hive.metastore.utils.MetaStoreServerUtils.ColStatsObjWithSourceInfo in project hive by apache.
From the class MetaStoreDirectSql, method getColStatsForAllTablePartitions.
public List<ColStatsObjWithSourceInfo> getColStatsForAllTablePartitions(String catName, String dbName, boolean enableBitVector) throws MetaException {
String queryText = "select \"TABLE_NAME\", \"PARTITION_NAME\", " + getStatsList(enableBitVector) + " from " + " " + PART_COL_STATS + " where \"DB_NAME\" = ? and \"CAT_NAME\" = ?";
long start = 0;
long end = 0;
boolean doTrace = LOG.isDebugEnabled();
Object qResult = null;
start = doTrace ? System.nanoTime() : 0;
List<ColStatsObjWithSourceInfo> colStatsForDB = new ArrayList<ColStatsObjWithSourceInfo>();
try (QueryWrapper query = new QueryWrapper(pm.newQuery("javax.jdo.query.SQL", queryText))) {
qResult = executeWithArray(query, new Object[] { dbName, catName }, queryText);
if (qResult == null) {
return colStatsForDB;
}
end = doTrace ? System.nanoTime() : 0;
MetastoreDirectSqlUtils.timingTrace(doTrace, queryText, start, end);
List<Object[]> list = MetastoreDirectSqlUtils.ensureList(qResult);
for (Object[] row : list) {
String tblName = (String) row[0];
String partName = (String) row[1];
ColumnStatisticsObj colStatObj = prepareCSObj(row, 2);
colStatsForDB.add(new ColStatsObjWithSourceInfo(colStatObj, catName, dbName, tblName, partName));
Deadline.checkTimeout();
}
}
return colStatsForDB;
}
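The returned list is flat, one entry per table, partition, and column for the whole database; callers typically regroup it before aggregating. A small hedged sketch of such a regrouping follows, assuming ColStatsObjWithSourceInfo exposes a getTblName() accessor alongside the getPartName() and getColStatsObj() used above; the grouping key format is illustrative.

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj;
import org.apache.hadoop.hive.metastore.utils.MetaStoreServerUtils.ColStatsObjWithSourceInfo;

public class GroupColStatsSketch {
  // Groups the flat per-database result by "table/partition"; the key format is arbitrary.
  public static Map<String, List<ColumnStatisticsObj>> groupByTableAndPartition(
      List<ColStatsObjWithSourceInfo> colStatsForDB) {
    Map<String, List<ColumnStatisticsObj>> grouped = new HashMap<>();
    for (ColStatsObjWithSourceInfo stat : colStatsForDB) {
      String key = stat.getTblName() + "/" + stat.getPartName(); // getTblName() assumed to exist
      grouped.computeIfAbsent(key, k -> new ArrayList<>()).add(stat.getColStatsObj());
    }
    return grouped;
  }
}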