
Example 1 with ColumnStatsAggregator

Use of org.apache.hadoop.hive.metastore.columnstats.aggr.ColumnStatsAggregator in project hive by apache.

From the class MetaStoreUtils, method aggrPartitionStats.

public static List<ColumnStatisticsObj> aggrPartitionStats(Map<ColumnStatsAggregator, List<ColStatsObjWithSourceInfo>> colStatsMap, final List<String> partNames, final boolean areAllPartsFound, final boolean useDensityFunctionForNDVEstimation, final double ndvTuner) throws MetaException {
    List<ColumnStatisticsObj> aggrColStatObjs = new ArrayList<ColumnStatisticsObj>();
    int numProcessors = Runtime.getRuntime().availableProcessors();
    // One aggregation task per column; cap the thread pool at the number of available cores.
    final ExecutorService pool = Executors.newFixedThreadPool(Math.min(colStatsMap.size(), numProcessors), new ThreadFactoryBuilder().setDaemon(true).setNameFormat("aggr-col-stats-%d").build());
    final List<Future<ColumnStatisticsObj>> futures = Lists.newLinkedList();
    LOG.debug("Aggregating column stats. Threads used: {}", Math.min(colStatsMap.size(), numProcessors));
    long start = System.currentTimeMillis();
    // Submit one aggregation task per aggregator and its list of per-partition stats objects.
    for (final Entry<ColumnStatsAggregator, List<ColStatsObjWithSourceInfo>> entry : colStatsMap.entrySet()) {
        futures.add(pool.submit(new Callable<ColumnStatisticsObj>() {

            @Override
            public ColumnStatisticsObj call() throws MetaException {
                List<ColStatsObjWithSourceInfo> colStatWithSourceInfo = entry.getValue();
                ColumnStatsAggregator aggregator = entry.getKey();
                try {
                    ColumnStatisticsObj statsObj = aggregator.aggregate(colStatWithSourceInfo, partNames, areAllPartsFound);
                    return statsObj;
                } catch (MetaException e) {
                    LOG.debug(e.getMessage());
                    throw e;
                }
            }
        }));
    }
    pool.shutdown();
    if (!futures.isEmpty()) {
        for (Future<ColumnStatisticsObj> future : futures) {
            try {
                if (future.get() != null) {
                    aggrColStatObjs.add(future.get());
                }
            } catch (InterruptedException | ExecutionException e) {
                LOG.debug(e.getMessage());
                pool.shutdownNow();
                throw new MetaException(e.toString());
            }
        }
    }
    LOG.debug("Time for aggr col stats in seconds: {} Threads used: {}", ((System.currentTimeMillis() - (double) start)) / 1000, Math.min(colStatsMap.size(), numProcessors));
    return aggrColStatObjs;
}
Also used: ArrayList (java.util.ArrayList), Callable (java.util.concurrent.Callable), ColumnStatisticsObj (org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj), ColumnStatsAggregator (org.apache.hadoop.hive.metastore.columnstats.aggr.ColumnStatsAggregator), ExecutorService (java.util.concurrent.ExecutorService), ThreadFactoryBuilder (com.google.common.util.concurrent.ThreadFactoryBuilder), Future (java.util.concurrent.Future), MachineList (org.apache.hadoop.util.MachineList), List (java.util.List), ExecutionException (java.util.concurrent.ExecutionException), MetaException (org.apache.hadoop.hive.metastore.api.MetaException)
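A minimal usage sketch (not taken from the Hive source) of how a caller might drive the Map-based overload above for a single column whose per-partition stats have already been fetched. The database "default", table "sales", and the inputs statsObjs and partNames are illustrative placeholders; imports follow the list above plus HashMap and ColumnStatsAggregatorFactory.

// Hypothetical caller: aggregate per-partition stats for one column.
// statsObjs (List<ColumnStatisticsObj>) and partNames (List<String>) are assumed
// to be parallel lists with one entry per partition.
ColumnStatsAggregator aggregator = ColumnStatsAggregatorFactory.getColumnStatsAggregator(
        statsObjs.get(0).getStatsData().getSetField(), // aggregator matching the stats type of this column
        false,                                         // useDensityFunctionForNDVEstimation
        0.0);                                          // ndvTuner
List<ColStatsObjWithSourceInfo> withSource = new ArrayList<>();
for (int i = 0; i < statsObjs.size(); i++) {
    withSource.add(new ColStatsObjWithSourceInfo(statsObjs.get(i), "default", "sales", partNames.get(i)));
}
Map<ColumnStatsAggregator, List<ColStatsObjWithSourceInfo>> colStatsMap = new HashMap<>();
colStatsMap.put(aggregator, withSource);
List<ColumnStatisticsObj> aggregated =
        MetaStoreUtils.aggrPartitionStats(colStatsMap, partNames, true, false, 0.0);

With more than one column, the caller would build one aggregator and one list per column, as Example 3 below does.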

Example 2 with ColumnStatsAggregator

Use of org.apache.hadoop.hive.metastore.columnstats.aggr.ColumnStatsAggregator in project hive by apache.

From the class MetaStoreServerUtils, method aggrPartitionStats.

public static List<ColumnStatisticsObj> aggrPartitionStats(Map<ColumnStatsAggregator, List<ColStatsObjWithSourceInfo>> colStatsMap, final List<String> partNames, final boolean areAllPartsFound, final boolean useDensityFunctionForNDVEstimation, final double ndvTuner) throws MetaException {
    List<ColumnStatisticsObj> aggrColStatObjs = new ArrayList<ColumnStatisticsObj>();
    int numProcessors = Runtime.getRuntime().availableProcessors();
    final ExecutorService pool = Executors.newFixedThreadPool(Math.min(colStatsMap.size(), numProcessors), new ThreadFactoryBuilder().setDaemon(true).setNameFormat("aggr-col-stats-%d").build());
    final List<Future<ColumnStatisticsObj>> futures = Lists.newLinkedList();
    LOG.debug("Aggregating column stats. Threads used: {}", Math.min(colStatsMap.size(), numProcessors));
    long start = System.currentTimeMillis();
    for (final Map.Entry<ColumnStatsAggregator, List<ColStatsObjWithSourceInfo>> entry : colStatsMap.entrySet()) {
        futures.add(pool.submit(new Callable<ColumnStatisticsObj>() {

            @Override
            public ColumnStatisticsObj call() throws MetaException {
                List<ColStatsObjWithSourceInfo> colStatWithSourceInfo = entry.getValue();
                ColumnStatsAggregator aggregator = entry.getKey();
                try {
                    ColumnStatisticsObj statsObj = aggregator.aggregate(colStatWithSourceInfo, partNames, areAllPartsFound);
                    return statsObj;
                } catch (MetaException e) {
                    LOG.debug(e.getMessage());
                    throw e;
                }
            }
        }));
    }
    pool.shutdown();
    if (!futures.isEmpty()) {
        for (Future<ColumnStatisticsObj> future : futures) {
            try {
                if (future.get() != null) {
                    aggrColStatObjs.add(future.get());
                }
            } catch (InterruptedException | ExecutionException e) {
                LOG.debug(e.getMessage());
                pool.shutdownNow();
                throw new MetaException(e.toString());
            }
        }
    }
    LOG.debug("Time for aggr col stats in seconds: {} Threads used: {}", ((System.currentTimeMillis() - (double) start)) / 1000, Math.min(colStatsMap.size(), numProcessors));
    return aggrColStatObjs;
}
Also used: ArrayList (java.util.ArrayList), Callable (java.util.concurrent.Callable), ColumnStatisticsObj (org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj), ColumnStatsAggregator (org.apache.hadoop.hive.metastore.columnstats.aggr.ColumnStatsAggregator), ExecutorService (java.util.concurrent.ExecutorService), ThreadFactoryBuilder (com.google.common.util.concurrent.ThreadFactoryBuilder), Future (java.util.concurrent.Future), MachineList (org.apache.hadoop.util.MachineList), List (java.util.List), ExecutionException (java.util.concurrent.ExecutionException), Map (java.util.Map), SortedMap (java.util.SortedMap), HashMap (java.util.HashMap), TreeMap (java.util.TreeMap), MetaException (org.apache.hadoop.hive.metastore.api.MetaException)

Example 3 with ColumnStatsAggregator

Use of org.apache.hadoop.hive.metastore.columnstats.aggr.ColumnStatsAggregator in project hive by apache.

From the class MetaStoreUtils, method aggrPartitionStats.

// Given a list of per-partition ColumnStatistics (partStats), this method returns the aggregated column statistics.
public static List<ColumnStatisticsObj> aggrPartitionStats(List<ColumnStatistics> partStats, String dbName, String tableName, List<String> partNames, List<String> colNames, boolean areAllPartsFound, boolean useDensityFunctionForNDVEstimation, double ndvTuner) throws MetaException {
    Map<ColumnStatsAggregator, List<ColStatsObjWithSourceInfo>> colStatsMap = new HashMap<ColumnStatsAggregator, List<ColStatsObjWithSourceInfo>>();
    // Group stats by colName for each partition
    Map<String, ColumnStatsAggregator> aliasToAggregator = new HashMap<String, ColumnStatsAggregator>();
    for (ColumnStatistics css : partStats) {
        List<ColumnStatisticsObj> objs = css.getStatsObj();
        for (ColumnStatisticsObj obj : objs) {
            String partName = css.getStatsDesc().getPartName();
            if (aliasToAggregator.get(obj.getColName()) == null) {
                aliasToAggregator.put(obj.getColName(), ColumnStatsAggregatorFactory.getColumnStatsAggregator(obj.getStatsData().getSetField(), useDensityFunctionForNDVEstimation, ndvTuner));
                colStatsMap.put(aliasToAggregator.get(obj.getColName()), new ArrayList<ColStatsObjWithSourceInfo>());
            }
            colStatsMap.get(aliasToAggregator.get(obj.getColName())).add(new ColStatsObjWithSourceInfo(obj, dbName, tableName, partName));
        }
    }
    if (colStatsMap.size() < 1) {
        LOG.debug("No stats data found for: dbName= {},  tblName= {}, partNames= {}, colNames= {}", dbName, tableName, partNames, colNames);
        return new ArrayList<ColumnStatisticsObj>();
    }
    return aggrPartitionStats(colStatsMap, partNames, areAllPartsFound, useDensityFunctionForNDVEstimation, ndvTuner);
}
Also used: ColumnStatistics (org.apache.hadoop.hive.metastore.api.ColumnStatistics), ColumnStatisticsObj (org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj), ColumnStatsAggregator (org.apache.hadoop.hive.metastore.columnstats.aggr.ColumnStatsAggregator), HashMap (java.util.HashMap), ArrayList (java.util.ArrayList), MachineList (org.apache.hadoop.util.MachineList), List (java.util.List)
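A minimal sketch (illustrative values, not from the Hive source) of building the partStats input for the overload above from Thrift stats objects. The database "default", table "sales", column "id", partition names, and the numeric values are placeholders; ColumnStatisticsDesc, ColumnStatisticsData and LongColumnStatsData come from org.apache.hadoop.hive.metastore.api, and standard java.util imports are assumed.

// Hypothetical caller: one ColumnStatistics per partition, each holding that partition's per-column stats objects.
List<String> partNames = Arrays.asList("ds=2020-01-01", "ds=2020-01-02");
List<ColumnStatistics> partStats = new ArrayList<>();
for (String partName : partNames) {
    ColumnStatisticsDesc desc = new ColumnStatisticsDesc(false, "default", "sales"); // partition-level stats
    desc.setPartName(partName);
    LongColumnStatsData longStats = new LongColumnStatsData(0L, 100L); // numNulls, numDVs (illustrative)
    ColumnStatisticsObj obj = new ColumnStatisticsObj("id", "bigint",
            ColumnStatisticsData.longStats(longStats));
    partStats.add(new ColumnStatistics(desc, Collections.singletonList(obj)));
}
List<ColumnStatisticsObj> aggregated = MetaStoreUtils.aggrPartitionStats(
        partStats, "default", "sales", partNames, Collections.singletonList("id"),
        true,   // areAllPartsFound
        false,  // useDensityFunctionForNDVEstimation
        0.0);   // ndvTuner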

Example 4 with ColumnStatsAggregator

Use of org.apache.hadoop.hive.metastore.columnstats.aggr.ColumnStatsAggregator in project hive by apache.

From the class CachedStore, method mergeColStatsForPartitions.

private MergedColumnStatsForPartitions mergeColStatsForPartitions(String dbName, String tblName, List<String> partNames, List<String> colNames, SharedCache sharedCache) throws MetaException {
    final boolean useDensityFunctionForNDVEstimation = MetastoreConf.getBoolVar(getConf(), ConfVars.STATS_NDV_DENSITY_FUNCTION);
    final double ndvTuner = MetastoreConf.getDoubleVar(getConf(), ConfVars.STATS_NDV_TUNER);
    Map<ColumnStatsAggregator, List<ColStatsObjWithSourceInfo>> colStatsMap = new HashMap<ColumnStatsAggregator, List<ColStatsObjWithSourceInfo>>();
    boolean areAllPartsFound = true;
    long partsFound = 0;
    for (String colName : colNames) {
        long partsFoundForColumn = 0;
        ColumnStatsAggregator colStatsAggregator = null;
        List<ColStatsObjWithSourceInfo> colStatsWithPartInfoList = new ArrayList<ColStatsObjWithSourceInfo>();
        for (String partName : partNames) {
            ColumnStatisticsObj colStatsForPart = sharedCache.getPartitionColStatsFromCache(dbName, tblName, partNameToVals(partName), colName);
            if (colStatsForPart != null) {
                ColStatsObjWithSourceInfo colStatsWithPartInfo = new ColStatsObjWithSourceInfo(colStatsForPart, dbName, tblName, partName);
                colStatsWithPartInfoList.add(colStatsWithPartInfo);
                if (colStatsAggregator == null) {
                    colStatsAggregator = ColumnStatsAggregatorFactory.getColumnStatsAggregator(colStatsForPart.getStatsData().getSetField(), useDensityFunctionForNDVEstimation, ndvTuner);
                }
                partsFoundForColumn++;
            } else {
                LOG.debug("Stats not found in CachedStore for: dbName={} tblName={} partName={} colName={}", dbName, tblName, partName, colName);
            }
        }
        if (colStatsWithPartInfoList.size() > 0) {
            colStatsMap.put(colStatsAggregator, colStatsWithPartInfoList);
        }
        if (partsFoundForColumn == partNames.size()) {
            partsFound++;
        }
        if (colStatsMap.size() < 1) {
            LOG.debug("No stats data found for: dbName={} tblName= {} partNames= {} colNames= ", dbName, tblName, partNames, colNames);
            return new MergedColumnStatsForPartitions(new ArrayList<ColumnStatisticsObj>(), 0);
        }
    }
    // Note that enableBitVector does not apply here because ColumnStatisticsObj
    // itself will tell whether bitvector is null or not and aggr logic can automatically apply.
    return new MergedColumnStatsForPartitions(MetaStoreUtils.aggrPartitionStats(colStatsMap, partNames, areAllPartsFound, useDensityFunctionForNDVEstimation, ndvTuner), partsFound);
}
Also used: ColStatsObjWithSourceInfo (org.apache.hadoop.hive.metastore.utils.MetaStoreUtils.ColStatsObjWithSourceInfo), HashMap (java.util.HashMap), ArrayList (java.util.ArrayList), ColumnStatisticsObj (org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj), ColumnStatsAggregator (org.apache.hadoop.hive.metastore.columnstats.aggr.ColumnStatsAggregator), List (java.util.List), LinkedList (java.util.LinkedList)
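For reference, a small sketch (assumptions noted in the comments, not copied from the factory source) of the ColumnStatsAggregatorFactory call that the method above uses to pick an aggregator per column type. The _Fields constants come from the Thrift union ColumnStatisticsData; the concrete aggregator implementations hinted at in the comments are expected, not verified, return types.

// Hypothetical illustration of per-type aggregator selection, mirroring the call in the method above.
ColumnStatsAggregator longAggr = ColumnStatsAggregatorFactory.getColumnStatsAggregator(
        ColumnStatisticsData._Fields.LONG_STATS, false, 0.0);    // long/bigint columns
ColumnStatsAggregator stringAggr = ColumnStatsAggregatorFactory.getColumnStatsAggregator(
        ColumnStatisticsData._Fields.STRING_STATS, false, 0.0);  // string columns
ColumnStatsAggregator doubleAggr = ColumnStatsAggregatorFactory.getColumnStatsAggregator(
        ColumnStatisticsData._Fields.DOUBLE_STATS, false, 0.0);  // double/float columns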

Example 5 with ColumnStatsAggregator

Use of org.apache.hadoop.hive.metastore.columnstats.aggr.ColumnStatsAggregator in project hive by apache.

From the class CachedStore, method mergeColStatsForPartitions.

private MergedColumnStatsForPartitions mergeColStatsForPartitions(String catName, String dbName, String tblName, List<String> partNames, List<String> colNames, SharedCache sharedCache, StatsType type, String writeIdList) throws MetaException {
    final boolean useDensityFunctionForNDVEstimation = MetastoreConf.getBoolVar(getConf(), ConfVars.STATS_NDV_DENSITY_FUNCTION);
    final double ndvTuner = MetastoreConf.getDoubleVar(getConf(), ConfVars.STATS_NDV_TUNER);
    Map<ColumnStatsAggregator, List<ColStatsObjWithSourceInfo>> colStatsMap = new HashMap<>();
    long partsFound = partNames.size();
    Map<List<String>, Long> partNameToWriteId = writeIdList != null ? new HashMap<>() : null;
    for (String colName : colNames) {
        long partsFoundForColumn = 0;
        ColumnStatsAggregator colStatsAggregator = null;
        List<ColStatsObjWithSourceInfo> colStatsWithPartInfoList = new ArrayList<>();
        for (String partName : partNames) {
            List<String> partValue = partNameToVals(partName);
            // There are three possible results from getPartitionColStatsFromCache.
            // 1. The partition has valid stats and thus colStatsWriteId returned is valid non-null value
            // 2. Partition stat is missing from cache and thus colStatsWriteId returned is non-null but colstat
            // info in it is null. In this case we just ignore the partition from aggregate calculation to keep
            // the behavior same as object store.
            // 3. Partition is missing or its stat is updated by live(not yet committed) or aborted txn. In this case,
            // colStatsWriteId is null. Thus null is returned to keep the behavior same as object store.
            SharedCache.ColumStatsWithWriteId colStatsWriteId = sharedCache.getPartitionColStatsFromCache(catName, dbName, tblName, partValue, colName, writeIdList);
            if (colStatsWriteId == null) {
                return null;
            }
            if (colStatsWriteId.getColumnStatisticsObj() != null) {
                ColumnStatisticsObj colStatsForPart = colStatsWriteId.getColumnStatisticsObj();
                if (partNameToWriteId != null) {
                    partNameToWriteId.put(partValue, colStatsWriteId.getWriteId());
                }
                ColStatsObjWithSourceInfo colStatsWithPartInfo = new ColStatsObjWithSourceInfo(colStatsForPart, catName, dbName, tblName, partName);
                colStatsWithPartInfoList.add(colStatsWithPartInfo);
                if (colStatsAggregator == null) {
                    colStatsAggregator = ColumnStatsAggregatorFactory.getColumnStatsAggregator(colStatsForPart.getStatsData().getSetField(), useDensityFunctionForNDVEstimation, ndvTuner);
                }
                partsFoundForColumn++;
            } else {
                LOG.debug("Stats not found in CachedStore for: dbName={} tblName={} partName={} colName={}", dbName, tblName, partName, colName);
            }
        }
        if (colStatsWithPartInfoList.size() > 0) {
            colStatsMap.put(colStatsAggregator, colStatsWithPartInfoList);
        }
        // Keep partsFound at the minimum across columns, i.e. the number of partitions for
        // which stats for all columns are present in the cache.
        if (partsFoundForColumn < partsFound) {
            partsFound = partsFoundForColumn;
        }
        if (colStatsMap.size() < 1) {
            LOG.debug("No stats data found for: dbName={} tblName= {} partNames= {} colNames= ", dbName, tblName, partNames, colNames);
            // Returning an empty result here does not
            // trigger the lookup in the raw store and we will end up with missing stats.
            return new MergedColumnStatsForPartitions(new ArrayList<ColumnStatisticsObj>(), 0);
        }
    }
    // Note that enableBitVector does not apply here because ColumnStatisticsObj
    // itself will tell whether bitvector is null or not and aggr logic can automatically apply.
    List<ColumnStatisticsObj> colAggrStats = MetaStoreServerUtils.aggrPartitionStats(colStatsMap, partNames, partsFound == partNames.size(), useDensityFunctionForNDVEstimation, ndvTuner);
    if (canUseEvents) {
        if (type == StatsType.ALL) {
            sharedCache.refreshAggregateStatsInCache(StringUtils.normalizeIdentifier(catName), StringUtils.normalizeIdentifier(dbName), StringUtils.normalizeIdentifier(tblName), new AggrStats(colAggrStats, partsFound), null, partNameToWriteId);
        } else if (type == StatsType.ALLBUTDEFAULT) {
            sharedCache.refreshAggregateStatsInCache(StringUtils.normalizeIdentifier(catName), StringUtils.normalizeIdentifier(dbName), StringUtils.normalizeIdentifier(tblName), null, new AggrStats(colAggrStats, partsFound), partNameToWriteId);
        }
    }
    return new MergedColumnStatsForPartitions(colAggrStats, partsFound);
}
Also used: ColStatsObjWithSourceInfo (org.apache.hadoop.hive.metastore.utils.MetaStoreServerUtils.ColStatsObjWithSourceInfo), HashMap (java.util.HashMap), ArrayList (java.util.ArrayList), ColumnStatsAggregator (org.apache.hadoop.hive.metastore.columnstats.aggr.ColumnStatsAggregator), List (java.util.List), LinkedList (java.util.LinkedList)

Aggregations

ArrayList (java.util.ArrayList): 6 uses
List (java.util.List): 6 uses
ColumnStatsAggregator (org.apache.hadoop.hive.metastore.columnstats.aggr.ColumnStatsAggregator): 6 uses
HashMap (java.util.HashMap): 5 uses
ColumnStatisticsObj (org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj): 5 uses
MachineList (org.apache.hadoop.util.MachineList): 4 uses
ThreadFactoryBuilder (com.google.common.util.concurrent.ThreadFactoryBuilder): 2 uses
LinkedList (java.util.LinkedList): 2 uses
Callable (java.util.concurrent.Callable): 2 uses
ExecutionException (java.util.concurrent.ExecutionException): 2 uses
ExecutorService (java.util.concurrent.ExecutorService): 2 uses
Future (java.util.concurrent.Future): 2 uses
ColumnStatistics (org.apache.hadoop.hive.metastore.api.ColumnStatistics): 2 uses
MetaException (org.apache.hadoop.hive.metastore.api.MetaException): 2 uses
Map (java.util.Map): 1 use
SortedMap (java.util.SortedMap): 1 use
TreeMap (java.util.TreeMap): 1 use
ColStatsObjWithSourceInfo (org.apache.hadoop.hive.metastore.utils.MetaStoreServerUtils.ColStatsObjWithSourceInfo): 1 use
ColStatsObjWithSourceInfo (org.apache.hadoop.hive.metastore.utils.MetaStoreUtils.ColStatsObjWithSourceInfo): 1 use