use of org.apache.hadoop.hive.metastore.columnstats.aggr.ColumnStatsAggregator in project hive by apache.
the class MetaStoreUtils method aggrPartitionStats.
/**
 * Runs every aggregator in {@code colStatsMap} against its list of per-partition column
 * statistics, in parallel, and collects the non-null results.
 *
 * <p>A fixed thread pool of daemon threads is created for the call, sized to the smaller of
 * the number of map entries and the number of available processors.
 *
 * @param colStatsMap aggregator to the per-partition stats objects it should aggregate
 * @param partNames partition names, passed through to each aggregator
 * @param areAllPartsFound passed through to each aggregator
 * @param useDensityFunctionForNDVEstimation not read by this method
 * @param ndvTuner not read by this method
 * @return the aggregated stats objects, in the iteration order of {@code colStatsMap};
 *         empty when {@code colStatsMap} is empty
 * @throws MetaException if any aggregation task fails or the calling thread is interrupted
 *         while waiting for a result
 */
public static List<ColumnStatisticsObj> aggrPartitionStats(Map<ColumnStatsAggregator, List<ColStatsObjWithSourceInfo>> colStatsMap, final List<String> partNames, final boolean areAllPartsFound, final boolean useDensityFunctionForNDVEstimation, final double ndvTuner) throws MetaException {
  List<ColumnStatisticsObj> aggrColStatObjs = new ArrayList<ColumnStatisticsObj>();
  // Executors.newFixedThreadPool rejects a size of 0, so there is nothing to do (and no pool
  // to create) when no column has stats.
  if (colStatsMap.isEmpty()) {
    return aggrColStatObjs;
  }
  final int numThreads = Math.min(colStatsMap.size(), Runtime.getRuntime().availableProcessors());
  final ExecutorService pool = Executors.newFixedThreadPool(numThreads, new ThreadFactoryBuilder().setDaemon(true).setNameFormat("aggr-col-stats-%d").build());
  final List<Future<ColumnStatisticsObj>> futures = new ArrayList<Future<ColumnStatisticsObj>>(colStatsMap.size());
  LOG.debug("Aggregating column stats. Threads used: {}", numThreads);
  long start = System.currentTimeMillis();
  for (final Entry<ColumnStatsAggregator, List<ColStatsObjWithSourceInfo>> entry : colStatsMap.entrySet()) {
    futures.add(pool.submit(new Callable<ColumnStatisticsObj>() {
      @Override
      public ColumnStatisticsObj call() throws MetaException {
        List<ColStatsObjWithSourceInfo> colStatWithSourceInfo = entry.getValue();
        ColumnStatsAggregator aggregator = entry.getKey();
        try {
          return aggregator.aggregate(colStatWithSourceInfo, partNames, areAllPartsFound);
        } catch (MetaException e) {
          LOG.debug(e.getMessage());
          throw e;
        }
      }
    }));
  }
  // No further tasks will be submitted; already-submitted tasks keep running.
  pool.shutdown();
  for (Future<ColumnStatisticsObj> future : futures) {
    try {
      ColumnStatisticsObj statsObj = future.get();
      if (statsObj != null) {
        aggrColStatObjs.add(statsObj);
      }
    } catch (InterruptedException | ExecutionException e) {
      if (e instanceof InterruptedException) {
        // Restore the interrupt flag so callers further up the stack can observe it.
        Thread.currentThread().interrupt();
      }
      LOG.debug(e.getMessage());
      // Abandon the remaining tasks; the overall aggregation has already failed.
      pool.shutdownNow();
      MetaException metaException = new MetaException(e.toString());
      // Keep the original failure reachable for diagnostics.
      metaException.initCause(e);
      throw metaException;
    }
  }
  LOG.debug("Time for aggr col stats in seconds: {} Threads used: {}", ((System.currentTimeMillis() - (double) start)) / 1000, numThreads);
  return aggrColStatObjs;
}
use of org.apache.hadoop.hive.metastore.columnstats.aggr.ColumnStatsAggregator in project hive by apache.
the class MetaStoreServerUtils method aggrPartitionStats.
/**
 * Runs every aggregator in {@code colStatsMap} against its list of per-partition column
 * statistics, in parallel, and collects the non-null results.
 *
 * <p>A fixed thread pool of daemon threads is created for the call, sized to the smaller of
 * the number of map entries and the number of available processors.
 *
 * @param colStatsMap aggregator to the per-partition stats objects it should aggregate
 * @param partNames partition names, passed through to each aggregator
 * @param areAllPartsFound passed through to each aggregator
 * @param useDensityFunctionForNDVEstimation not read by this method
 * @param ndvTuner not read by this method
 * @return the aggregated stats objects, in the iteration order of {@code colStatsMap};
 *         empty when {@code colStatsMap} is empty
 * @throws MetaException if any aggregation task fails or the calling thread is interrupted
 *         while waiting for a result
 */
public static List<ColumnStatisticsObj> aggrPartitionStats(Map<ColumnStatsAggregator, List<ColStatsObjWithSourceInfo>> colStatsMap, final List<String> partNames, final boolean areAllPartsFound, final boolean useDensityFunctionForNDVEstimation, final double ndvTuner) throws MetaException {
  List<ColumnStatisticsObj> aggrColStatObjs = new ArrayList<ColumnStatisticsObj>();
  // Executors.newFixedThreadPool rejects a size of 0, so there is nothing to do (and no pool
  // to create) when no column has stats.
  if (colStatsMap.isEmpty()) {
    return aggrColStatObjs;
  }
  final int numThreads = Math.min(colStatsMap.size(), Runtime.getRuntime().availableProcessors());
  final ExecutorService pool = Executors.newFixedThreadPool(numThreads, new ThreadFactoryBuilder().setDaemon(true).setNameFormat("aggr-col-stats-%d").build());
  final List<Future<ColumnStatisticsObj>> futures = new ArrayList<Future<ColumnStatisticsObj>>(colStatsMap.size());
  LOG.debug("Aggregating column stats. Threads used: {}", numThreads);
  long start = System.currentTimeMillis();
  for (final Map.Entry<ColumnStatsAggregator, List<ColStatsObjWithSourceInfo>> entry : colStatsMap.entrySet()) {
    futures.add(pool.submit(new Callable<ColumnStatisticsObj>() {
      @Override
      public ColumnStatisticsObj call() throws MetaException {
        List<ColStatsObjWithSourceInfo> colStatWithSourceInfo = entry.getValue();
        ColumnStatsAggregator aggregator = entry.getKey();
        try {
          return aggregator.aggregate(colStatWithSourceInfo, partNames, areAllPartsFound);
        } catch (MetaException e) {
          LOG.debug(e.getMessage());
          throw e;
        }
      }
    }));
  }
  // No further tasks will be submitted; already-submitted tasks keep running.
  pool.shutdown();
  for (Future<ColumnStatisticsObj> future : futures) {
    try {
      ColumnStatisticsObj statsObj = future.get();
      if (statsObj != null) {
        aggrColStatObjs.add(statsObj);
      }
    } catch (InterruptedException | ExecutionException e) {
      if (e instanceof InterruptedException) {
        // Restore the interrupt flag so callers further up the stack can observe it.
        Thread.currentThread().interrupt();
      }
      LOG.debug(e.getMessage());
      // Abandon the remaining tasks; the overall aggregation has already failed.
      pool.shutdownNow();
      MetaException metaException = new MetaException(e.toString());
      // Keep the original failure reachable for diagnostics.
      metaException.initCause(e);
      throw metaException;
    }
  }
  LOG.debug("Time for aggr col stats in seconds: {} Threads used: {}", ((System.currentTimeMillis() - (double) start)) / 1000, numThreads);
  return aggrColStatObjs;
}
use of org.apache.hadoop.hive.metastore.columnstats.aggr.ColumnStatsAggregator in project hive by apache.
the class MetaStoreUtils method aggrPartitionStats.
/**
 * Given a list of per-partition column statistics, groups the stats objects by column name
 * and returns the aggregated stats.
 *
 * <p>One aggregator is created per column name, chosen from the stats-data field that is set
 * on the first stats object seen for that column. The grouped stats are then handed to the
 * map-based {@code aggrPartitionStats} overload.
 *
 * @param partStats column statistics, one entry per partition
 * @param dbName database name recorded on every grouped stats object
 * @param tableName table name recorded on every grouped stats object
 * @param partNames partition names, passed through to the map-based overload
 * @param colNames column names; only used in the debug message emitted when no stats exist
 * @param areAllPartsFound passed through to the map-based overload
 * @param useDensityFunctionForNDVEstimation passed to the aggregator factory
 * @param ndvTuner passed to the aggregator factory
 * @return the aggregated stats, or an empty list when {@code partStats} holds no stats objects
 * @throws MetaException if the map-based overload fails
 */
public static List<ColumnStatisticsObj> aggrPartitionStats(List<ColumnStatistics> partStats, String dbName, String tableName, List<String> partNames, List<String> colNames, boolean areAllPartsFound, boolean useDensityFunctionForNDVEstimation, double ndvTuner) throws MetaException {
  // Column name -> the aggregator responsible for that column.
  Map<String, ColumnStatsAggregator> aggregatorByColumn = new HashMap<String, ColumnStatsAggregator>();
  // Aggregator -> every per-partition stats object collected for its column.
  Map<ColumnStatsAggregator, List<ColStatsObjWithSourceInfo>> statsByAggregator = new HashMap<ColumnStatsAggregator, List<ColStatsObjWithSourceInfo>>();
  for (ColumnStatistics partitionStats : partStats) {
    for (ColumnStatisticsObj columnStats : partitionStats.getStatsObj()) {
      String partName = partitionStats.getStatsDesc().getPartName();
      String columnName = columnStats.getColName();
      ColumnStatsAggregator aggregator = aggregatorByColumn.get(columnName);
      if (aggregator == null) {
        // First time this column is seen: pick an aggregator and open its bucket.
        aggregator = ColumnStatsAggregatorFactory.getColumnStatsAggregator(columnStats.getStatsData().getSetField(), useDensityFunctionForNDVEstimation, ndvTuner);
        aggregatorByColumn.put(columnName, aggregator);
        statsByAggregator.put(aggregator, new ArrayList<ColStatsObjWithSourceInfo>());
      }
      ColStatsObjWithSourceInfo statsWithSource = new ColStatsObjWithSourceInfo(columnStats, dbName, tableName, partName);
      statsByAggregator.get(aggregator).add(statsWithSource);
    }
  }
  if (statsByAggregator.isEmpty()) {
    LOG.debug("No stats data found for: dbName= {}, tblName= {}, partNames= {}, colNames= {}", dbName, tableName, partNames, colNames);
    return new ArrayList<ColumnStatisticsObj>();
  }
  return aggrPartitionStats(statsByAggregator, partNames, areAllPartsFound, useDensityFunctionForNDVEstimation, ndvTuner);
}
use of org.apache.hadoop.hive.metastore.columnstats.aggr.ColumnStatsAggregator in project hive by apache.
the class CachedStore method mergeColStatsForPartitions.
/**
 * Collects the cached column statistics of the given partitions, column by column, and
 * aggregates them through {@code MetaStoreUtils.aggrPartitionStats}.
 *
 * @param dbName database name
 * @param tblName table name
 * @param partNames names of the partitions to merge
 * @param colNames names of the columns to merge
 * @param sharedCache cache the per-partition column stats are read from
 * @return the aggregated stats together with the number of columns for which every partition
 *         had cached stats; an empty result with a count of 0 as soon as no stats have been
 *         collected after processing a column
 * @throws MetaException if the aggregation fails
 */
private MergedColumnStatsForPartitions mergeColStatsForPartitions(String dbName, String tblName, List<String> partNames, List<String> colNames, SharedCache sharedCache) throws MetaException {
  final boolean useDensityFunctionForNDVEstimation = MetastoreConf.getBoolVar(getConf(), ConfVars.STATS_NDV_DENSITY_FUNCTION);
  final double ndvTuner = MetastoreConf.getDoubleVar(getConf(), ConfVars.STATS_NDV_TUNER);
  Map<ColumnStatsAggregator, List<ColStatsObjWithSourceInfo>> colStatsMap = new HashMap<ColumnStatsAggregator, List<ColStatsObjWithSourceInfo>>();
  // NOTE(review): this flag is never set to false in this method, even when some partitions
  // have no cached stats — confirm that is intended before relying on it.
  boolean areAllPartsFound = true;
  long partsFound = 0;
  for (String colName : colNames) {
    long partsFoundForColumn = 0;
    ColumnStatsAggregator colStatsAggregator = null;
    List<ColStatsObjWithSourceInfo> colStatsWithPartInfoList = new ArrayList<ColStatsObjWithSourceInfo>();
    for (String partName : partNames) {
      ColumnStatisticsObj colStatsForPart = sharedCache.getPartitionColStatsFromCache(dbName, tblName, partNameToVals(partName), colName);
      if (colStatsForPart != null) {
        ColStatsObjWithSourceInfo colStatsWithPartInfo = new ColStatsObjWithSourceInfo(colStatsForPart, dbName, tblName, partName);
        colStatsWithPartInfoList.add(colStatsWithPartInfo);
        // The aggregator is chosen from the first stats object found for this column.
        if (colStatsAggregator == null) {
          colStatsAggregator = ColumnStatsAggregatorFactory.getColumnStatsAggregator(colStatsForPart.getStatsData().getSetField(), useDensityFunctionForNDVEstimation, ndvTuner);
        }
        partsFoundForColumn++;
      } else {
        LOG.debug("Stats not found in CachedStore for: dbName={} tblName={} partName={} colName={}", dbName, tblName, partName, colName);
      }
    }
    if (colStatsWithPartInfoList.size() > 0) {
      colStatsMap.put(colStatsAggregator, colStatsWithPartInfoList);
    }
    // Counts columns whose stats were present for every requested partition.
    if (partsFoundForColumn == partNames.size()) {
      partsFound++;
    }
    // Nothing collected so far (checked after each column): give up with an empty result.
    if (colStatsMap.size() < 1) {
      LOG.debug("No stats data found for: dbName={} tblName= {} partNames= {} colNames= {}", dbName, tblName, partNames, colNames);
      return new MergedColumnStatsForPartitions(new ArrayList<ColumnStatisticsObj>(), 0);
    }
  }
  // Note that enableBitVector does not apply here because ColumnStatisticsObj
  // itself will tell whether bitvector is null or not and aggr logic can automatically apply.
  return new MergedColumnStatsForPartitions(MetaStoreUtils.aggrPartitionStats(colStatsMap, partNames, areAllPartsFound, useDensityFunctionForNDVEstimation, ndvTuner), partsFound);
}
use of org.apache.hadoop.hive.metastore.columnstats.aggr.ColumnStatsAggregator in project hive by apache.
the class CachedStore method mergeColStatsForPartitions.
/**
 * Collects the cached column statistics of the given partitions, column by column,
 * aggregates them through {@code MetaStoreServerUtils.aggrPartitionStats}, and, when
 * {@code canUseEvents} is set, refreshes the aggregate stats held in the shared cache.
 *
 * @param catName catalog name
 * @param dbName database name
 * @param tblName table name
 * @param partNames names of the partitions to merge
 * @param colNames names of the columns to merge
 * @param sharedCache cache the per-partition column stats are read from
 * @param type selects which aggregate-stats slot of the shared cache is refreshed
 * @param writeIdList passed to the cache lookup; when non-null, the write id of every
 *        partition with stats is recorded and handed to the cache refresh
 * @return the aggregated stats together with the smallest per-column count of partitions that
 *         had stats; an empty result with a count of 0 as soon as no stats have been collected
 *         after processing a column; {@code null} when the cache lookup returns {@code null}
 *         for any partition
 * @throws MetaException if the aggregation fails
 */
private MergedColumnStatsForPartitions mergeColStatsForPartitions(String catName, String dbName, String tblName, List<String> partNames, List<String> colNames, SharedCache sharedCache, StatsType type, String writeIdList) throws MetaException {
  final boolean useDensityFunctionForNDVEstimation = MetastoreConf.getBoolVar(getConf(), ConfVars.STATS_NDV_DENSITY_FUNCTION);
  final double ndvTuner = MetastoreConf.getDoubleVar(getConf(), ConfVars.STATS_NDV_TUNER);
  Map<ColumnStatsAggregator, List<ColStatsObjWithSourceInfo>> colStatsMap = new HashMap<>();
  // Starts at the maximum and is lowered to the smallest per-column count below.
  long partsFound = partNames.size();
  Map<List<String>, Long> partNameToWriteId = writeIdList != null ? new HashMap<>() : null;
  for (String colName : colNames) {
    long partsFoundForColumn = 0;
    ColumnStatsAggregator colStatsAggregator = null;
    List<ColStatsObjWithSourceInfo> colStatsWithPartInfoList = new ArrayList<>();
    for (String partName : partNames) {
      List<String> partValue = partNameToVals(partName);
      // There are three possible result from getPartitionColStatsFromCache.
      // 1. The partition has valid stats and thus colStatsWriteId returned is valid non-null value
      // 2. Partition stat is missing from cache and thus colStatsWriteId returned is non-null but colstat
      // info in it is null. In this case we just ignore the partition from aggregate calculation to keep
      // the behavior same as object store.
      // 3. Partition is missing or its stat is updated by live(not yet committed) or aborted txn. In this case,
      // colStatsWriteId is null. Thus null is returned to keep the behavior same as object store.
      SharedCache.ColumStatsWithWriteId colStatsWriteId = sharedCache.getPartitionColStatsFromCache(catName, dbName, tblName, partValue, colName, writeIdList);
      if (colStatsWriteId == null) {
        return null;
      }
      if (colStatsWriteId.getColumnStatisticsObj() != null) {
        ColumnStatisticsObj colStatsForPart = colStatsWriteId.getColumnStatisticsObj();
        if (partNameToWriteId != null) {
          partNameToWriteId.put(partValue, colStatsWriteId.getWriteId());
        }
        ColStatsObjWithSourceInfo colStatsWithPartInfo = new ColStatsObjWithSourceInfo(colStatsForPart, catName, dbName, tblName, partName);
        colStatsWithPartInfoList.add(colStatsWithPartInfo);
        // The aggregator is chosen from the first stats object found for this column.
        if (colStatsAggregator == null) {
          colStatsAggregator = ColumnStatsAggregatorFactory.getColumnStatsAggregator(colStatsForPart.getStatsData().getSetField(), useDensityFunctionForNDVEstimation, ndvTuner);
        }
        partsFoundForColumn++;
      } else {
        LOG.debug("Stats not found in CachedStore for: dbName={} tblName={} partName={} colName={}", dbName, tblName, partName, colName);
      }
    }
    if (colStatsWithPartInfoList.size() > 0) {
      colStatsMap.put(colStatsAggregator, colStatsWithPartInfoList);
    }
    // Keep the minimum across columns: partsFound ends up as the smallest number of
    // partitions for which any single column had stats in the cache.
    if (partsFoundForColumn < partsFound) {
      partsFound = partsFoundForColumn;
    }
    if (colStatsMap.size() < 1) {
      LOG.debug("No stats data found for: dbName={} tblName= {} partNames= {} colNames= {}", dbName, tblName, partNames, colNames);
      // NOTE(review): returning an empty object here (rather than null) presumably does not
      // trigger the lookup in the raw store, so we may end up with missing stats — confirm.
      return new MergedColumnStatsForPartitions(new ArrayList<ColumnStatisticsObj>(), 0);
    }
  }
  // Note that enableBitVector does not apply here because ColumnStatisticsObj
  // itself will tell whether bitvector is null or not and aggr logic can automatically apply.
  List<ColumnStatisticsObj> colAggrStats = MetaStoreServerUtils.aggrPartitionStats(colStatsMap, partNames, partsFound == partNames.size(), useDensityFunctionForNDVEstimation, ndvTuner);
  if (canUseEvents) {
    // The two stats types are stored in different argument slots of the same refresh call.
    if (type == StatsType.ALL) {
      sharedCache.refreshAggregateStatsInCache(StringUtils.normalizeIdentifier(catName), StringUtils.normalizeIdentifier(dbName), StringUtils.normalizeIdentifier(tblName), new AggrStats(colAggrStats, partsFound), null, partNameToWriteId);
    } else if (type == StatsType.ALLBUTDEFAULT) {
      sharedCache.refreshAggregateStatsInCache(StringUtils.normalizeIdentifier(catName), StringUtils.normalizeIdentifier(dbName), StringUtils.normalizeIdentifier(tblName), null, new AggrStats(colAggrStats, partsFound), partNameToWriteId);
    }
  }
  return new MergedColumnStatsForPartitions(colAggrStats, partsFound);
}
Aggregations