Usage example of org.apache.hive.common.util.BloomFilter in the Apache Hive project: class MetaStoreDirectSql, method aggrColStatsForPartitions.
/**
 * Computes aggregated column statistics over a set of partitions of a table,
 * consulting the aggregate stats cache (when enabled and the partition set fits
 * in a cache node) before falling back to direct DB reads.
 *
 * @param dbName database the table belongs to
 * @param tableName table whose partition stats are aggregated
 * @param partNames partitions to aggregate over
 * @param colNames columns to aggregate stats for
 * @param useDensityFunctionForNDVEstimation whether NDV is estimated via the density function
 * @return the aggregated stats together with the number of partitions that had stats
 * @throws MetaException on metastore/DB errors
 */
public AggrStats aggrColStatsForPartitions(String dbName, String tableName, List<String> partNames, List<String> colNames, boolean useDensityFunctionForNDVEstimation) throws MetaException {
  if (colNames.isEmpty() || partNames.isEmpty()) {
    LOG.debug("Columns is empty or partNames is empty : Short-circuiting stats eval");
    // Nothing to aggregate
    return new AggrStats(new ArrayList<ColumnStatisticsObj>(), 0);
  }
  long partsFound = 0;
  List<ColumnStatisticsObj> colStatsList;
  // Try to read from the cache first
  if (isAggregateStatsCacheEnabled && (partNames.size() < aggrStatsCache.getMaxPartsPerCacheNode())) {
    AggrColStats colStatsAggrCached;
    List<ColumnStatisticsObj> colStatsAggrFromDB;
    int maxPartsPerCacheNode = aggrStatsCache.getMaxPartsPerCacheNode();
    float fpp = aggrStatsCache.getFalsePositiveProbability();
    colStatsList = new ArrayList<ColumnStatisticsObj>();
    // Bloom filter for the new node that we will eventually add to the cache
    BloomFilter bloomFilter = createPartsBloomFilter(maxPartsPerCacheNode, fpp, partNames);
    // partsFound is computed from the DB lazily, on the first cache miss only
    boolean computePartsFound = true;
    for (String colName : colNames) {
      // Check the cache first
      colStatsAggrCached = aggrStatsCache.get(dbName, tableName, colName, partNames);
      if (colStatsAggrCached != null) {
        colStatsList.add(colStatsAggrCached.getColStats());
        partsFound = colStatsAggrCached.getNumPartsCached();
      } else {
        if (computePartsFound) {
          partsFound = partsFoundForPartitions(dbName, tableName, partNames, colNames);
          computePartsFound = false;
        }
        List<String> colNamesForDB = new ArrayList<String>();
        colNamesForDB.add(colName);
        // Read aggregated stats for one column
        colStatsAggrFromDB = columnStatisticsObjForPartitions(dbName, tableName, partNames, colNamesForDB, partsFound, useDensityFunctionForNDVEstimation);
        if (!colStatsAggrFromDB.isEmpty()) {
          ColumnStatisticsObj colStatsAggr = colStatsAggrFromDB.get(0);
          colStatsList.add(colStatsAggr);
          // Update the cache to add this new aggregate node
          aggrStatsCache.add(dbName, tableName, colName, partsFound, colStatsAggr, bloomFilter);
        }
      }
    }
  } else {
    partsFound = partsFoundForPartitions(dbName, tableName, partNames, colNames);
    colStatsList = columnStatisticsObjForPartitions(dbName, tableName, partNames, colNames, partsFound, useDensityFunctionForNDVEstimation);
  }
  // Demoted from LOG.info to LOG.debug: this dumps the full stats list, which is far too
  // verbose for INFO, and the catalog-aware overload of this method already logs at debug.
  LOG.debug("useDensityFunctionForNDVEstimation = " + useDensityFunctionForNDVEstimation + "\npartsFound = " + partsFound + "\nColumnStatisticsObj = " + Arrays.toString(colStatsList.toArray()));
  return new AggrStats(colStatsList, partsFound);
}
Usage example of org.apache.hive.common.util.BloomFilter in the Apache Hive project: class MetaStoreDirectSql, catalog-aware overload of method aggrColStatsForPartitions.
/**
 * Aggregates column statistics over a set of partitions, serving results from the
 * aggregate stats cache when a matching node exists and reading from the DB otherwise.
 *
 * @param catName catalog the table lives in
 * @param dbName database the table belongs to
 * @param tableName table whose partition stats are aggregated
 * @param partNames partitions to aggregate over
 * @param colNames columns to aggregate stats for
 * @param engine engine whose stats are requested
 * @param useDensityFunctionForNDVEstimation whether NDV is estimated via the density function
 * @param ndvTuner tuning factor for the NDV estimate
 * @param enableBitVector whether bit-vector based NDV merging is enabled
 * @return the aggregated stats together with the number of partitions that had stats
 * @throws MetaException on metastore/DB errors
 */
public AggrStats aggrColStatsForPartitions(String catName, String dbName, String tableName, List<String> partNames, List<String> colNames, String engine, boolean useDensityFunctionForNDVEstimation, double ndvTuner, boolean enableBitVector) throws MetaException {
  if (colNames.isEmpty() || partNames.isEmpty()) {
    LOG.debug("Columns is empty or partNames is empty : Short-circuiting stats eval");
    // Nothing to aggregate
    return new AggrStats(Collections.<ColumnStatisticsObj>emptyList(), 0);
  }
  long partsFound = 0;
  List<ColumnStatisticsObj> aggregated;
  if (isAggregateStatsCacheEnabled && partNames.size() < aggrStatsCache.getMaxPartsPerCacheNode()) {
    aggregated = new ArrayList<ColumnStatisticsObj>();
    // Bloom filter describing this partition set; attached to every node we add to the cache.
    BloomFilter partSetFilter = createPartsBloomFilter(aggrStatsCache.getMaxPartsPerCacheNode(), aggrStatsCache.getFalsePositiveProbability(), partNames);
    // The DB lookup of partsFound happens lazily, on the first cache miss only.
    boolean needPartsFound = true;
    for (String column : colNames) {
      // Serve from the cache when a matching aggregate node exists.
      AggrColStats cachedNode = aggrStatsCache.get(catName, dbName, tableName, column, partNames);
      if (cachedNode != null) {
        aggregated.add(cachedNode.getColStats());
        partsFound = cachedNode.getNumPartsCached();
        continue;
      }
      if (needPartsFound) {
        partsFound = partsFoundForPartitions(catName, dbName, tableName, partNames, colNames, engine);
        needPartsFound = false;
      }
      // Cache miss: aggregate this single column straight from the DB.
      List<String> singleColumn = new ArrayList<>();
      singleColumn.add(column);
      List<ColumnStatisticsObj> fromDb = columnStatisticsObjForPartitions(catName, dbName, tableName, partNames, singleColumn, engine, partsFound, useDensityFunctionForNDVEstimation, ndvTuner, enableBitVector);
      if (!fromDb.isEmpty()) {
        ColumnStatisticsObj freshAggr = fromDb.get(0);
        aggregated.add(freshAggr);
        // Insert the freshly computed aggregate into the cache for future requests.
        aggrStatsCache.add(catName, dbName, tableName, column, partsFound, freshAggr, partSetFilter);
      }
    }
  } else {
    // Cache disabled, or partition set too large for a cache node: go straight to the DB.
    partsFound = partsFoundForPartitions(catName, dbName, tableName, partNames, colNames, engine);
    aggregated = columnStatisticsObjForPartitions(catName, dbName, tableName, partNames, colNames, engine, partsFound, useDensityFunctionForNDVEstimation, ndvTuner, enableBitVector);
  }
  LOG.debug("useDensityFunctionForNDVEstimation = " + useDensityFunctionForNDVEstimation + "\npartsFound = " + partsFound + "\nColumnStatisticsObj = " + Arrays.toString(aggregated.toArray()));
  return new AggrStats(aggregated, partsFound);
}
Usage example of org.apache.hive.common.util.BloomFilter in the Apache Hive project: class TestAggregateStatsCache, method testAddGetWithVariance.
/**
 * Verifies the cache's partition-set variance check: an aggregate node cached for
 * partitions [1..9] must be rejected for requests whose partition set differs too much
 * (variance above MAX_VARIANCE = 0.5) and served for requests that overlap closely enough.
 */
@Test
public void testAddGetWithVariance() throws Exception {
// Partnames: [tab1part1...tab1part9]
List<String> partNames = preparePartNames(tables.get(0), 1, 9);
// Prepare the bloom filter
BloomFilter bloomFilter = prepareBloomFilter(partNames);
// Add a dummy aggregate stats object for the above parts (part1...part9) of tab1 for col1
String tblName = tables.get(0);
String colName = tabCols.get(0);
int highVal = 100, lowVal = 10, numDVs = 50, numNulls = 5;
// We'll treat this as the aggregate col stats for part1...part9 of tab1, col1
ColumnStatisticsObj aggrColStats = getDummyLongColStat(colName, highVal, lowVal, numDVs, numNulls);
// Now add to cache
cache.add(DEFAULT_CATALOG_NAME, DB_NAME, tblName, colName, 10, aggrColStats, bloomFilter);
// Now prepare partnames with only 5 partitions: [tab1part1...tab1part5]
partNames = preparePartNames(tables.get(0), 1, 5);
// This get should fail because its variance ((10-5)/5 = 1.0) is way past MAX_VARIANCE (0.5)
AggrColStats aggrStatsCached = cache.get(DEFAULT_CATALOG_NAME, DB_NAME, tblName, colName, partNames);
Assert.assertNull(aggrStatsCached);
// Now prepare partnames with 10 partitions: [tab1part11...tab1part20], but with no overlap
partNames = preparePartNames(tables.get(0), 11, 20);
// This get should fail because its variance ((10-0)/10 = 1.0) is way past MAX_VARIANCE (0.5)
aggrStatsCached = cache.get(DEFAULT_CATALOG_NAME, DB_NAME, tblName, colName, partNames);
Assert.assertNull(aggrStatsCached);
// Now prepare partnames with 8 partitions: [tab1part1...tab1part8], which are contained in the
// object that we added to the cache
partNames = preparePartNames(tables.get(0), 1, 8);
// This get should succeed because its variance ((10-8)/8 = 0.25) is within MAX_VARIANCE (0.5)
aggrStatsCached = cache.get(DEFAULT_CATALOG_NAME, DB_NAME, tblName, colName, partNames);
Assert.assertNotNull(aggrStatsCached);
ColumnStatisticsObj aggrColStatsCached = aggrStatsCached.getColStats();
Assert.assertEquals(aggrColStats, aggrColStatsCached);
}
Usage example of org.apache.hive.common.util.BloomFilter in the Apache Hive project: class TestAggregateStatsCache, method testTimeToLive.
/**
 * Verifies that a cached aggregate node expires once the cache TTL (2 seconds) has elapsed.
 */
@Test
public void testTimeToLive() throws Exception {
  // Build a node covering partitions [tab1part1...tab1part9] of the first table
  List<String> parts = preparePartNames(tables.get(0), 1, 9);
  BloomFilter partsFilter = prepareBloomFilter(parts);
  String table = tables.get(0);
  String column = tabCols.get(0);
  // Dummy aggregate long-column stats (high=100, low=10, NDVs=50, nulls=5)
  // standing in for the aggregate of part1...part9 of tab1, col1
  ColumnStatisticsObj stats = getDummyLongColStat(column, 100, 10, 50, 5);
  // Seed the cache with the node
  cache.add(DEFAULT_CATALOG_NAME, DB_NAME, table, column, 10, stats, partsFilter);
  // Sleep past the 2-second TTL...
  Thread.sleep(3000);
  // ...after which the lookup must miss
  AggrColStats expired = cache.get(DEFAULT_CATALOG_NAME, DB_NAME, table, column, parts);
  Assert.assertNull(expired);
}
Usage example of org.apache.hive.common.util.BloomFilter in the Apache Hive project: class HBaseReadWrite, method putAggregatedStats.
/**
 * Put aggregated stats. Only intended for use by
 * {@link org.apache.hadoop.hive.metastore.hbase.StatsCache}. Others should not call directly
 * but should call StatsCache.put instead.
 * @param key The md5 hash associated with this partition set
 * @param dbName Database these partitions are in
 * @param tableName Table these partitions are in
 * @param partNames Partition names
 * @param colName Column stats are for
 * @param stats Stats
 * @throws IOException if the underlying store write fails
 */
void putAggregatedStats(byte[] key, String dbName, String tableName, List<String> partNames, String colName, AggrStats stats) throws IOException {
  // NOTE(review): the original built a `protoNames` list (dbName, tableName, colName,
  // partNames...) that was never read anywhere in this method — dead code, removed.
  // Build a bloom Filter for these partitions; presumably used by readers to test
  // partition-set membership without deserializing the stats — TODO confirm against callers.
  BloomFilter bloom = new BloomFilter(partNames.size(), STATS_BF_ERROR_RATE);
  for (String partName : partNames) {
    bloom.add(partName.getBytes(HBaseUtils.ENCODING));
  }
  byte[] serializedFilter = HBaseUtils.serializeBloomFilter(dbName, tableName, bloom);
  byte[] serializedStats = HBaseUtils.serializeAggrStats(stats);
  // Write filter and stats into their respective columns under the partition-set key.
  store(AGGR_STATS_TABLE, key, CATALOG_CF, new byte[][] { AGGR_STATS_BLOOM_COL, AGGR_STATS_STATS_COL }, new byte[][] { serializedFilter, serializedStats });
}
Aggregations