Search in sources:

Example 1 with BloomFilter

use of org.apache.hive.common.util.BloomFilter in project hive by apache.

In the class MetaStoreDirectSql, method aggrColStatsForPartitions:

/**
 * Computes aggregated column statistics for the given partitions of a table.
 * When the aggregate stats cache is enabled and the partition set is small enough
 * to fit in one cache node, results are served per-column from the cache, and any
 * cache misses are read from the DB and inserted back into the cache.
 *
 * @param dbName database name
 * @param tableName table name
 * @param partNames partitions whose stats are to be aggregated; must be non-empty for work to happen
 * @param colNames columns whose stats are to be aggregated; must be non-empty for work to happen
 * @param useDensityFunctionForNDVEstimation whether NDV is estimated via the density function
 * @return aggregated stats together with the number of partitions for which stats were found
 * @throws MetaException on metastore access failure
 */
public AggrStats aggrColStatsForPartitions(String dbName, String tableName, List<String> partNames, List<String> colNames, boolean useDensityFunctionForNDVEstimation) throws MetaException {
    if (colNames.isEmpty() || partNames.isEmpty()) {
        LOG.debug("Columns is empty or partNames is empty : Short-circuiting stats eval");
        // Nothing to aggregate
        return new AggrStats(new ArrayList<ColumnStatisticsObj>(), 0);
    }
    long partsFound = 0;
    List<ColumnStatisticsObj> colStatsList;
    // Try to read from the cache first
    if (isAggregateStatsCacheEnabled && (partNames.size() < aggrStatsCache.getMaxPartsPerCacheNode())) {
        AggrColStats colStatsAggrCached;
        List<ColumnStatisticsObj> colStatsAggrFromDB;
        int maxPartsPerCacheNode = aggrStatsCache.getMaxPartsPerCacheNode();
        float fpp = aggrStatsCache.getFalsePositiveProbability();
        colStatsList = new ArrayList<ColumnStatisticsObj>();
        // Bloom filter for the new node that we will eventually add to the cache
        BloomFilter bloomFilter = createPartsBloomFilter(maxPartsPerCacheNode, fpp, partNames);
        boolean computePartsFound = true;
        for (String colName : colNames) {
            // Check the cache first
            colStatsAggrCached = aggrStatsCache.get(dbName, tableName, colName, partNames);
            if (colStatsAggrCached != null) {
                colStatsList.add(colStatsAggrCached.getColStats());
                partsFound = colStatsAggrCached.getNumPartsCached();
            } else {
                // Compute partsFound lazily, and at most once per call
                if (computePartsFound) {
                    partsFound = partsFoundForPartitions(dbName, tableName, partNames, colNames);
                    computePartsFound = false;
                }
                List<String> colNamesForDB = new ArrayList<String>();
                colNamesForDB.add(colName);
                // Read aggregated stats for one column
                colStatsAggrFromDB = columnStatisticsObjForPartitions(dbName, tableName, partNames, colNamesForDB, partsFound, useDensityFunctionForNDVEstimation);
                if (!colStatsAggrFromDB.isEmpty()) {
                    ColumnStatisticsObj colStatsAggr = colStatsAggrFromDB.get(0);
                    colStatsList.add(colStatsAggr);
                    // Update the cache to add this new aggregate node
                    aggrStatsCache.add(dbName, tableName, colName, partsFound, colStatsAggr, bloomFilter);
                }
            }
        }
    } else {
        partsFound = partsFoundForPartitions(dbName, tableName, partNames, colNames);
        colStatsList = columnStatisticsObjForPartitions(dbName, tableName, partNames, colNames, partsFound, useDensityFunctionForNDVEstimation);
    }
    // Log at DEBUG, not INFO: this dumps every aggregated ColumnStatisticsObj and would
    // flood the log at INFO on a busy metastore. Also matches the level used by the
    // newer catalog-aware overload of this method. Guarded so the (potentially large)
    // message string is only built when DEBUG is enabled.
    if (LOG.isDebugEnabled()) {
        LOG.debug("useDensityFunctionForNDVEstimation = " + useDensityFunctionForNDVEstimation + "\npartsFound = " + partsFound + "\nColumnStatisticsObj = " + Arrays.toString(colStatsList.toArray()));
    }
    return new AggrStats(colStatsList, partsFound);
}
Also used : AggrStats(org.apache.hadoop.hive.metastore.api.AggrStats) ArrayList(java.util.ArrayList) AggrColStats(org.apache.hadoop.hive.metastore.AggregateStatsCache.AggrColStats) MConstraint(org.apache.hadoop.hive.metastore.model.MConstraint) BloomFilter(org.apache.hive.common.util.BloomFilter) ColumnStatisticsObj(org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj)

Example 2 with BloomFilter

use of org.apache.hive.common.util.BloomFilter in project hive by apache.

In the class MetaStoreDirectSql, method aggrColStatsForPartitions:

/**
 * Returns aggregated column statistics for a set of partitions of a table.
 * If the aggregate stats cache is enabled and the partition set is small enough to
 * be held in a single cache node, each column is first looked up in the cache; any
 * miss is aggregated from the DB and the resulting node is inserted into the cache.
 * Otherwise all columns are aggregated straight from the DB.
 *
 * @param catName catalog name
 * @param dbName database name
 * @param tableName table name
 * @param partNames partitions whose stats are to be aggregated
 * @param colNames columns whose stats are to be aggregated
 * @param engine engine for which the stats were recorded
 * @param useDensityFunctionForNDVEstimation whether NDV is estimated via the density function
 * @param ndvTuner tuning knob for NDV estimation
 * @param enableBitVector whether bit-vector based aggregation is enabled
 * @return aggregated stats together with the number of partitions for which stats were found
 * @throws MetaException on metastore access failure
 */
public AggrStats aggrColStatsForPartitions(String catName, String dbName, String tableName, List<String> partNames, List<String> colNames, String engine, boolean useDensityFunctionForNDVEstimation, double ndvTuner, boolean enableBitVector) throws MetaException {
    if (colNames.isEmpty() || partNames.isEmpty()) {
        LOG.debug("Columns is empty or partNames is empty : Short-circuiting stats eval");
        // No columns or no partitions: nothing to aggregate.
        return new AggrStats(Collections.<ColumnStatisticsObj>emptyList(), 0);
    }
    long partsFound = 0;
    List<ColumnStatisticsObj> results;
    boolean cacheUsable = isAggregateStatsCacheEnabled
        && partNames.size() < aggrStatsCache.getMaxPartsPerCacheNode();
    if (cacheUsable) {
        results = new ArrayList<ColumnStatisticsObj>();
        // Bloom filter describing this partition set; attached to every node we insert.
        BloomFilter partsFilter = createPartsBloomFilter(aggrStatsCache.getMaxPartsPerCacheNode(),
            aggrStatsCache.getFalsePositiveProbability(), partNames);
        // partsFound is expensive to compute, so it is resolved lazily on the first
        // cache miss and reused for every subsequent miss in this call.
        boolean partsFoundKnown = false;
        for (String col : colNames) {
            AggrColStats cached = aggrStatsCache.get(catName, dbName, tableName, col, partNames);
            if (cached != null) {
                results.add(cached.getColStats());
                partsFound = cached.getNumPartsCached();
                continue;
            }
            if (!partsFoundKnown) {
                partsFound = partsFoundForPartitions(catName, dbName, tableName, partNames, colNames, engine);
                partsFoundKnown = true;
            }
            // Aggregate this single column from the DB.
            List<String> singleCol = new ArrayList<>();
            singleCol.add(col);
            List<ColumnStatisticsObj> fromDb = columnStatisticsObjForPartitions(catName, dbName, tableName, partNames, singleCol, engine, partsFound, useDensityFunctionForNDVEstimation, ndvTuner, enableBitVector);
            if (!fromDb.isEmpty()) {
                ColumnStatisticsObj aggregated = fromDb.get(0);
                results.add(aggregated);
                // Publish the freshly computed aggregate as a new cache node.
                aggrStatsCache.add(catName, dbName, tableName, col, partsFound, aggregated, partsFilter);
            }
        }
    } else {
        // Cache disabled or partition set too large for a cache node: go straight to the DB.
        partsFound = partsFoundForPartitions(catName, dbName, tableName, partNames, colNames, engine);
        results = columnStatisticsObjForPartitions(catName, dbName, tableName, partNames, colNames, engine, partsFound, useDensityFunctionForNDVEstimation, ndvTuner, enableBitVector);
    }
    LOG.debug("useDensityFunctionForNDVEstimation = " + useDensityFunctionForNDVEstimation + "\npartsFound = " + partsFound + "\nColumnStatisticsObj = " + Arrays.toString(results.toArray()));
    return new AggrStats(results, partsFound);
}
Also used : AggrStats(org.apache.hadoop.hive.metastore.api.AggrStats) ArrayList(java.util.ArrayList) AggrColStats(org.apache.hadoop.hive.metastore.AggregateStatsCache.AggrColStats) MConstraint(org.apache.hadoop.hive.metastore.model.MConstraint) SQLCheckConstraint(org.apache.hadoop.hive.metastore.api.SQLCheckConstraint) SQLNotNullConstraint(org.apache.hadoop.hive.metastore.api.SQLNotNullConstraint) SQLUniqueConstraint(org.apache.hadoop.hive.metastore.api.SQLUniqueConstraint) SQLDefaultConstraint(org.apache.hadoop.hive.metastore.api.SQLDefaultConstraint) BloomFilter(org.apache.hive.common.util.BloomFilter) ColumnStatisticsObj(org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj)

Example 3 with BloomFilter

use of org.apache.hive.common.util.BloomFilter in project hive by apache.

In the class TestAggregateStatsCache, method testAddGetWithVariance:

@Test
public void testAddGetWithVariance() throws Exception {
    // Cache an aggregate node covering nine partitions of tab1: [tab1part1..tab1part9].
    List<String> cachedParts = preparePartNames(tables.get(0), 1, 9);
    BloomFilter filter = prepareBloomFilter(cachedParts);
    String table = tables.get(0);
    String column = tabCols.get(0);
    // Dummy long-column stats (high=100, low=10, NDVs=50, nulls=5) standing in for
    // the aggregate over those partitions.
    ColumnStatisticsObj cachedStats = getDummyLongColStat(column, 100, 10, 50, 5);
    cache.add(DEFAULT_CATALOG_NAME, DB_NAME, table, column, 10, cachedStats, filter);
    // Request only 5 of the partitions: variance (10-5)/5 exceeds MAX_VARIANCE (0.5),
    // so the lookup must miss.
    List<String> requested = preparePartNames(tables.get(0), 1, 5);
    AggrColStats hit = cache.get(DEFAULT_CATALOG_NAME, DB_NAME, table, column, requested);
    Assert.assertNull(hit);
    // Request 10 partitions [tab1part11..tab1part20] with no overlap at all:
    // variance (10-0)/10 is also past MAX_VARIANCE (0.5), so this must miss too.
    requested = preparePartNames(tables.get(0), 11, 20);
    hit = cache.get(DEFAULT_CATALOG_NAME, DB_NAME, table, column, requested);
    Assert.assertNull(hit);
    // Request partitions [tab1part1..tab1part8], all contained in the cached node and
    // close enough in count that the variance stays within MAX_VARIANCE (0.5): a hit.
    requested = preparePartNames(tables.get(0), 1, 8);
    hit = cache.get(DEFAULT_CATALOG_NAME, DB_NAME, table, column, requested);
    Assert.assertNotNull(hit);
    // The cached stats object itself must come back unchanged.
    ColumnStatisticsObj returnedStats = hit.getColStats();
    Assert.assertEquals(cachedStats, returnedStats);
}
Also used : ColumnStatisticsObj(org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj) AggrColStats(org.apache.hadoop.hive.metastore.AggregateStatsCache.AggrColStats) BloomFilter(org.apache.hive.common.util.BloomFilter) Test(org.junit.Test) MetastoreUnitTest(org.apache.hadoop.hive.metastore.annotation.MetastoreUnitTest)

Example 4 with BloomFilter

use of org.apache.hive.common.util.BloomFilter in project hive by apache.

In the class TestAggregateStatsCache, method testTimeToLive:

@Test
public void testTimeToLive() throws Exception {
    // Cache an aggregate node for partitions [tab1part1..tab1part9] of tab1.
    List<String> parts = preparePartNames(tables.get(0), 1, 9);
    BloomFilter filter = prepareBloomFilter(parts);
    String table = tables.get(0);
    String column = tabCols.get(0);
    // Dummy long-column stats (high=100, low=10, NDVs=50, nulls=5) for the node.
    ColumnStatisticsObj stats = getDummyLongColStat(column, 100, 10, 50, 5);
    cache.add(DEFAULT_CATALOG_NAME, DB_NAME, table, column, 10, stats, filter);
    // The cache is configured with a 2s TTL; sleeping 3s guarantees expiry.
    Thread.sleep(3000);
    // The entry has outlived its TTL, so the lookup must now miss.
    AggrColStats expired = cache.get(DEFAULT_CATALOG_NAME, DB_NAME, table, column, parts);
    Assert.assertNull(expired);
}
Also used : ColumnStatisticsObj(org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj) AggrColStats(org.apache.hadoop.hive.metastore.AggregateStatsCache.AggrColStats) BloomFilter(org.apache.hive.common.util.BloomFilter) Test(org.junit.Test) MetastoreUnitTest(org.apache.hadoop.hive.metastore.annotation.MetastoreUnitTest)

Example 5 with BloomFilter

use of org.apache.hive.common.util.BloomFilter in project hive by apache.

In the class HBaseReadWrite, method putAggregatedStats:

/**
   * Put aggregated stats.  Only intended for use by
   * {@link org.apache.hadoop.hive.metastore.hbase.StatsCache}.  Others should not call directly
   * but should call StatsCache.put instead.
   * Writes two cells under the given key: a serialized Bloom filter over the partition
   * names (so readers can cheaply test partition membership) and the serialized stats.
   * @param key The md5 hash associated with this partition set
   * @param dbName Database these partitions are in
   * @param tableName Table these partitions are in
   * @param partNames Partition names
   * @param colName Column stats are for
   * @param stats Stats
   * @throws IOException if the write to the store fails
   */
void putAggregatedStats(byte[] key, String dbName, String tableName, List<String> partNames, String colName, AggrStats stats) throws IOException {
    // NOTE(review): the previous version built a "protoNames" list (dbName, tableName,
    // colName + partNames) that was never read anywhere in this method; that dead
    // allocation has been removed. The serialized payloads below are unchanged.
    // Build a bloom filter for these partitions
    BloomFilter bloom = new BloomFilter(partNames.size(), STATS_BF_ERROR_RATE);
    for (String partName : partNames) {
        bloom.add(partName.getBytes(HBaseUtils.ENCODING));
    }
    byte[] serializedFilter = HBaseUtils.serializeBloomFilter(dbName, tableName, bloom);
    byte[] serializedStats = HBaseUtils.serializeAggrStats(stats);
    store(AGGR_STATS_TABLE, key, CATALOG_CF, new byte[][] { AGGR_STATS_BLOOM_COL, AGGR_STATS_STATS_COL }, new byte[][] { serializedFilter, serializedStats });
}
Also used : ArrayList(java.util.ArrayList) BloomFilter(org.apache.hive.common.util.BloomFilter)

Aggregations

BloomFilter (org.apache.hive.common.util.BloomFilter)7 AggrColStats (org.apache.hadoop.hive.metastore.AggregateStatsCache.AggrColStats)5 ColumnStatisticsObj (org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj)5 ArrayList (java.util.ArrayList)3 MetastoreUnitTest (org.apache.hadoop.hive.metastore.annotation.MetastoreUnitTest)3 Test (org.junit.Test)3 AggrStats (org.apache.hadoop.hive.metastore.api.AggrStats)2 MConstraint (org.apache.hadoop.hive.metastore.model.MConstraint)2 SQLCheckConstraint (org.apache.hadoop.hive.metastore.api.SQLCheckConstraint)1 SQLDefaultConstraint (org.apache.hadoop.hive.metastore.api.SQLDefaultConstraint)1 SQLNotNullConstraint (org.apache.hadoop.hive.metastore.api.SQLNotNullConstraint)1 SQLUniqueConstraint (org.apache.hadoop.hive.metastore.api.SQLUniqueConstraint)1