Example 1 with AggrStats

Use of org.apache.hadoop.hive.metastore.api.AggrStats in project hive by apache.

From the class MetaStoreDirectSql, the method aggrColStatsForPartitions:

public AggrStats aggrColStatsForPartitions(String dbName, String tableName, List<String> partNames, List<String> colNames, boolean useDensityFunctionForNDVEstimation) throws MetaException {
    if (colNames.isEmpty() || partNames.isEmpty()) {
        LOG.debug("Columns is empty or partNames is empty : Short-circuiting stats eval");
        // Nothing to aggregate
        return new AggrStats(new ArrayList<ColumnStatisticsObj>(), 0);
    }
    long partsFound = 0;
    List<ColumnStatisticsObj> colStatsList;
    // Try to read from the cache first
    if (isAggregateStatsCacheEnabled && (partNames.size() < aggrStatsCache.getMaxPartsPerCacheNode())) {
        AggrColStats colStatsAggrCached;
        List<ColumnStatisticsObj> colStatsAggrFromDB;
        int maxPartsPerCacheNode = aggrStatsCache.getMaxPartsPerCacheNode();
        float fpp = aggrStatsCache.getFalsePositiveProbability();
        colStatsList = new ArrayList<ColumnStatisticsObj>();
        // Bloom filter for the new node that we will eventually add to the cache
        BloomFilter bloomFilter = createPartsBloomFilter(maxPartsPerCacheNode, fpp, partNames);
        boolean computePartsFound = true;
        for (String colName : colNames) {
            // Check the cache first
            colStatsAggrCached = aggrStatsCache.get(dbName, tableName, colName, partNames);
            if (colStatsAggrCached != null) {
                colStatsList.add(colStatsAggrCached.getColStats());
                partsFound = colStatsAggrCached.getNumPartsCached();
            } else {
                if (computePartsFound) {
                    partsFound = partsFoundForPartitions(dbName, tableName, partNames, colNames);
                    computePartsFound = false;
                }
                List<String> colNamesForDB = new ArrayList<String>();
                colNamesForDB.add(colName);
                // Read aggregated stats for one column
                colStatsAggrFromDB = columnStatisticsObjForPartitions(dbName, tableName, partNames, colNamesForDB, partsFound, useDensityFunctionForNDVEstimation);
                if (!colStatsAggrFromDB.isEmpty()) {
                    ColumnStatisticsObj colStatsAggr = colStatsAggrFromDB.get(0);
                    colStatsList.add(colStatsAggr);
                    // Update the cache to add this new aggregate node
                    aggrStatsCache.add(dbName, tableName, colName, partsFound, colStatsAggr, bloomFilter);
                }
            }
        }
    } else {
        partsFound = partsFoundForPartitions(dbName, tableName, partNames, colNames);
        colStatsList = columnStatisticsObjForPartitions(dbName, tableName, partNames, colNames, partsFound, useDensityFunctionForNDVEstimation);
    }
    LOG.info("useDensityFunctionForNDVEstimation = " + useDensityFunctionForNDVEstimation + "\npartsFound = " + partsFound + "\nColumnStatisticsObj = " + Arrays.toString(colStatsList.toArray()));
    return new AggrStats(colStatsList, partsFound);
}
Also used : AggrStats(org.apache.hadoop.hive.metastore.api.AggrStats) ArrayList(java.util.ArrayList) AggrColStats(org.apache.hadoop.hive.metastore.AggregateStatsCache.AggrColStats) MConstraint(org.apache.hadoop.hive.metastore.model.MConstraint) BloomFilter(org.apache.hive.common.util.BloomFilter) ColumnStatisticsObj(org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj)
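
A minimal caller sketch, not taken from the Hive source: it assumes an already-initialized MetaStoreDirectSql instance named directSql, and the db/table/partition/column names are hypothetical. It shows how the method above might be invoked and how the MetaException it declares would be handled:

// Hypothetical usage; directSql and all names below are illustrative.
List<String> partNames = Arrays.asList("ds=2017-01-01", "ds=2017-01-02");
List<String> colNames = Arrays.asList("col1");
try {
    AggrStats aggr = directSql.aggrColStatsForPartitions("default", "mytable", partNames, colNames, /* useDensityFunctionForNDVEstimation */ false);
    // partsFound reports how many of the requested partitions actually had stats.
    System.out.println("partsFound = " + aggr.getPartsFound() + ", columns aggregated = " + aggr.getColStatsSize());
} catch (MetaException e) {
    // Aggregation failures surface as MetaException to the metastore caller.
    throw new RuntimeException(e);
}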

Example 2 with AggrStats

Use of org.apache.hadoop.hive.metastore.api.AggrStats in project hive by apache.

From the class HBaseUtils, the method deserializeAggrStats:

static AggrStats deserializeAggrStats(byte[] serialized) throws IOException {
    HbaseMetastoreProto.AggrStats protoAggrStats = HbaseMetastoreProto.AggrStats.parseFrom(serialized);
    AggrStats aggrStats = new AggrStats();
    aggrStats.setPartsFound(protoAggrStats.getPartsFound());
    for (HbaseMetastoreProto.ColumnStats protoCS : protoAggrStats.getColStatsList()) {
        aggrStats.addToColStats(statsForOneColumnFromProtoBuf(null, protoCS));
    }
    return aggrStats;
}
Also used : AggrStats(org.apache.hadoop.hive.metastore.api.AggrStats)
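
A small round-trip sketch, hypothetical rather than from the Hive tests: it assumes the code runs in the same package as HBaseUtils (the method above is package-private) and uses the generated protobuf builder to produce the serialized bytes. No per-column stats are attached, so only partsFound survives the trip:

// Hypothetical round-trip check for the deserializer above.
byte[] serialized = HbaseMetastoreProto.AggrStats.newBuilder()
        .setPartsFound(2)
        .build()
        .toByteArray();
AggrStats aggrStats = HBaseUtils.deserializeAggrStats(serialized);
// The thrift-level object should reflect the protobuf field we set above.
assert aggrStats.getPartsFound() == 2;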

Example 3 with AggrStats

Use of org.apache.hadoop.hive.metastore.api.AggrStats in project hive by apache.

From the class TestHBaseAggrStatsCacheIntegration, the method alterInvalidation:

@Test
public void alterInvalidation() throws Exception {
    try {
        String dbName = "default";
        String tableName = "ai";
        List<String> partVals1 = Arrays.asList("today");
        List<String> partVals2 = Arrays.asList("yesterday");
        List<String> partVals3 = Arrays.asList("tomorrow");
        long now = System.currentTimeMillis();
        List<FieldSchema> cols = new ArrayList<>();
        cols.add(new FieldSchema("col1", "boolean", "nocomment"));
        SerDeInfo serde = new SerDeInfo("serde", "seriallib", null);
        StorageDescriptor sd = new StorageDescriptor(cols, "file:/tmp", "input", "output", false, 0, serde, null, null, Collections.<String, String>emptyMap());
        List<FieldSchema> partCols = new ArrayList<>();
        partCols.add(new FieldSchema("ds", "string", ""));
        Table table = new Table(tableName, dbName, "me", (int) now, (int) now, 0, sd, partCols, Collections.<String, String>emptyMap(), null, null, null);
        store.createTable(table);
        Partition[] partitions = new Partition[3];
        int partnum = 0;
        for (List<String> partVals : Arrays.asList(partVals1, partVals2, partVals3)) {
            StorageDescriptor psd = new StorageDescriptor(sd);
            psd.setLocation("file:/tmp/default/invalidation/ds=" + partVals.get(0));
            Partition part = new Partition(partVals, dbName, tableName, (int) now, (int) now, psd, Collections.<String, String>emptyMap());
            partitions[partnum++] = part;
            store.addPartition(part);
            ColumnStatistics cs = new ColumnStatistics();
            ColumnStatisticsDesc desc = new ColumnStatisticsDesc(false, dbName, tableName);
            desc.setLastAnalyzed(now);
            desc.setPartName("ds=" + partVals.get(0));
            cs.setStatsDesc(desc);
            ColumnStatisticsObj obj = new ColumnStatisticsObj();
            obj.setColName("col1");
            obj.setColType("boolean");
            ColumnStatisticsData data = new ColumnStatisticsData();
            BooleanColumnStatsData bcsd = new BooleanColumnStatsData();
            bcsd.setNumFalses(10);
            bcsd.setNumTrues(20);
            bcsd.setNumNulls(30);
            data.setBooleanStats(bcsd);
            obj.setStatsData(data);
            cs.addToStatsObj(obj);
            store.updatePartitionColumnStatistics(cs, partVals);
        }
        AggrStats aggrStats = store.get_aggr_stats_for(dbName, tableName, Arrays.asList("ds=today", "ds=tomorrow"), Arrays.asList("col1"));
        aggrStats = store.get_aggr_stats_for(dbName, tableName, Arrays.asList("ds=today", "ds=yesterday"), Arrays.asList("col1"));
        // Check that we had to build it from the stats
        Assert.assertEquals(0, store.backdoor().getStatsCache().hbaseHits.getCnt());
        Assert.assertEquals(2, store.backdoor().getStatsCache().totalGets.getCnt());
        Assert.assertEquals(2, store.backdoor().getStatsCache().misses.getCnt());
        // wake the invalidator and check again to make sure it isn't too aggressive about
        // removing our stuff.
        store.backdoor().getStatsCache().wakeInvalidator();
        Partition newPart = new Partition(partitions[2]);
        newPart.setLastAccessTime((int) System.currentTimeMillis());
        store.alterPartition(dbName, tableName, partVals3, newPart);
        store.backdoor().getStatsCache().setRunInvalidatorEvery(100);
        store.backdoor().getStatsCache().wakeInvalidator();
        aggrStats = store.get_aggr_stats_for(dbName, tableName, Arrays.asList("ds=tomorrow", "ds=today"), Arrays.asList("col1"));
        // Check that we missed, which means this aggregate was dropped from the cache.
        Assert.assertEquals(0, store.backdoor().getStatsCache().hbaseHits.getCnt());
        Assert.assertEquals(3, store.backdoor().getStatsCache().totalGets.getCnt());
        Assert.assertEquals(3, store.backdoor().getStatsCache().misses.getCnt());
        // Check that our other aggregate is still in the cache.
        aggrStats = store.get_aggr_stats_for(dbName, tableName, Arrays.asList("ds=yesterday", "ds=today"), Arrays.asList("col1"));
        Assert.assertEquals(0, store.backdoor().getStatsCache().hbaseHits.getCnt());
        Assert.assertEquals(4, store.backdoor().getStatsCache().totalGets.getCnt());
        Assert.assertEquals(3, store.backdoor().getStatsCache().misses.getCnt());
    } finally {
        store.backdoor().getStatsCache().setRunInvalidatorEvery(5000);
        store.backdoor().getStatsCache().setMaxTimeInCache(500000);
        store.backdoor().getStatsCache().wakeInvalidator();
    }
}
Also used : ColumnStatistics(org.apache.hadoop.hive.metastore.api.ColumnStatistics) BooleanColumnStatsData(org.apache.hadoop.hive.metastore.api.BooleanColumnStatsData) Partition(org.apache.hadoop.hive.metastore.api.Partition) Table(org.apache.hadoop.hive.metastore.api.Table) AggrStats(org.apache.hadoop.hive.metastore.api.AggrStats) FieldSchema(org.apache.hadoop.hive.metastore.api.FieldSchema) SerDeInfo(org.apache.hadoop.hive.metastore.api.SerDeInfo) ArrayList(java.util.ArrayList) StorageDescriptor(org.apache.hadoop.hive.metastore.api.StorageDescriptor) ColumnStatisticsObj(org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj) ColumnStatisticsDesc(org.apache.hadoop.hive.metastore.api.ColumnStatisticsDesc) ColumnStatisticsData(org.apache.hadoop.hive.metastore.api.ColumnStatisticsData) Test(org.junit.Test)

Example 4 with AggrStats

Use of org.apache.hadoop.hive.metastore.api.AggrStats in project hive by apache.

From the class TestHBaseAggregateStatsCache, the method allWithStats:

// Due to limitations in the Mock infrastructure we use for HBase testing, we can only test
// this for a single-column table, and we can't really test hits in HBase, only in-memory hits
// or building from scratch.  But it's still useful for covering many bugs.  More in-depth
// testing with multiple columns and with HBase hits is done in TestHBaseAggrStatsCacheIntegration.
@Test
public void allWithStats() throws Exception {
    String dbName = "default";
    String tableName = "hit";
    List<String> partVals1 = Arrays.asList("today");
    List<String> partVals2 = Arrays.asList("yesterday");
    long now = System.currentTimeMillis();
    List<FieldSchema> cols = new ArrayList<>();
    cols.add(new FieldSchema("col1", "boolean", "nocomment"));
    SerDeInfo serde = new SerDeInfo("serde", "seriallib", null);
    StorageDescriptor sd = new StorageDescriptor(cols, "file:/tmp", "input", "output", false, 0, serde, null, null, Collections.<String, String>emptyMap());
    List<FieldSchema> partCols = new ArrayList<>();
    partCols.add(new FieldSchema("ds", "string", ""));
    Table table = new Table(tableName, dbName, "me", (int) now, (int) now, 0, sd, partCols, Collections.<String, String>emptyMap(), null, null, null);
    store.createTable(table);
    for (List<String> partVals : Arrays.asList(partVals1, partVals2)) {
        StorageDescriptor psd = new StorageDescriptor(sd);
        psd.setLocation("file:/tmp/default/hit/ds=" + partVals.get(0));
        Partition part = new Partition(partVals, dbName, tableName, (int) now, (int) now, psd, Collections.<String, String>emptyMap());
        store.addPartition(part);
        ColumnStatistics cs = new ColumnStatistics();
        ColumnStatisticsDesc desc = new ColumnStatisticsDesc(false, dbName, tableName);
        desc.setLastAnalyzed(now);
        desc.setPartName("ds=" + partVals.get(0));
        cs.setStatsDesc(desc);
        ColumnStatisticsObj obj = new ColumnStatisticsObj();
        obj.setColName("col1");
        obj.setColType("boolean");
        ColumnStatisticsData data = new ColumnStatisticsData();
        BooleanColumnStatsData bcsd = new BooleanColumnStatsData();
        bcsd.setNumFalses(10);
        bcsd.setNumTrues(20);
        bcsd.setNumNulls(30);
        data.setBooleanStats(bcsd);
        obj.setStatsData(data);
        cs.addToStatsObj(obj);
        store.updatePartitionColumnStatistics(cs, partVals);
    }
    Checker statChecker = new Checker() {

        @Override
        public void checkStats(AggrStats aggrStats) throws Exception {
            Assert.assertEquals(2, aggrStats.getPartsFound());
            Assert.assertEquals(1, aggrStats.getColStatsSize());
            ColumnStatisticsObj cso = aggrStats.getColStats().get(0);
            Assert.assertEquals("col1", cso.getColName());
            Assert.assertEquals("boolean", cso.getColType());
            BooleanColumnStatsData bcsd = cso.getStatsData().getBooleanStats();
            Assert.assertEquals(20, bcsd.getNumFalses());
            Assert.assertEquals(40, bcsd.getNumTrues());
            Assert.assertEquals(60, bcsd.getNumNulls());
        }
    };
    AggrStats aggrStats = store.get_aggr_stats_for(dbName, tableName, Arrays.asList("ds=today", "ds=yesterday"), Arrays.asList("col1"));
    statChecker.checkStats(aggrStats);
    // Check that we had to build it from the stats
    Assert.assertEquals(0, store.backdoor().getStatsCache().hbaseHits.getCnt());
    Assert.assertEquals(1, store.backdoor().getStatsCache().totalGets.getCnt());
    Assert.assertEquals(1, store.backdoor().getStatsCache().misses.getCnt());
    // Call again; this time it should come from memory.  Also, reverse the name order this time
    // to ensure that we still hit.
    aggrStats = store.get_aggr_stats_for(dbName, tableName, Arrays.asList("ds=yesterday", "ds=today"), Arrays.asList("col1"));
    statChecker.checkStats(aggrStats);
    Assert.assertEquals(0, store.backdoor().getStatsCache().hbaseHits.getCnt());
    Assert.assertEquals(2, store.backdoor().getStatsCache().totalGets.getCnt());
    Assert.assertEquals(1, store.backdoor().getStatsCache().misses.getCnt());
}
Also used : ColumnStatistics(org.apache.hadoop.hive.metastore.api.ColumnStatistics) BooleanColumnStatsData(org.apache.hadoop.hive.metastore.api.BooleanColumnStatsData) Partition(org.apache.hadoop.hive.metastore.api.Partition) Table(org.apache.hadoop.hive.metastore.api.Table) AggrStats(org.apache.hadoop.hive.metastore.api.AggrStats) FieldSchema(org.apache.hadoop.hive.metastore.api.FieldSchema) SerDeInfo(org.apache.hadoop.hive.metastore.api.SerDeInfo) ArrayList(java.util.ArrayList) StorageDescriptor(org.apache.hadoop.hive.metastore.api.StorageDescriptor) ColumnStatisticsObj(org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj) ColumnStatisticsDesc(org.apache.hadoop.hive.metastore.api.ColumnStatisticsDesc) ColumnStatisticsData(org.apache.hadoop.hive.metastore.api.ColumnStatisticsData) Test(org.junit.Test)
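
The expected values in the Checker above (20 falses, 40 trues, 60 nulls) are the per-partition counters (10, 20, 30) summed over the two partitions. A minimal sketch of that summation, written here for illustration rather than copied from Hive's merge code:

// Illustrative only: boolean column stats aggregate by summing the counters.
static BooleanColumnStatsData sumBooleanStats(List<BooleanColumnStatsData> perPartition) {
    BooleanColumnStatsData total = new BooleanColumnStatsData();
    for (BooleanColumnStatsData p : perPartition) {
        total.setNumFalses(total.getNumFalses() + p.getNumFalses());
        total.setNumTrues(total.getNumTrues() + p.getNumTrues());
        total.setNumNulls(total.getNumNulls() + p.getNumNulls());
    }
    // Two partitions of (10, 20, 30) yield (20, 40, 60), matching the test.
    return total;
}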

Example 5 with AggrStats

Use of org.apache.hadoop.hive.metastore.api.AggrStats in project hive by apache.

From the class TestHBaseAggregateStatsCache, the method someNonexistentPartitions:

@Test
public void someNonexistentPartitions() throws Exception {
    String dbName = "default";
    String tableName = "snp";
    List<String> partVals1 = Arrays.asList("today");
    List<String> partVals2 = Arrays.asList("yesterday");
    long now = System.currentTimeMillis();
    List<FieldSchema> cols = new ArrayList<>();
    cols.add(new FieldSchema("col1", "boolean", "nocomment"));
    SerDeInfo serde = new SerDeInfo("serde", "seriallib", null);
    StorageDescriptor sd = new StorageDescriptor(cols, "file:/tmp", "input", "output", false, 0, serde, null, null, Collections.<String, String>emptyMap());
    List<FieldSchema> partCols = new ArrayList<>();
    partCols.add(new FieldSchema("ds", "string", ""));
    Table table = new Table(tableName, dbName, "me", (int) now, (int) now, 0, sd, partCols, Collections.<String, String>emptyMap(), null, null, null);
    store.createTable(table);
    StorageDescriptor psd = new StorageDescriptor(sd);
    psd.setLocation("file:/tmp/default/hit/ds=" + partVals1.get(0));
    Partition part = new Partition(partVals1, dbName, tableName, (int) now, (int) now, psd, Collections.<String, String>emptyMap());
    store.addPartition(part);
    ColumnStatistics cs = new ColumnStatistics();
    ColumnStatisticsDesc desc = new ColumnStatisticsDesc(false, dbName, tableName);
    desc.setLastAnalyzed(now);
    desc.setPartName("ds=" + partVals1.get(0));
    cs.setStatsDesc(desc);
    ColumnStatisticsObj obj = new ColumnStatisticsObj();
    obj.setColName("col1");
    obj.setColType("double");
    ColumnStatisticsData data = new ColumnStatisticsData();
    DoubleColumnStatsData dcsd = new DoubleColumnStatsData();
    dcsd.setHighValue(1000.2342343);
    dcsd.setLowValue(-20.1234213423);
    dcsd.setNumNulls(30);
    dcsd.setNumDVs(12342);
    data.setDoubleStats(dcsd);
    obj.setStatsData(data);
    cs.addToStatsObj(obj);
    store.updatePartitionColumnStatistics(cs, partVals1);
    Checker statChecker = new Checker() {

        @Override
        public void checkStats(AggrStats aggrStats) throws Exception {
            Assert.assertEquals(1, aggrStats.getPartsFound());
            Assert.assertEquals(1, aggrStats.getColStatsSize());
            ColumnStatisticsObj cso = aggrStats.getColStats().get(0);
            Assert.assertEquals("col1", cso.getColName());
            Assert.assertEquals("double", cso.getColType());
            DoubleColumnStatsData dcsd = cso.getStatsData().getDoubleStats();
            Assert.assertEquals(1000.23, dcsd.getHighValue(), 0.01);
            Assert.assertEquals(-20.12, dcsd.getLowValue(), 0.01);
            Assert.assertEquals(30, dcsd.getNumNulls());
            Assert.assertEquals(12342, dcsd.getNumDVs());
        }
    };
    AggrStats aggrStats = store.get_aggr_stats_for(dbName, tableName, Arrays.asList("ds=today", "ds=yesterday"), Arrays.asList("col1"));
    statChecker.checkStats(aggrStats);
    // Check that we had to build it from the stats
    Assert.assertEquals(0, store.backdoor().getStatsCache().hbaseHits.getCnt());
    Assert.assertEquals(1, store.backdoor().getStatsCache().totalGets.getCnt());
    Assert.assertEquals(1, store.backdoor().getStatsCache().misses.getCnt());
    // Call again; this time it should come from memory.  Also, reverse the name order this time
    // to ensure that we still hit.
    aggrStats = store.get_aggr_stats_for(dbName, tableName, Arrays.asList("ds=yesterday", "ds=today"), Arrays.asList("col1"));
    statChecker.checkStats(aggrStats);
    Assert.assertEquals(0, store.backdoor().getStatsCache().hbaseHits.getCnt());
    Assert.assertEquals(2, store.backdoor().getStatsCache().totalGets.getCnt());
    Assert.assertEquals(1, store.backdoor().getStatsCache().misses.getCnt());
}
Also used : ColumnStatistics(org.apache.hadoop.hive.metastore.api.ColumnStatistics) Partition(org.apache.hadoop.hive.metastore.api.Partition) Table(org.apache.hadoop.hive.metastore.api.Table) AggrStats(org.apache.hadoop.hive.metastore.api.AggrStats) FieldSchema(org.apache.hadoop.hive.metastore.api.FieldSchema) SerDeInfo(org.apache.hadoop.hive.metastore.api.SerDeInfo) ArrayList(java.util.ArrayList) StorageDescriptor(org.apache.hadoop.hive.metastore.api.StorageDescriptor) ColumnStatisticsObj(org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj) DoubleColumnStatsData(org.apache.hadoop.hive.metastore.api.DoubleColumnStatsData) ColumnStatisticsDesc(org.apache.hadoop.hive.metastore.api.ColumnStatisticsDesc) ColumnStatisticsData(org.apache.hadoop.hive.metastore.api.ColumnStatisticsData) Test(org.junit.Test)
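
Because only one of the two requested partitions exists, partsFound comes back as 1 rather than 2. A hypothetical caller-side check, not part of the Hive API, that uses partsFound to decide whether the aggregate covers every requested partition:

// Hypothetical helper, for illustration only.
static boolean coversAllPartitions(AggrStats aggrStats, List<String> requestedPartNames) {
    // The aggregate is "complete" only when every requested partition contributed stats.
    return aggrStats.getPartsFound() == requestedPartNames.size();
}
// Here coversAllPartitions(aggrStats, Arrays.asList("ds=today", "ds=yesterday")) is false.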

Aggregations

AggrStats (org.apache.hadoop.hive.metastore.api.AggrStats): 42 usages
ArrayList (java.util.ArrayList): 37 usages
ColumnStatisticsObj (org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj): 29 usages
FieldSchema (org.apache.hadoop.hive.metastore.api.FieldSchema): 29 usages
Partition (org.apache.hadoop.hive.metastore.api.Partition): 28 usages
Table (org.apache.hadoop.hive.metastore.api.Table): 28 usages
Test (org.junit.Test): 28 usages
SerDeInfo (org.apache.hadoop.hive.metastore.api.SerDeInfo): 27 usages
StorageDescriptor (org.apache.hadoop.hive.metastore.api.StorageDescriptor): 27 usages
ColumnStatistics (org.apache.hadoop.hive.metastore.api.ColumnStatistics): 26 usages
ColumnStatisticsData (org.apache.hadoop.hive.metastore.api.ColumnStatisticsData): 26 usages
ColumnStatisticsDesc (org.apache.hadoop.hive.metastore.api.ColumnStatisticsDesc): 25 usages
List (java.util.List): 21 usages
LongColumnStatsData (org.apache.hadoop.hive.metastore.api.LongColumnStatsData): 12 usages
BooleanColumnStatsData (org.apache.hadoop.hive.metastore.api.BooleanColumnStatsData): 5 usages
DoubleColumnStatsData (org.apache.hadoop.hive.metastore.api.DoubleColumnStatsData): 5 usages
MetaException (org.apache.hadoop.hive.metastore.api.MetaException): 4 usages
IOException (java.io.IOException): 3 usages
SQLCheckConstraint (org.apache.hadoop.hive.metastore.api.SQLCheckConstraint): 3 usages
SQLDefaultConstraint (org.apache.hadoop.hive.metastore.api.SQLDefaultConstraint): 3 usages