Search in sources:

Example 76 with ColumnStatisticsDesc

use of org.apache.hadoop.hive.metastore.api.ColumnStatisticsDesc in project hive by apache.

From the class CachedStore, method getTableColumnStatistics.

@Override
public ColumnStatistics getTableColumnStatistics(String dbName, String tblName, List<String> colNames) throws MetaException, NoSuchObjectException {
    dbName = StringUtils.normalizeIdentifier(dbName);
    tblName = StringUtils.normalizeIdentifier(tblName);
    if (!shouldCacheTable(dbName, tblName)) {
        return rawStore.getTableColumnStatistics(dbName, tblName, colNames);
    }
    Table table = sharedCache.getTableFromCache(dbName, tblName);
    if (table == null) {
        // The table is not yet loaded in cache
        return rawStore.getTableColumnStatistics(dbName, tblName, colNames);
    }
    ColumnStatisticsDesc csd = new ColumnStatisticsDesc(true, dbName, tblName);
    List<ColumnStatisticsObj> colStatObjs = sharedCache.getTableColStatsFromCache(dbName, tblName, colNames);
    return new ColumnStatistics(csd, colStatObjs);
}
Also used : ColumnStatistics(org.apache.hadoop.hive.metastore.api.ColumnStatistics) ColumnStatisticsObj(org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj) Table(org.apache.hadoop.hive.metastore.api.Table) ColumnStatisticsDesc(org.apache.hadoop.hive.metastore.api.ColumnStatisticsDesc)
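
A minimal caller-side sketch of the read path above, not taken from the Hive sources: it assumes an already-initialized RawStore (the CachedStore shown here, or the underlying ObjectStore) and uses placeholder database, table, and column names. The accessors on ColumnStatistics and ColumnStatisticsDesc are the Thrift-generated ones.

// Illustrative only: "mydb", "mytbl" and the column names are placeholders.
private static void printTableColStats(RawStore store) throws Exception {
    List<String> colNames = Arrays.asList("col1", "col2");
    ColumnStatistics cs = store.getTableColumnStatistics("mydb", "mytbl", colNames);
    ColumnStatisticsDesc desc = cs.getStatsDesc();
    // isIsTblLevel() is the Thrift-generated accessor for the isTblLevel flag set to true above
    System.out.println("tblLevel=" + desc.isIsTblLevel()
        + ", db=" + desc.getDbName()
        + ", table=" + desc.getTableName()
        + ", statsObjs=" + cs.getStatsObjSize());
}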

Example 77 with ColumnStatisticsDesc

use of org.apache.hadoop.hive.metastore.api.ColumnStatisticsDesc in project hive by apache.

From the class TestCachedStore, method testPartitionAggrStatsBitVector.

@Test
public void testPartitionAggrStatsBitVector() throws Exception {
    String dbName = "testTableColStatsOps2";
    String tblName = "tbl2";
    String colName = "f1";
    Database db = new Database(dbName, null, "some_location", null);
    cachedStore.createDatabase(db);
    List<FieldSchema> cols = new ArrayList<>();
    cols.add(new FieldSchema(colName, "int", null));
    List<FieldSchema> partCols = new ArrayList<>();
    partCols.add(new FieldSchema("col", "int", null));
    StorageDescriptor sd = new StorageDescriptor(cols, null, "input", "output", false, 0, new SerDeInfo("serde", "seriallib", new HashMap<>()), null, null, null);
    Table tbl = new Table(tblName, dbName, null, 0, 0, 0, sd, partCols, new HashMap<>(), null, null, TableType.MANAGED_TABLE.toString());
    cachedStore.createTable(tbl);
    List<String> partVals1 = new ArrayList<>();
    partVals1.add("1");
    List<String> partVals2 = new ArrayList<>();
    partVals2.add("2");
    Partition ptn1 = new Partition(partVals1, dbName, tblName, 0, 0, sd, new HashMap<>());
    cachedStore.addPartition(ptn1);
    Partition ptn2 = new Partition(partVals2, dbName, tblName, 0, 0, sd, new HashMap<>());
    cachedStore.addPartition(ptn2);
    ColumnStatistics stats = new ColumnStatistics();
    ColumnStatisticsDesc statsDesc = new ColumnStatisticsDesc(true, dbName, tblName);
    statsDesc.setPartName("col");
    List<ColumnStatisticsObj> colStatObjs = new ArrayList<>();
    ColumnStatisticsData data = new ColumnStatisticsData();
    ColumnStatisticsObj colStats = new ColumnStatisticsObj(colName, "int", data);
    LongColumnStatsDataInspector longStats = new LongColumnStatsDataInspector();
    longStats.setLowValue(0);
    longStats.setHighValue(100);
    longStats.setNumNulls(50);
    longStats.setNumDVs(30);
    HyperLogLog hll = HyperLogLog.builder().build();
    hll.addLong(1);
    hll.addLong(2);
    hll.addLong(3);
    longStats.setBitVectors(hll.serialize());
    data.setLongStats(longStats);
    colStatObjs.add(colStats);
    stats.setStatsDesc(statsDesc);
    stats.setStatsObj(colStatObjs);
    cachedStore.updatePartitionColumnStatistics(stats.deepCopy(), partVals1);
    longStats.setNumDVs(40);
    hll = HyperLogLog.builder().build();
    hll.addLong(2);
    hll.addLong(3);
    hll.addLong(4);
    hll.addLong(5);
    longStats.setBitVectors(hll.serialize());
    cachedStore.updatePartitionColumnStatistics(stats.deepCopy(), partVals2);
    List<String> colNames = new ArrayList<>();
    colNames.add(colName);
    List<String> aggrPartVals = new ArrayList<>();
    aggrPartVals.add("1");
    aggrPartVals.add("2");
    AggrStats aggrStats = cachedStore.get_aggr_stats_for(dbName, tblName, aggrPartVals, colNames);
    Assert.assertEquals(aggrStats.getColStats().get(0).getStatsData().getLongStats().getNumNulls(), 100);
    Assert.assertEquals(aggrStats.getColStats().get(0).getStatsData().getLongStats().getNumDVs(), 5);
    aggrStats = cachedStore.get_aggr_stats_for(dbName, tblName, aggrPartVals, colNames);
    Assert.assertEquals(aggrStats.getColStats().get(0).getStatsData().getLongStats().getNumNulls(), 100);
    Assert.assertEquals(aggrStats.getColStats().get(0).getStatsData().getLongStats().getNumDVs(), 5);
}
Also used : ColumnStatistics(org.apache.hadoop.hive.metastore.api.ColumnStatistics) Partition(org.apache.hadoop.hive.metastore.api.Partition) Table(org.apache.hadoop.hive.metastore.api.Table) HashMap(java.util.HashMap) AggrStats(org.apache.hadoop.hive.metastore.api.AggrStats) FieldSchema(org.apache.hadoop.hive.metastore.api.FieldSchema) SerDeInfo(org.apache.hadoop.hive.metastore.api.SerDeInfo) ArrayList(java.util.ArrayList) StorageDescriptor(org.apache.hadoop.hive.metastore.api.StorageDescriptor) ColumnStatisticsObj(org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj) ColumnStatisticsDesc(org.apache.hadoop.hive.metastore.api.ColumnStatisticsDesc) Database(org.apache.hadoop.hive.metastore.api.Database) LongColumnStatsDataInspector(org.apache.hadoop.hive.metastore.columnstats.cache.LongColumnStatsDataInspector) ColumnStatisticsData(org.apache.hadoop.hive.metastore.api.ColumnStatisticsData) HyperLogLog(org.apache.hadoop.hive.common.ndv.hll.HyperLogLog) Test(org.junit.Test) MetastoreCheckinTest(org.apache.hadoop.hive.metastore.annotation.MetastoreCheckinTest)
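
The NumDVs == 5 assertions above follow from merging the two partitions' HyperLogLog bit vectors ({1, 2, 3} and {2, 3, 4, 5}) rather than from the declared per-partition NumDVs of 30 and 40. Below is a standalone sketch of that merge, assuming HyperLogLog's merge() and count() methods behave as their names suggest.

// Build the same two sketches as the test and merge them.
HyperLogLog hllA = HyperLogLog.builder().build();
hllA.addLong(1);
hllA.addLong(2);
hllA.addLong(3);
HyperLogLog hllB = HyperLogLog.builder().build();
hllB.addLong(2);
hllB.addLong(3);
hllB.addLong(4);
hllB.addLong(5);
// merge() folds hllB into hllA; count() returns the NDV estimate (assumed API).
hllA.merge(hllB);
System.out.println("estimated NDV = " + hllA.count()); // ~5: the union {1, 2, 3, 4, 5}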

Example 78 with ColumnStatisticsDesc

use of org.apache.hadoop.hive.metastore.api.ColumnStatisticsDesc in project hive by apache.

From the class TestCachedStore, method testTableColStatsOps.

// @Test
public void testTableColStatsOps() throws Exception {
    // Add a db via ObjectStore
    String dbName = "testTableColStatsOps";
    String dbOwner = "user1";
    Database db = createTestDb(dbName, dbOwner);
    objectStore.createDatabase(db);
    db = objectStore.getDatabase(dbName);
    // Add a table via ObjectStore
    final String tblName = "tbl";
    final String tblOwner = "user1";
    final FieldSchema col1 = new FieldSchema("col1", "int", "integer column");
    // Stats values for col1
    long col1LowVal = 5;
    long col1HighVal = 500;
    long col1Nulls = 10;
    long col1DV = 20;
    final FieldSchema col2 = new FieldSchema("col2", "string", "string column");
    // Stats values for col2
    long col2MaxColLen = 100;
    double col2AvgColLen = 45.5;
    long col2Nulls = 5;
    long col2DV = 40;
    final FieldSchema col3 = new FieldSchema("col3", "boolean", "boolean column");
    // Stats values for col3
    long col3NumTrues = 100;
    long col3NumFalses = 30;
    long col3Nulls = 10;
    final List<FieldSchema> cols = new ArrayList<>();
    cols.add(col1);
    cols.add(col2);
    cols.add(col3);
    FieldSchema ptnCol1 = new FieldSchema("part1", "string", "string partition column");
    List<FieldSchema> ptnCols = new ArrayList<FieldSchema>();
    ptnCols.add(ptnCol1);
    Table tbl = createTestTbl(dbName, tblName, tblOwner, cols, ptnCols);
    objectStore.createTable(tbl);
    tbl = objectStore.getTable(dbName, tblName);
    // Add ColumnStatistics for tbl to metastore DB via ObjectStore
    ColumnStatistics stats = new ColumnStatistics();
    ColumnStatisticsDesc statsDesc = new ColumnStatisticsDesc(true, dbName, tblName);
    List<ColumnStatisticsObj> colStatObjs = new ArrayList<>();
    // Col1
    ColumnStatisticsData data1 = new ColumnStatisticsData();
    ColumnStatisticsObj col1Stats = new ColumnStatisticsObj(col1.getName(), col1.getType(), data1);
    LongColumnStatsDataInspector longStats = new LongColumnStatsDataInspector();
    longStats.setLowValue(col1LowVal);
    longStats.setHighValue(col1HighVal);
    longStats.setNumNulls(col1Nulls);
    longStats.setNumDVs(col1DV);
    data1.setLongStats(longStats);
    colStatObjs.add(col1Stats);
    // Col2
    ColumnStatisticsData data2 = new ColumnStatisticsData();
    ColumnStatisticsObj col2Stats = new ColumnStatisticsObj(col2.getName(), col2.getType(), data2);
    StringColumnStatsDataInspector stringStats = new StringColumnStatsDataInspector();
    stringStats.setMaxColLen(col2MaxColLen);
    stringStats.setAvgColLen(col2AvgColLen);
    stringStats.setNumNulls(col2Nulls);
    stringStats.setNumDVs(col2DV);
    data2.setStringStats(stringStats);
    colStatObjs.add(col2Stats);
    // Col3
    ColumnStatisticsData data3 = new ColumnStatisticsData();
    ColumnStatisticsObj col3Stats = new ColumnStatisticsObj(col3.getName(), col3.getType(), data3);
    BooleanColumnStatsData boolStats = new BooleanColumnStatsData();
    boolStats.setNumTrues(col3NumTrues);
    boolStats.setNumFalses(col3NumFalses);
    boolStats.setNumNulls(col3Nulls);
    data3.setBooleanStats(boolStats);
    colStatObjs.add(col3Stats);
    stats.setStatsDesc(statsDesc);
    stats.setStatsObj(colStatObjs);
    // Save to DB
    objectStore.updateTableColumnStatistics(stats);
    // Prewarm CachedStore
    CachedStore.setCachePrewarmedState(false);
    CachedStore.prewarm(objectStore);
    // Read table stats via CachedStore
    ColumnStatistics newStats = cachedStore.getTableColumnStatistics(dbName, tblName, Arrays.asList(col1.getName(), col2.getName(), col3.getName()));
    Assert.assertEquals(stats, newStats);
    // Clean up
    objectStore.dropTable(dbName, tblName);
    objectStore.dropDatabase(dbName);
    sharedCache.getDatabaseCache().clear();
    sharedCache.getTableCache().clear();
    sharedCache.getSdCache().clear();
}
Also used : ColumnStatistics(org.apache.hadoop.hive.metastore.api.ColumnStatistics) BooleanColumnStatsData(org.apache.hadoop.hive.metastore.api.BooleanColumnStatsData) Table(org.apache.hadoop.hive.metastore.api.Table) FieldSchema(org.apache.hadoop.hive.metastore.api.FieldSchema) ArrayList(java.util.ArrayList) ColumnStatisticsObj(org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj) ColumnStatisticsDesc(org.apache.hadoop.hive.metastore.api.ColumnStatisticsDesc) Database(org.apache.hadoop.hive.metastore.api.Database) LongColumnStatsDataInspector(org.apache.hadoop.hive.metastore.columnstats.cache.LongColumnStatsDataInspector) StringColumnStatsDataInspector(org.apache.hadoop.hive.metastore.columnstats.cache.StringColumnStatsDataInspector) ColumnStatisticsData(org.apache.hadoop.hive.metastore.api.ColumnStatisticsData)
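
A hedged follow-up sketch, reusing the variables of the test above and belonging before its cleanup calls: it reads back a single column through the prewarmed CachedStore and unpacks the typed stats from the ColumnStatisticsData union. The extra assertions are illustrative and are not part of the Hive test.

// Fetch only col3 and inspect its boolean stats.
ColumnStatistics col3Only = cachedStore.getTableColumnStatistics(dbName, tblName, Arrays.asList(col3.getName()));
ColumnStatisticsObj obj = col3Only.getStatsObj().get(0);
// ColumnStatisticsData is a Thrift union; getBooleanStats() returns the branch set via setBooleanStats above.
BooleanColumnStatsData boolData = obj.getStatsData().getBooleanStats();
Assert.assertEquals(col3NumTrues, boolData.getNumTrues());
Assert.assertEquals(col3NumFalses, boolData.getNumFalses());
Assert.assertEquals(col3Nulls, boolData.getNumNulls());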

Example 79 with ColumnStatisticsDesc

use of org.apache.hadoop.hive.metastore.api.ColumnStatisticsDesc in project hive by apache.

From the class TestCachedStore, method testPartitionAggrStats.

@Test
public void testPartitionAggrStats() throws Exception {
    String dbName = "testTableColStatsOps1";
    String tblName = "tbl1";
    String colName = "f1";
    Database db = new Database(dbName, null, "some_location", null);
    cachedStore.createDatabase(db);
    List<FieldSchema> cols = new ArrayList<>();
    cols.add(new FieldSchema(colName, "int", null));
    List<FieldSchema> partCols = new ArrayList<>();
    partCols.add(new FieldSchema("col", "int", null));
    StorageDescriptor sd = new StorageDescriptor(cols, null, "input", "output", false, 0, new SerDeInfo("serde", "seriallib", new HashMap<>()), null, null, null);
    Table tbl = new Table(tblName, dbName, null, 0, 0, 0, sd, partCols, new HashMap<>(), null, null, TableType.MANAGED_TABLE.toString());
    cachedStore.createTable(tbl);
    List<String> partVals1 = new ArrayList<>();
    partVals1.add("1");
    List<String> partVals2 = new ArrayList<>();
    partVals2.add("2");
    Partition ptn1 = new Partition(partVals1, dbName, tblName, 0, 0, sd, new HashMap<>());
    cachedStore.addPartition(ptn1);
    Partition ptn2 = new Partition(partVals2, dbName, tblName, 0, 0, sd, new HashMap<>());
    cachedStore.addPartition(ptn2);
    ColumnStatistics stats = new ColumnStatistics();
    ColumnStatisticsDesc statsDesc = new ColumnStatisticsDesc(true, dbName, tblName);
    statsDesc.setPartName("col");
    List<ColumnStatisticsObj> colStatObjs = new ArrayList<>();
    ColumnStatisticsData data = new ColumnStatisticsData();
    ColumnStatisticsObj colStats = new ColumnStatisticsObj(colName, "int", data);
    LongColumnStatsDataInspector longStats = new LongColumnStatsDataInspector();
    longStats.setLowValue(0);
    longStats.setHighValue(100);
    longStats.setNumNulls(50);
    longStats.setNumDVs(30);
    data.setLongStats(longStats);
    colStatObjs.add(colStats);
    stats.setStatsDesc(statsDesc);
    stats.setStatsObj(colStatObjs);
    cachedStore.updatePartitionColumnStatistics(stats.deepCopy(), partVals1);
    longStats.setNumDVs(40);
    cachedStore.updatePartitionColumnStatistics(stats.deepCopy(), partVals2);
    List<String> colNames = new ArrayList<>();
    colNames.add(colName);
    List<String> aggrPartVals = new ArrayList<>();
    aggrPartVals.add("1");
    aggrPartVals.add("2");
    AggrStats aggrStats = cachedStore.get_aggr_stats_for(dbName, tblName, aggrPartVals, colNames);
    Assert.assertEquals(aggrStats.getColStats().get(0).getStatsData().getLongStats().getNumNulls(), 100);
    Assert.assertEquals(aggrStats.getColStats().get(0).getStatsData().getLongStats().getNumDVs(), 40);
    aggrStats = cachedStore.get_aggr_stats_for(dbName, tblName, aggrPartVals, colNames);
    Assert.assertEquals(aggrStats.getColStats().get(0).getStatsData().getLongStats().getNumNulls(), 100);
    Assert.assertEquals(aggrStats.getColStats().get(0).getStatsData().getLongStats().getNumDVs(), 40);
}
Also used : ColumnStatistics(org.apache.hadoop.hive.metastore.api.ColumnStatistics) Partition(org.apache.hadoop.hive.metastore.api.Partition) Table(org.apache.hadoop.hive.metastore.api.Table) HashMap(java.util.HashMap) AggrStats(org.apache.hadoop.hive.metastore.api.AggrStats) FieldSchema(org.apache.hadoop.hive.metastore.api.FieldSchema) SerDeInfo(org.apache.hadoop.hive.metastore.api.SerDeInfo) ArrayList(java.util.ArrayList) StorageDescriptor(org.apache.hadoop.hive.metastore.api.StorageDescriptor) ColumnStatisticsObj(org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj) ColumnStatisticsDesc(org.apache.hadoop.hive.metastore.api.ColumnStatisticsDesc) Database(org.apache.hadoop.hive.metastore.api.Database) LongColumnStatsDataInspector(org.apache.hadoop.hive.metastore.columnstats.cache.LongColumnStatsDataInspector) ColumnStatisticsData(org.apache.hadoop.hive.metastore.api.ColumnStatisticsData) Test(org.junit.Test) MetastoreCheckinTest(org.apache.hadoop.hive.metastore.annotation.MetastoreCheckinTest)
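
Without bit vectors attached, the assertions above imply a simple merge rule: null counts are summed across partitions (50 + 50 = 100) and the distinct-value count falls back to the largest per-partition NumDVs (max(30, 40) = 40). The following is a plain-Java sketch of just that arithmetic; the real merging happens inside Hive's column-stats aggregators, so this is not the Hive API.

// Per-partition inputs taken from the test above.
long[] partNumNulls = { 50, 50 };
long[] partNumDVs = { 30, 40 };
long aggrNulls = 0;
long aggrDVs = 0;
for (int i = 0; i < partNumNulls.length; i++) {
    aggrNulls += partNumNulls[i];               // null counts are additive
    aggrDVs = Math.max(aggrDVs, partNumDVs[i]); // NDV without sketches: take the max
}
System.out.println("numNulls=" + aggrNulls + ", numDVs=" + aggrDVs); // 100, 40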

Example 80 with ColumnStatisticsDesc

use of org.apache.hadoop.hive.metastore.api.ColumnStatisticsDesc in project hive by apache.

From the class ObjectStore, method updateTableColumnStatistics.

@Override
public boolean updateTableColumnStatistics(ColumnStatistics colStats) throws NoSuchObjectException, MetaException, InvalidObjectException, InvalidInputException {
    boolean committed = false;
    openTransaction();
    try {
        List<ColumnStatisticsObj> statsObjs = colStats.getStatsObj();
        ColumnStatisticsDesc statsDesc = colStats.getStatsDesc();
        // DataNucleus objects get detached all over the place for no (real) reason.
        // So let's not use them anywhere unless absolutely necessary.
        Table table = ensureGetTable(statsDesc.getDbName(), statsDesc.getTableName());
        List<String> colNames = new ArrayList<>();
        for (ColumnStatisticsObj statsObj : statsObjs) {
            colNames.add(statsObj.getColName());
        }
        Map<String, MTableColumnStatistics> oldStats = getPartitionColStats(table, colNames);
        for (ColumnStatisticsObj statsObj : statsObjs) {
            // We have to get mtable again because DataNucleus.
            MTableColumnStatistics mStatsObj = StatObjectConverter.convertToMTableColumnStatistics(ensureGetMTable(statsDesc.getDbName(), statsDesc.getTableName()), statsDesc, statsObj);
            writeMTableColumnStatistics(table, mStatsObj, oldStats.get(statsObj.getColName()));
        // There is no need to add colname again, otherwise we will get duplicate colNames.
        }
        // Set the table properties
        // No need to check again if it exists.
        String dbname = table.getDbName();
        String name = table.getTableName();
        MTable oldt = getMTable(dbname, name);
        Map<String, String> parameters = table.getParameters();
        StatsSetupConst.setColumnStatsState(parameters, colNames);
        oldt.setParameters(parameters);
        committed = commitTransaction();
        return committed;
    } finally {
        if (!committed) {
            rollbackTransaction();
        }
    }
}
Also used : ColumnStatisticsObj(org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj) MVersionTable(org.apache.hadoop.hive.metastore.model.MVersionTable) Table(org.apache.hadoop.hive.metastore.api.Table) MTable(org.apache.hadoop.hive.metastore.model.MTable) MTable(org.apache.hadoop.hive.metastore.model.MTable) MTableColumnStatistics(org.apache.hadoop.hive.metastore.model.MTableColumnStatistics) ColumnStatisticsDesc(org.apache.hadoop.hive.metastore.api.ColumnStatisticsDesc) ArrayList(java.util.ArrayList)
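
A minimal caller-side sketch of feeding this method, not taken from the Hive sources: it assumes an initialized ObjectStore (or any RawStore) named store and uses placeholder database, table, and column names. The ColumnStatisticsDesc marks the stats as table-level, which is what statsDesc.getDbName()/getTableName() above rely on.

// Build table-level stats for one bigint column and persist them.
ColumnStatisticsDesc desc = new ColumnStatisticsDesc(true, "mydb", "mytbl");
LongColumnStatsData longData = new LongColumnStatsData();
longData.setLowValue(1);
longData.setHighValue(99);
longData.setNumNulls(0);
longData.setNumDVs(10);
ColumnStatisticsData data = new ColumnStatisticsData();
data.setLongStats(longData);
ColumnStatisticsObj obj = new ColumnStatisticsObj("c1", "bigint", data);
ColumnStatistics colStats = new ColumnStatistics(desc, Arrays.asList(obj));
boolean committed = store.updateTableColumnStatistics(colStats); // true if the transaction committed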

Aggregations

ColumnStatisticsDesc (org.apache.hadoop.hive.metastore.api.ColumnStatisticsDesc): 81
ColumnStatistics (org.apache.hadoop.hive.metastore.api.ColumnStatistics): 69
ColumnStatisticsObj (org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj): 63
ColumnStatisticsData (org.apache.hadoop.hive.metastore.api.ColumnStatisticsData): 56
ArrayList (java.util.ArrayList): 54
Test (org.junit.Test): 53
Table (org.apache.hadoop.hive.metastore.api.Table): 37
FieldSchema (org.apache.hadoop.hive.metastore.api.FieldSchema): 36
StorageDescriptor (org.apache.hadoop.hive.metastore.api.StorageDescriptor): 32
Partition (org.apache.hadoop.hive.metastore.api.Partition): 31
SerDeInfo (org.apache.hadoop.hive.metastore.api.SerDeInfo): 31
AggrStats (org.apache.hadoop.hive.metastore.api.AggrStats): 28
List (java.util.List): 22
LongColumnStatsData (org.apache.hadoop.hive.metastore.api.LongColumnStatsData): 18
BooleanColumnStatsData (org.apache.hadoop.hive.metastore.api.BooleanColumnStatsData): 12
DoubleColumnStatsData (org.apache.hadoop.hive.metastore.api.DoubleColumnStatsData): 11
StringColumnStatsData (org.apache.hadoop.hive.metastore.api.StringColumnStatsData): 11
Database (org.apache.hadoop.hive.metastore.api.Database): 7
DecimalColumnStatsData (org.apache.hadoop.hive.metastore.api.DecimalColumnStatsData): 7
LongColumnStatsDataInspector (org.apache.hadoop.hive.metastore.columnstats.cache.LongColumnStatsDataInspector): 5