Search in sources :

Example 36 with ColumnStatistics

use of org.apache.hadoop.hive.metastore.api.ColumnStatistics in project hive by apache.

the class TestOldSchema method testPartitionOps.

/**
 * Tests partition-level column statistics: creates a table partitioned by {@code ds},
 * adds ten partitions with long column statistics for {@code col1}, and then checks
 * the statistics aggregated over all ten partitions.
 */
@Test
public void testPartitionOps() throws Exception {
    String dbName = "default";
    String tableName = "snp";
    Database db1 = new Database(dbName, "description", "locationurl", null);
    store.createDatabase(db1);
    // NOTE(review): (int) now truncates the millisecond timestamp; the resulting
    // create/access times are never asserted on in this test.
    long now = System.currentTimeMillis();
    List<FieldSchema> cols = new ArrayList<>();
    cols.add(new FieldSchema("col1", "long", "nocomment"));
    SerDeInfo serde = new SerDeInfo("serde", "seriallib", null);
    StorageDescriptor sd = new StorageDescriptor(cols, "file:/tmp", "input", "output", false, 0, serde, null, null, Collections.emptyMap());
    List<FieldSchema> partCols = new ArrayList<>();
    partCols.add(new FieldSchema("ds", "string", ""));
    Table table = new Table(tableName, dbName, "me", (int) now, (int) now, 0, sd, partCols, Collections.emptyMap(), null, null, null);
    store.createTable(table);
    Deadline.startTimer("getPartition");
    for (int i = 0; i < 10; i++) {
        List<String> partVal = new ArrayList<>();
        partVal.add(String.valueOf(i));
        StorageDescriptor psd = new StorageDescriptor(sd);
        // Append the partition value itself; concatenating the list would render
        // its toString() form ("[0]") into the location.
        psd.setLocation("file:/tmp/default/hit/ds=" + partVal.get(0));
        Partition part = new Partition(partVal, dbName, tableName, (int) now, (int) now, psd, Collections.emptyMap());
        store.addPartition(part);
        // Build long column statistics for col1 of partition ds=i.
        ColumnStatistics cs = new ColumnStatistics();
        ColumnStatisticsDesc desc = new ColumnStatisticsDesc(false, dbName, tableName);
        desc.setLastAnalyzed(now);
        desc.setPartName("ds=" + String.valueOf(i));
        cs.setStatsDesc(desc);
        ColumnStatisticsObj obj = new ColumnStatisticsObj();
        obj.setColName("col1");
        obj.setColType("bigint");
        ColumnStatisticsData data = new ColumnStatisticsData();
        LongColumnStatsData dcsd = new LongColumnStatsData();
        dcsd.setHighValue(1000 + i);
        dcsd.setLowValue(-1000 - i);
        dcsd.setNumNulls(i);
        dcsd.setNumDVs(10 * i + 1);
        dcsd.setBitVectors(bitVectors[0]);
        data.setLongStats(dcsd);
        obj.setStatsData(data);
        cs.addToStatsObj(obj);
        store.updatePartitionColumnStatistics(cs, partVal);
    }
    Checker statChecker = new Checker() {

        @Override
        public void checkStats(AggrStats aggrStats) throws Exception {
            Assert.assertEquals(10, aggrStats.getPartsFound());
            Assert.assertEquals(1, aggrStats.getColStatsSize());
            ColumnStatisticsObj cso = aggrStats.getColStats().get(0);
            Assert.assertEquals("col1", cso.getColName());
            Assert.assertEquals("bigint", cso.getColType());
            LongColumnStatsData lcsd = cso.getStatsData().getLongStats();
            // Largest high value (1000 + 9) and smallest low value (-1000 - 9) written above.
            Assert.assertEquals(1009, lcsd.getHighValue());
            Assert.assertEquals(-1009, lcsd.getLowValue());
            // 0 + 1 + ... + 9 nulls across the ten partitions.
            Assert.assertEquals(45, lcsd.getNumNulls());
            // 91 == 10 * 9 + 1, the largest per-partition NDV written above.
            Assert.assertEquals(91, lcsd.getNumDVs());
        }
    };
    List<String> partNames = new ArrayList<>();
    for (int i = 0; i < 10; i++) {
        partNames.add("ds=" + i);
    }
    AggrStats aggrStats = store.get_aggr_stats_for(dbName, tableName, partNames, Arrays.asList("col1"));
    statChecker.checkStats(aggrStats);
}
Also used : ColumnStatistics(org.apache.hadoop.hive.metastore.api.ColumnStatistics) Partition(org.apache.hadoop.hive.metastore.api.Partition) Table(org.apache.hadoop.hive.metastore.api.Table) AggrStats(org.apache.hadoop.hive.metastore.api.AggrStats) FieldSchema(org.apache.hadoop.hive.metastore.api.FieldSchema) SerDeInfo(org.apache.hadoop.hive.metastore.api.SerDeInfo) ArrayList(java.util.ArrayList) StorageDescriptor(org.apache.hadoop.hive.metastore.api.StorageDescriptor) LongColumnStatsData(org.apache.hadoop.hive.metastore.api.LongColumnStatsData) ColumnStatisticsObj(org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj) ColumnStatisticsDesc(org.apache.hadoop.hive.metastore.api.ColumnStatisticsDesc) Database(org.apache.hadoop.hive.metastore.api.Database) ColumnStatisticsData(org.apache.hadoop.hive.metastore.api.ColumnStatisticsData) MetastoreUnitTest(org.apache.hadoop.hive.metastore.annotation.MetastoreUnitTest) Test(org.junit.Test)

Example 37 with ColumnStatistics

use of org.apache.hadoop.hive.metastore.api.ColumnStatistics in project hive by apache.

the class TestCachedStore method testAggrStatsRepeatedRead.

/**
 * Writes identical long column statistics for two partitions and checks that
 * reading the aggregated statistics twice in a row returns the same null count
 * both times.
 */
@Test
public void testAggrStatsRepeatedRead() throws Exception {
    String dbName = "testTableColStatsOps";
    String tblName = "tbl";
    String colName = "f1";
    Database db = new Database(dbName, null, "some_location", null);
    cachedStore.createDatabase(db);
    List<FieldSchema> cols = new ArrayList<>();
    cols.add(new FieldSchema(colName, "int", null));
    List<FieldSchema> partCols = new ArrayList<>();
    partCols.add(new FieldSchema("col", "int", null));
    StorageDescriptor sd = new StorageDescriptor(cols, null, "input", "output", false, 0, new SerDeInfo("serde", "seriallib", new HashMap<>()), null, null, null);
    Table tbl = new Table(tblName, dbName, null, 0, 0, 0, sd, partCols, new HashMap<>(), null, null, TableType.MANAGED_TABLE.toString());
    cachedStore.createTable(tbl);
    List<String> partVals1 = new ArrayList<>();
    partVals1.add("1");
    List<String> partVals2 = new ArrayList<>();
    partVals2.add("2");
    Partition ptn1 = new Partition(partVals1, dbName, tblName, 0, 0, sd, new HashMap<>());
    cachedStore.addPartition(ptn1);
    Partition ptn2 = new Partition(partVals2, dbName, tblName, 0, 0, sd, new HashMap<>());
    cachedStore.addPartition(ptn2);
    // One statistics object, deep-copied below so each partition gets its own instance.
    ColumnStatistics stats = new ColumnStatistics();
    ColumnStatisticsDesc statsDesc = new ColumnStatisticsDesc(true, dbName, tblName);
    statsDesc.setPartName("col");
    List<ColumnStatisticsObj> colStatObjs = new ArrayList<>();
    ColumnStatisticsData data = new ColumnStatisticsData();
    ColumnStatisticsObj colStats = new ColumnStatisticsObj(colName, "int", data);
    LongColumnStatsDataInspector longStats = new LongColumnStatsDataInspector();
    longStats.setLowValue(0);
    longStats.setHighValue(100);
    longStats.setNumNulls(50);
    longStats.setNumDVs(30);
    data.setLongStats(longStats);
    colStatObjs.add(colStats);
    stats.setStatsDesc(statsDesc);
    stats.setStatsObj(colStatObjs);
    cachedStore.updatePartitionColumnStatistics(stats.deepCopy(), partVals1);
    cachedStore.updatePartitionColumnStatistics(stats.deepCopy(), partVals2);
    List<String> colNames = new ArrayList<>();
    colNames.add(colName);
    List<String> aggrPartVals = new ArrayList<>();
    aggrPartVals.add("1");
    aggrPartVals.add("2");
    // Each partition contributes 50 nulls, so the aggregate is expected to be 100.
    AggrStats aggrStats = cachedStore.get_aggr_stats_for(dbName, tblName, aggrPartVals, colNames);
    Assert.assertEquals(100, aggrStats.getColStats().get(0).getStatsData().getLongStats().getNumNulls());
    // A second read must return the same aggregate.
    aggrStats = cachedStore.get_aggr_stats_for(dbName, tblName, aggrPartVals, colNames);
    Assert.assertEquals(100, aggrStats.getColStats().get(0).getStatsData().getLongStats().getNumNulls());
}
Also used : ColumnStatistics(org.apache.hadoop.hive.metastore.api.ColumnStatistics) Partition(org.apache.hadoop.hive.metastore.api.Partition) Table(org.apache.hadoop.hive.metastore.api.Table) HashMap(java.util.HashMap) AggrStats(org.apache.hadoop.hive.metastore.api.AggrStats) FieldSchema(org.apache.hadoop.hive.metastore.api.FieldSchema) SerDeInfo(org.apache.hadoop.hive.metastore.api.SerDeInfo) ArrayList(java.util.ArrayList) StorageDescriptor(org.apache.hadoop.hive.metastore.api.StorageDescriptor) ColumnStatisticsObj(org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj) ColumnStatisticsDesc(org.apache.hadoop.hive.metastore.api.ColumnStatisticsDesc) Database(org.apache.hadoop.hive.metastore.api.Database) LongColumnStatsDataInspector(org.apache.hadoop.hive.metastore.columnstats.cache.LongColumnStatsDataInspector) ColumnStatisticsData(org.apache.hadoop.hive.metastore.api.ColumnStatisticsData) Test(org.junit.Test) MetastoreCheckinTest(org.apache.hadoop.hive.metastore.annotation.MetastoreCheckinTest)

Example 38 with ColumnStatistics

use of org.apache.hadoop.hive.metastore.api.ColumnStatistics in project hive by apache.

the class TestHiveMetaStore method testColumnStatistics.

/**
 * Round-trips column statistics through the metastore client: writes table-level
 * statistics for a double and a string column, reads them back, deletes them, and
 * then repeats the write/read/delete cycle at partition level.
 */
@Test
public void testColumnStatistics() throws Throwable {
    String dbName = "columnstatstestdb";
    String tblName = "tbl";
    String typeName = "Person";
    String tblOwner = "testowner";
    int lastAccessed = 6796;
    try {
        cleanUp(dbName, tblName, typeName);
        Database db = new Database();
        db.setName(dbName);
        client.createDatabase(db);
        createTableForTestFilter(dbName, tblName, tblOwner, lastAccessed, true);
        // Create a ColumnStatistics Obj
        String[] colName = new String[] { "income", "name" };
        double lowValue = 50000.21;
        double highValue = 1200000.4525;
        long numNulls = 3;
        long numDVs = 22;
        double avgColLen = 50.30;
        long maxColLen = 102;
        String[] colType = new String[] { "double", "string" };
        boolean isTblLevel = true;
        String partName = null;
        List<ColumnStatisticsObj> statsObjs = new ArrayList<>();
        ColumnStatisticsDesc statsDesc = new ColumnStatisticsDesc();
        statsDesc.setDbName(dbName);
        statsDesc.setTableName(tblName);
        statsDesc.setIsTblLevel(isTblLevel);
        statsDesc.setPartName(partName);
        // Double statistics for the "income" column.
        ColumnStatisticsObj statsObj = new ColumnStatisticsObj();
        statsObj.setColName(colName[0]);
        statsObj.setColType(colType[0]);
        ColumnStatisticsData statsData = new ColumnStatisticsData();
        DoubleColumnStatsData numericStats = new DoubleColumnStatsData();
        statsData.setDoubleStats(numericStats);
        statsData.getDoubleStats().setHighValue(highValue);
        statsData.getDoubleStats().setLowValue(lowValue);
        statsData.getDoubleStats().setNumDVs(numDVs);
        statsData.getDoubleStats().setNumNulls(numNulls);
        statsObj.setStatsData(statsData);
        statsObjs.add(statsObj);
        // String statistics for the "name" column.
        statsObj = new ColumnStatisticsObj();
        statsObj.setColName(colName[1]);
        statsObj.setColType(colType[1]);
        statsData = new ColumnStatisticsData();
        StringColumnStatsData stringStats = new StringColumnStatsData();
        statsData.setStringStats(stringStats);
        statsData.getStringStats().setAvgColLen(avgColLen);
        statsData.getStringStats().setMaxColLen(maxColLen);
        statsData.getStringStats().setNumDVs(numDVs);
        statsData.getStringStats().setNumNulls(numNulls);
        statsObj.setStatsData(statsData);
        statsObjs.add(statsObj);
        ColumnStatistics colStats = new ColumnStatistics();
        colStats.setStatsDesc(statsDesc);
        colStats.setStatsObj(statsObjs);
        // write stats objs persistently
        client.updateTableColumnStatistics(colStats);
        // retrieve the stats obj that was just written
        ColumnStatisticsObj colStats2 = client.getTableColumnStatistics(dbName, tblName, Lists.newArrayList(colName[0])).get(0);
        // compare stats obj to ensure what we get is what we wrote
        assertNotNull(colStats2);
        assertEquals(colName[0], colStats2.getColName());
        assertEquals(lowValue, colStats2.getStatsData().getDoubleStats().getLowValue(), 0.01);
        assertEquals(highValue, colStats2.getStatsData().getDoubleStats().getHighValue(), 0.01);
        assertEquals(numNulls, colStats2.getStatsData().getDoubleStats().getNumNulls());
        assertEquals(numDVs, colStats2.getStatsData().getDoubleStats().getNumDVs());
        // test delete column stats; if no col name is passed all column stats associated with the
        // table is deleted
        boolean status = client.deleteTableColumnStatistics(dbName, tblName, null);
        assertTrue(status);
        // try to query stats for a column for which stats doesn't exist
        assertTrue(client.getTableColumnStatistics(dbName, tblName, Lists.newArrayList(colName[1])).isEmpty());
        colStats.setStatsDesc(statsDesc);
        colStats.setStatsObj(statsObjs);
        // update table level column stats
        client.updateTableColumnStatistics(colStats);
        // query column stats for column whose stats were updated in the previous call;
        // the result is not asserted on, but get(0) fails if nothing was returned
        colStats2 = client.getTableColumnStatistics(dbName, tblName, Lists.newArrayList(colName[0])).get(0);
        // partition level column statistics test
        // create a table with multiple partitions
        cleanUp(dbName, tblName, typeName);
        List<List<String>> values = new ArrayList<>();
        values.add(makeVals("2008-07-01 14:13:12", "14"));
        values.add(makeVals("2008-07-01 14:13:12", "15"));
        values.add(makeVals("2008-07-02 14:13:12", "15"));
        values.add(makeVals("2008-07-03 14:13:12", "151"));
        createMultiPartitionTableSchema(dbName, tblName, typeName, values);
        List<String> partitions = client.listPartitionNames(dbName, tblName, (short) -1);
        partName = partitions.get(0);
        isTblLevel = false;
        // create a new columnstatistics desc to represent partition level column stats
        statsDesc = new ColumnStatisticsDesc();
        statsDesc.setDbName(dbName);
        statsDesc.setTableName(tblName);
        statsDesc.setPartName(partName);
        statsDesc.setIsTblLevel(isTblLevel);
        colStats = new ColumnStatistics();
        colStats.setStatsDesc(statsDesc);
        colStats.setStatsObj(statsObjs);
        client.updatePartitionColumnStatistics(colStats);
        colStats2 = client.getPartitionColumnStatistics(dbName, tblName, Lists.newArrayList(partName), Lists.newArrayList(colName[1])).get(partName).get(0);
        // compare stats obj to ensure what we get is what we wrote
        assertNotNull(colStats2);
        assertEquals(partName, colStats.getStatsDesc().getPartName());
        assertEquals(colName[1], colStats2.getColName());
        assertEquals(maxColLen, colStats2.getStatsData().getStringStats().getMaxColLen());
        assertEquals(avgColLen, colStats2.getStatsData().getStringStats().getAvgColLen(), 0.01);
        assertEquals(numNulls, colStats2.getStatsData().getStringStats().getNumNulls());
        assertEquals(numDVs, colStats2.getStatsData().getStringStats().getNumDVs());
        // test stats deletion at partition level
        client.deletePartitionColumnStatistics(dbName, tblName, partName, colName[1]);
        // stats for the other column are fetched again; the result is not asserted on,
        // but the chained get calls fail if the entry is missing
        colStats2 = client.getPartitionColumnStatistics(dbName, tblName, Lists.newArrayList(partName), Lists.newArrayList(colName[0])).get(partName).get(0);
        // test get stats on a column for which stats doesn't exist
        assertTrue(client.getPartitionColumnStatistics(dbName, tblName, Lists.newArrayList(partName), Lists.newArrayList(colName[1])).isEmpty());
    } catch (Exception e) {
        System.err.println(StringUtils.stringifyException(e));
        System.err.println("testColumnStatistics() failed.");
        throw e;
    } finally {
        cleanUp(dbName, tblName, typeName);
    }
}
Also used : ColumnStatistics(org.apache.hadoop.hive.metastore.api.ColumnStatistics) ArrayList(java.util.ArrayList) StringColumnStatsData(org.apache.hadoop.hive.metastore.api.StringColumnStatsData) MetaException(org.apache.hadoop.hive.metastore.api.MetaException) InvalidOperationException(org.apache.hadoop.hive.metastore.api.InvalidOperationException) ConfigValSecurityException(org.apache.hadoop.hive.metastore.api.ConfigValSecurityException) SQLException(java.sql.SQLException) UnknownDBException(org.apache.hadoop.hive.metastore.api.UnknownDBException) TException(org.apache.thrift.TException) IOException(java.io.IOException) InvalidObjectException(org.apache.hadoop.hive.metastore.api.InvalidObjectException) NoSuchObjectException(org.apache.hadoop.hive.metastore.api.NoSuchObjectException) ColumnStatisticsObj(org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj) DoubleColumnStatsData(org.apache.hadoop.hive.metastore.api.DoubleColumnStatsData) ColumnStatisticsDesc(org.apache.hadoop.hive.metastore.api.ColumnStatisticsDesc) Database(org.apache.hadoop.hive.metastore.api.Database) List(java.util.List) ArrayList(java.util.ArrayList) ColumnStatisticsData(org.apache.hadoop.hive.metastore.api.ColumnStatisticsData) Test(org.junit.Test)

Example 39 with ColumnStatistics

use of org.apache.hadoop.hive.metastore.api.ColumnStatistics in project hive by apache.

the class MetaStoreDirectSql method makeColumnStats.

/**
 * Assembles a {@link ColumnStatistics} from raw query rows.
 *
 * <p>Each row is converted to a {@link ColumnStatisticsObj} via {@code prepareCSObj}.
 * As a side effect, {@code csd}'s last-analyzed value is lowered to the smallest
 * non-null value found at index {@code offset + 15} of the rows.
 *
 * @param list   the raw result rows
 * @param csd    the descriptor to attach to the result; its last-analyzed field may be updated
 * @param offset index offset applied when reading fields from each row
 * @return the statistics holding {@code csd} and one stats object per row
 * @throws MetaException declared by this method; SOURCE does not show which call raises it
 */
private ColumnStatistics makeColumnStats(List<Object[]> list, ColumnStatisticsDesc csd, int offset) throws MetaException {
    ColumnStatistics stats = new ColumnStatistics();
    stats.setStatsDesc(csd);
    List<ColumnStatisticsObj> statsObjs = new ArrayList<>(list.size());
    for (Object[] fields : list) {
        // The store keeps last-analyzed per column while thrift carries a single
        // value for the whole set; keep the minimum, since nothing relies on it yet.
        Object rawLastAnalyzed = fields[offset + 15];
        if (rawLastAnalyzed != null) {
            long rowLastAnalyzed = extractSqlLong(rawLastAnalyzed);
            if (!csd.isSetLastAnalyzed() || rowLastAnalyzed < csd.getLastAnalyzed()) {
                csd.setLastAnalyzed(rowLastAnalyzed);
            }
        }
        statsObjs.add(prepareCSObj(fields, offset));
        Deadline.checkTimeout();
    }
    stats.setStatsObj(statsObjs);
    return stats;
}
Also used : ColumnStatistics(org.apache.hadoop.hive.metastore.api.ColumnStatistics) MPartitionColumnStatistics(org.apache.hadoop.hive.metastore.model.MPartitionColumnStatistics) MTableColumnStatistics(org.apache.hadoop.hive.metastore.model.MTableColumnStatistics) ColumnStatisticsObj(org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj) ArrayList(java.util.ArrayList)

Example 40 with ColumnStatistics

use of org.apache.hadoop.hive.metastore.api.ColumnStatistics in project hive by apache.

the class MetaStoreDirectSql method getTableStats.

/**
 * Retrieve the column statistics for the specified columns of the table. NULL
 * is returned if the columns are not provided or if the query yields no rows.
 * @param dbName      the database name of the table
 * @param tableName   the table name
 * @param colNames    the list of the column names
 * @param enableBitVector forwarded to getStatsList when building the select list
 *                    (presumably controls whether bit vectors are fetched - confirm)
 * @return            the column statistics for the specified columns, or null
 * @throws MetaException
 */
public ColumnStatistics getTableStats(final String dbName, final String tableName, List<String> colNames, boolean enableBitVector) throws MetaException {
    if (colNames == null || colNames.isEmpty()) {
        return null;
    }
    final boolean doTrace = LOG.isDebugEnabled();
    final String queryText0 = "select " + getStatsList(enableBitVector) + " from " + TAB_COL_STATS + " " + " where \"DB_NAME\" = ? and \"TABLE_NAME\" = ? and \"COLUMN_NAME\" in (";
    Batchable<String, Object[]> b = new Batchable<String, Object[]>() {

        @Override
        public List<Object[]> run(List<String> input) throws MetaException {
            String queryText = queryText0 + makeParams(input.size()) + ")";
            // The first two parameters bind DB_NAME and TABLE_NAME; the rest bind the IN list.
            Object[] params = new Object[input.size() + 2];
            params[0] = dbName;
            params[1] = tableName;
            for (int i = 0; i < input.size(); ++i) {
                params[i + 2] = input.get(i);
            }
            long start = doTrace ? System.nanoTime() : 0;
            Query query = pm.newQuery("javax.jdo.query.SQL", queryText);
            Object qResult = executeWithArray(query, params, queryText);
            timingTrace(doTrace, queryText0 + "...)", start, (doTrace ? System.nanoTime() : 0));
            if (qResult == null) {
                query.closeAll();
                return null;
            }
            // Register the query so it is closed after its results have been consumed.
            addQueryAfterUse(query);
            return ensureList(qResult);
        }
    };
    try {
        List<Object[]> list = runBatched(colNames, b);
        // run() above may return null; guard in case runBatched passes that through.
        if (list == null || list.isEmpty()) {
            return null;
        }
        ColumnStatisticsDesc csd = new ColumnStatisticsDesc(true, dbName, tableName);
        return makeColumnStats(list, csd, 0);
    } finally {
        // Always release the registered queries, including on the empty-result
        // and exception paths, which previously skipped this call.
        b.closeAllQueries();
    }
}
Also used : ColumnStatistics(org.apache.hadoop.hive.metastore.api.ColumnStatistics) MPartitionColumnStatistics(org.apache.hadoop.hive.metastore.model.MPartitionColumnStatistics) MTableColumnStatistics(org.apache.hadoop.hive.metastore.model.MTableColumnStatistics) Query(javax.jdo.Query) MConstraint(org.apache.hadoop.hive.metastore.model.MConstraint) SQLCheckConstraint(org.apache.hadoop.hive.metastore.api.SQLCheckConstraint) SQLNotNullConstraint(org.apache.hadoop.hive.metastore.api.SQLNotNullConstraint) SQLUniqueConstraint(org.apache.hadoop.hive.metastore.api.SQLUniqueConstraint) SQLDefaultConstraint(org.apache.hadoop.hive.metastore.api.SQLDefaultConstraint) ColumnStatisticsDesc(org.apache.hadoop.hive.metastore.api.ColumnStatisticsDesc) List(java.util.List) ArrayList(java.util.ArrayList) LinkedList(java.util.LinkedList)

Aggregations

ColumnStatistics (org.apache.hadoop.hive.metastore.api.ColumnStatistics)90 ColumnStatisticsObj (org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj)75 ColumnStatisticsDesc (org.apache.hadoop.hive.metastore.api.ColumnStatisticsDesc)67 ColumnStatisticsData (org.apache.hadoop.hive.metastore.api.ColumnStatisticsData)62 ArrayList (java.util.ArrayList)61 Test (org.junit.Test)53 FieldSchema (org.apache.hadoop.hive.metastore.api.FieldSchema)40 Table (org.apache.hadoop.hive.metastore.api.Table)38 Partition (org.apache.hadoop.hive.metastore.api.Partition)33 StorageDescriptor (org.apache.hadoop.hive.metastore.api.StorageDescriptor)32 SerDeInfo (org.apache.hadoop.hive.metastore.api.SerDeInfo)31 AggrStats (org.apache.hadoop.hive.metastore.api.AggrStats)29 List (java.util.List)26 LongColumnStatsData (org.apache.hadoop.hive.metastore.api.LongColumnStatsData)19 MetaException (org.apache.hadoop.hive.metastore.api.MetaException)14 BooleanColumnStatsData (org.apache.hadoop.hive.metastore.api.BooleanColumnStatsData)13 DoubleColumnStatsData (org.apache.hadoop.hive.metastore.api.DoubleColumnStatsData)12 StringColumnStatsData (org.apache.hadoop.hive.metastore.api.StringColumnStatsData)12 HashMap (java.util.HashMap)11 Database (org.apache.hadoop.hive.metastore.api.Database)9