use of org.apache.hadoop.hive.metastore.api.ColumnStatistics in project hive by apache.
the class TestOldSchema method testPartitionOps.
/**
 * Tests partition operations: creates a table partitioned by {@code ds}, adds ten
 * partitions, writes long-typed column statistics for {@code col1} on each partition,
 * and then verifies the statistics aggregated over all ten partitions.
 *
 * @throws Exception if any store operation or assertion fails
 */
@Test
public void testPartitionOps() throws Exception {
  String dbName = "default";
  String tableName = "snp";
  Database db1 = new Database(dbName, "description", "locationurl", null);
  store.createDatabase(db1);
  long now = System.currentTimeMillis();
  List<FieldSchema> cols = new ArrayList<>();
  cols.add(new FieldSchema("col1", "long", "nocomment"));
  SerDeInfo serde = new SerDeInfo("serde", "seriallib", null);
  StorageDescriptor sd = new StorageDescriptor(cols, "file:/tmp", "input", "output", false, 0, serde, null, null, Collections.emptyMap());
  List<FieldSchema> partCols = new ArrayList<>();
  partCols.add(new FieldSchema("ds", "string", ""));
  Table table = new Table(tableName, dbName, "me", (int) now, (int) now, 0, sd, partCols, Collections.emptyMap(), null, null, null);
  store.createTable(table);
  Deadline.startTimer("getPartition");
  for (int i = 0; i < 10; i++) {
    List<String> partVal = new ArrayList<>();
    partVal.add(String.valueOf(i));
    StorageDescriptor psd = new StorageDescriptor(sd);
    // Use the single partition value, not the list: concatenating the list itself
    // would render as "ds=[0]" rather than "ds=0".
    psd.setLocation("file:/tmp/default/hit/ds=" + partVal.get(0));
    Partition part = new Partition(partVal, dbName, tableName, (int) now, (int) now, psd, Collections.emptyMap());
    store.addPartition(part);

    // Describe which partition the statistics belong to.
    ColumnStatistics cs = new ColumnStatistics();
    ColumnStatisticsDesc desc = new ColumnStatisticsDesc(false, dbName, tableName);
    desc.setLastAnalyzed(now);
    desc.setPartName("ds=" + i);
    cs.setStatsDesc(desc);

    // Per-partition long stats; the values vary with i so that the aggregate is predictable.
    ColumnStatisticsObj obj = new ColumnStatisticsObj();
    obj.setColName("col1");
    obj.setColType("bigint");
    ColumnStatisticsData data = new ColumnStatisticsData();
    LongColumnStatsData dcsd = new LongColumnStatsData();
    dcsd.setHighValue(1000 + i);
    dcsd.setLowValue(-1000 - i);
    dcsd.setNumNulls(i);
    dcsd.setNumDVs(10 * i + 1);
    dcsd.setBitVectors(bitVectors[0]);
    data.setLongStats(dcsd);
    obj.setStatsData(data);
    cs.addToStatsObj(obj);
    store.updatePartitionColumnStatistics(cs, partVal);
  }
  Checker statChecker = new Checker() {
    @Override
    public void checkStats(AggrStats aggrStats) throws Exception {
      Assert.assertEquals(10, aggrStats.getPartsFound());
      Assert.assertEquals(1, aggrStats.getColStatsSize());
      ColumnStatisticsObj cso = aggrStats.getColStats().get(0);
      Assert.assertEquals("col1", cso.getColName());
      Assert.assertEquals("bigint", cso.getColType());
      LongColumnStatsData lcsd = cso.getStatsData().getLongStats();
      // High/low are the extremes written for i = 9; they are integral, so compare exactly.
      Assert.assertEquals(1009, lcsd.getHighValue());
      Assert.assertEquals(-1009, lcsd.getLowValue());
      // 0 + 1 + ... + 9 nulls across the ten partitions.
      Assert.assertEquals(45, lcsd.getNumNulls());
      Assert.assertEquals(91, lcsd.getNumDVs());
    }
  };
  List<String> partNames = new ArrayList<>();
  for (int i = 0; i < 10; i++) {
    partNames.add("ds=" + i);
  }
  AggrStats aggrStats = store.get_aggr_stats_for(dbName, tableName, partNames, Arrays.asList("col1"));
  statChecker.checkStats(aggrStats);
}
use of org.apache.hadoop.hive.metastore.api.ColumnStatistics in project hive by apache.
the class TestCachedStore method testAggrStatsRepeatedRead.
/**
 * Verifies that aggregated column statistics read through the cached store are the
 * same on a repeated read: two partitions each get identical long stats with 50 nulls,
 * and the aggregate null count is asserted to be 100 on two consecutive reads.
 *
 * @throws Exception if any store operation or assertion fails
 */
@Test
public void testAggrStatsRepeatedRead() throws Exception {
  String dbName = "testTableColStatsOps";
  String tblName = "tbl";
  String colName = "f1";
  Database db = new Database(dbName, null, "some_location", null);
  cachedStore.createDatabase(db);
  List<FieldSchema> cols = new ArrayList<>();
  cols.add(new FieldSchema(colName, "int", null));
  List<FieldSchema> partCols = new ArrayList<>();
  partCols.add(new FieldSchema("col", "int", null));
  StorageDescriptor sd = new StorageDescriptor(cols, null, "input", "output", false, 0, new SerDeInfo("serde", "seriallib", new HashMap<>()), null, null, null);
  Table tbl = new Table(tblName, dbName, null, 0, 0, 0, sd, partCols, new HashMap<>(), null, null, TableType.MANAGED_TABLE.toString());
  cachedStore.createTable(tbl);

  // Two partitions, col=1 and col=2.
  List<String> partVals1 = new ArrayList<>();
  partVals1.add("1");
  List<String> partVals2 = new ArrayList<>();
  partVals2.add("2");
  Partition ptn1 = new Partition(partVals1, dbName, tblName, 0, 0, sd, new HashMap<>());
  cachedStore.addPartition(ptn1);
  Partition ptn2 = new Partition(partVals2, dbName, tblName, 0, 0, sd, new HashMap<>());
  cachedStore.addPartition(ptn2);

  // One stats template, written to both partitions.
  ColumnStatistics stats = new ColumnStatistics();
  ColumnStatisticsDesc statsDesc = new ColumnStatisticsDesc(true, dbName, tblName);
  statsDesc.setPartName("col");
  List<ColumnStatisticsObj> colStatObjs = new ArrayList<>();
  ColumnStatisticsData data = new ColumnStatisticsData();
  ColumnStatisticsObj colStats = new ColumnStatisticsObj(colName, "int", data);
  LongColumnStatsDataInspector longStats = new LongColumnStatsDataInspector();
  longStats.setLowValue(0);
  longStats.setHighValue(100);
  longStats.setNumNulls(50);
  longStats.setNumDVs(30);
  data.setLongStats(longStats);
  colStatObjs.add(colStats);
  stats.setStatsDesc(statsDesc);
  stats.setStatsObj(colStatObjs);
  // Each partition receives its own deep copy of the template.
  cachedStore.updatePartitionColumnStatistics(stats.deepCopy(), partVals1);
  cachedStore.updatePartitionColumnStatistics(stats.deepCopy(), partVals2);

  List<String> colNames = new ArrayList<>();
  colNames.add(colName);
  List<String> aggrPartVals = new ArrayList<>();
  aggrPartVals.add("1");
  aggrPartVals.add("2");
  // assertEquals takes (expected, actual); 50 nulls per partition aggregate to 100.
  AggrStats aggrStats = cachedStore.get_aggr_stats_for(dbName, tblName, aggrPartVals, colNames);
  Assert.assertEquals(100, aggrStats.getColStats().get(0).getStatsData().getLongStats().getNumNulls());
  // The second read must report the same aggregate.
  aggrStats = cachedStore.get_aggr_stats_for(dbName, tblName, aggrPartVals, colNames);
  Assert.assertEquals(100, aggrStats.getColStats().get(0).getStatsData().getLongStats().getNumNulls());
}
use of org.apache.hadoop.hive.metastore.api.ColumnStatistics in project hive by apache.
the class TestHiveMetaStore method testColumnStatistics.
/**
 * Exercises the metastore client's column-statistics API end to end: writes, reads and
 * deletes table-level statistics, then recreates the table with several partitions and
 * repeats the write/read/delete cycle for partition-level statistics.
 *
 * @throws Throwable if any client call or assertion fails; exceptions are printed to
 *     stderr before being rethrown
 */
@Test
public void testColumnStatistics() throws Throwable {
  String dbName = "columnstatstestdb";
  String tblName = "tbl";
  String typeName = "Person";
  String tblOwner = "testowner";
  int lastAccessed = 6796;
  try {
    cleanUp(dbName, tblName, typeName);
    Database db = new Database();
    db.setName(dbName);
    client.createDatabase(db);
    createTableForTestFilter(dbName, tblName, tblOwner, lastAccessed, true);
    // Create a ColumnStatistics Obj
    String[] colName = new String[] { "income", "name" };
    double lowValue = 50000.21;
    double highValue = 1200000.4525;
    long numNulls = 3;
    long numDVs = 22;
    double avgColLen = 50.30;
    long maxColLen = 102;
    String[] colType = new String[] { "double", "string" };
    boolean isTblLevel = true;
    String partName = null;
    List<ColumnStatisticsObj> statsObjs = new ArrayList<>();
    ColumnStatisticsDesc statsDesc = new ColumnStatisticsDesc();
    statsDesc.setDbName(dbName);
    statsDesc.setTableName(tblName);
    statsDesc.setIsTblLevel(isTblLevel);
    statsDesc.setPartName(partName);
    // double stats for the "income" column
    ColumnStatisticsObj statsObj = new ColumnStatisticsObj();
    statsObj.setColName(colName[0]);
    statsObj.setColType(colType[0]);
    ColumnStatisticsData statsData = new ColumnStatisticsData();
    DoubleColumnStatsData numericStats = new DoubleColumnStatsData();
    statsData.setDoubleStats(numericStats);
    statsData.getDoubleStats().setHighValue(highValue);
    statsData.getDoubleStats().setLowValue(lowValue);
    statsData.getDoubleStats().setNumDVs(numDVs);
    statsData.getDoubleStats().setNumNulls(numNulls);
    statsObj.setStatsData(statsData);
    statsObjs.add(statsObj);
    // string stats for the "name" column
    statsObj = new ColumnStatisticsObj();
    statsObj.setColName(colName[1]);
    statsObj.setColType(colType[1]);
    statsData = new ColumnStatisticsData();
    StringColumnStatsData stringStats = new StringColumnStatsData();
    statsData.setStringStats(stringStats);
    statsData.getStringStats().setAvgColLen(avgColLen);
    statsData.getStringStats().setMaxColLen(maxColLen);
    statsData.getStringStats().setNumDVs(numDVs);
    statsData.getStringStats().setNumNulls(numNulls);
    statsObj.setStatsData(statsData);
    statsObjs.add(statsObj);
    ColumnStatistics colStats = new ColumnStatistics();
    colStats.setStatsDesc(statsDesc);
    colStats.setStatsObj(statsObjs);
    // write stats objs persistently
    client.updateTableColumnStatistics(colStats);
    // retrieve the stats obj that was just written
    ColumnStatisticsObj colStats2 = client.getTableColumnStatistics(dbName, tblName, Lists.newArrayList(colName[0])).get(0);
    // compare stats obj to ensure what we get is what we wrote
    // (assertEquals takes the expected value first, then the actual one)
    assertNotNull(colStats2);
    assertEquals(colName[0], colStats2.getColName());
    assertEquals(lowValue, colStats2.getStatsData().getDoubleStats().getLowValue(), 0.01);
    assertEquals(highValue, colStats2.getStatsData().getDoubleStats().getHighValue(), 0.01);
    assertEquals(numNulls, colStats2.getStatsData().getDoubleStats().getNumNulls());
    assertEquals(numDVs, colStats2.getStatsData().getDoubleStats().getNumDVs());
    // test delete column stats; if no col name is passed all column stats associated with the
    // table is deleted
    boolean status = client.deleteTableColumnStatistics(dbName, tblName, null);
    assertTrue(status);
    // try to query stats for a column for which stats doesn't exist
    assertTrue(client.getTableColumnStatistics(dbName, tblName, Lists.newArrayList(colName[1])).isEmpty());
    colStats.setStatsDesc(statsDesc);
    colStats.setStatsObj(statsObjs);
    // update table level column stats
    client.updateTableColumnStatistics(colStats);
    // query column stats for column whose stats were updated in the previous call
    // NOTE(review): the fetched object is never inspected; only the lookup itself
    // (including get(0)) is exercised here.
    colStats2 = client.getTableColumnStatistics(dbName, tblName, Lists.newArrayList(colName[0])).get(0);
    // partition level column statistics test
    // create a table with multiple partitions
    cleanUp(dbName, tblName, typeName);
    List<List<String>> values = new ArrayList<>();
    values.add(makeVals("2008-07-01 14:13:12", "14"));
    values.add(makeVals("2008-07-01 14:13:12", "15"));
    values.add(makeVals("2008-07-02 14:13:12", "15"));
    values.add(makeVals("2008-07-03 14:13:12", "151"));
    createMultiPartitionTableSchema(dbName, tblName, typeName, values);
    List<String> partitions = client.listPartitionNames(dbName, tblName, (short) -1);
    partName = partitions.get(0);
    isTblLevel = false;
    // create a new columnstatistics desc to represent partition level column stats
    statsDesc = new ColumnStatisticsDesc();
    statsDesc.setDbName(dbName);
    statsDesc.setTableName(tblName);
    statsDesc.setPartName(partName);
    statsDesc.setIsTblLevel(isTblLevel);
    colStats = new ColumnStatistics();
    colStats.setStatsDesc(statsDesc);
    colStats.setStatsObj(statsObjs);
    client.updatePartitionColumnStatistics(colStats);
    colStats2 = client.getPartitionColumnStatistics(dbName, tblName, Lists.newArrayList(partName), Lists.newArrayList(colName[1])).get(partName).get(0);
    // compare stats obj to ensure what we get is what we wrote
    assertNotNull(colStats2);
    assertEquals(partName, colStats.getStatsDesc().getPartName());
    assertEquals(colName[1], colStats2.getColName());
    assertEquals(maxColLen, colStats2.getStatsData().getStringStats().getMaxColLen());
    assertEquals(avgColLen, colStats2.getStatsData().getStringStats().getAvgColLen(), 0.01);
    assertEquals(numNulls, colStats2.getStatsData().getStringStats().getNumNulls());
    assertEquals(numDVs, colStats2.getStatsData().getStringStats().getNumDVs());
    // test stats deletion at partition level
    client.deletePartitionColumnStatistics(dbName, tblName, partName, colName[1]);
    // NOTE(review): as above, the result is not inspected; the chained get(...) calls
    // only confirm that stats for the other column can still be fetched.
    colStats2 = client.getPartitionColumnStatistics(dbName, tblName, Lists.newArrayList(partName), Lists.newArrayList(colName[0])).get(partName).get(0);
    // test get stats on a column for which stats doesn't exist
    assertTrue(client.getPartitionColumnStatistics(dbName, tblName, Lists.newArrayList(partName), Lists.newArrayList(colName[1])).isEmpty());
  } catch (Exception e) {
    System.err.println(StringUtils.stringifyException(e));
    System.err.println("testColumnStatistics() failed.");
    throw e;
  } finally {
    cleanUp(dbName, tblName, typeName);
  }
}
use of org.apache.hadoop.hive.metastore.api.ColumnStatistics in project hive by apache.
the class MetaStoreDirectSql method makeColumnStats.
/**
 * Builds a ColumnStatistics object from raw result rows.
 *
 * Each row is converted into a ColumnStatisticsObj via prepareCSObj. As a side effect the
 * supplied descriptor's lastAnalyzed is lowered to the smallest non-null value found at
 * index {@code offset + 15} of the rows (or set, if it was not set before).
 *
 * @param list the result rows, one per column statistics entry
 * @param csd the descriptor to attach to the result; its lastAnalyzed may be updated
 * @param offset index offset applied when reading fields from each row
 * @return the statistics holding the descriptor and one object per row
 * @throws MetaException propagated from the row conversion helpers or the deadline check
 */
private ColumnStatistics makeColumnStats(List<Object[]> list, ColumnStatisticsDesc csd, int offset) throws MetaException {
  List<ColumnStatisticsObj> statsObjs = new ArrayList<>(list.size());
  for (Object[] row : list) {
    // LastAnalyzed is stored per column but thrift has it per several;
    // keep the lowest for now as nobody actually uses this field.
    Object lastAnalyzedObj = row[offset + 15];
    if (lastAnalyzedObj != null) {
      long lastAnalyzed = extractSqlLong(lastAnalyzedObj);
      if (!csd.isSetLastAnalyzed() || csd.getLastAnalyzed() > lastAnalyzed) {
        csd.setLastAnalyzed(lastAnalyzed);
      }
    }
    statsObjs.add(prepareCSObj(row, offset));
    Deadline.checkTimeout();
  }
  ColumnStatistics result = new ColumnStatistics();
  result.setStatsDesc(csd);
  result.setStatsObj(statsObjs);
  return result;
}
use of org.apache.hadoop.hive.metastore.api.ColumnStatistics in project hive by apache.
the class MetaStoreDirectSql method getTableStats.
/**
 * Retrieve the column statistics for the specified columns of the table. NULL
 * is returned if the columns are not provided or if no statistics rows are found.
 * @param dbName the database name of the table
 * @param tableName the table name
 * @param colNames the list of the column names
 * @param enableBitVector passed to getStatsList when building the select list
 * @return the column statistics for the specified columns, or null
 * @throws MetaException
 */
public ColumnStatistics getTableStats(final String dbName, final String tableName, List<String> colNames, boolean enableBitVector) throws MetaException {
  if (colNames == null || colNames.isEmpty()) {
    return null;
  }
  final boolean doTrace = LOG.isDebugEnabled();
  final String queryText0 = "select " + getStatsList(enableBitVector) + " from " + TAB_COL_STATS + " " + " where \"DB_NAME\" = ? and \"TABLE_NAME\" = ? and \"COLUMN_NAME\" in (";
  Batchable<String, Object[]> b = new Batchable<String, Object[]>() {
    @Override
    public List<Object[]> run(List<String> input) throws MetaException {
      String queryText = queryText0 + makeParams(input.size()) + ")";
      // Parameters: db name, table name, then one entry per column name in this batch.
      Object[] params = new Object[input.size() + 2];
      params[0] = dbName;
      params[1] = tableName;
      for (int i = 0; i < input.size(); ++i) {
        params[i + 2] = input.get(i);
      }
      long start = doTrace ? System.nanoTime() : 0;
      Query query = pm.newQuery("javax.jdo.query.SQL", queryText);
      Object qResult = executeWithArray(query, params, queryText);
      timingTrace(doTrace, queryText0 + "...)", start, (doTrace ? System.nanoTime() : 0));
      if (qResult == null) {
        query.closeAll();
        return null;
      }
      // Keep the query registered so it can be closed once the rows have been consumed.
      addQueryAfterUse(query);
      return ensureList(qResult);
    }
  };
  try {
    List<Object[]> list = runBatched(colNames, b);
    // run() above may return null; guard in case runBatched hands that through unchanged.
    if (list == null || list.isEmpty()) {
      return null;
    }
    ColumnStatisticsDesc csd = new ColumnStatisticsDesc(true, dbName, tableName);
    // The rows are fully converted here, before the finally block closes the queries.
    return makeColumnStats(list, csd, 0);
  } finally {
    // Close the registered queries on every exit path, including the empty-result
    // return and any exception thrown while building the statistics.
    b.closeAllQueries();
  }
}
Aggregations