use of org.apache.hadoop.hive.metastore.api.ColumnStatisticsData._Fields in project hive by apache.
the class StringColumnStatsAggregator method aggregate.
/**
 * Aggregates the string column statistics of several partitions into a single
 * {@link ColumnStatisticsObj} for {@code colName}.
 *
 * <p>Max column length and average column length are combined with
 * {@code Math.max}; null counts are summed. The number of distinct values is
 * estimated from merged bit vectors only when {@code partNames} and {@code css}
 * have the same size and every stats object carries a non-empty bit vector
 * (and {@code numBitVectors} is positive); otherwise the maximum NDV across the
 * supplied stats is used.
 *
 * @param colName   name of the column the resulting stats object describes
 * @param partNames names of the partitions that were requested
 * @param css       per-partition statistics, each holding exactly one stats object
 * @return the aggregated statistics object
 * @throws MetaException if {@code css} is empty or any entry does not contain
 *                       exactly one stats object
 */
@Override
public ColumnStatisticsObj aggregate(String colName, List<String> partNames, List<ColumnStatistics> css) throws MetaException {
  boolean doAllPartitionContainStats = partNames.size() == css.size();
  if (css.isEmpty()) {
    // Without at least one entry there is neither a column type nor any data
    // to aggregate; fail with the declared exception instead of a later NPE.
    throw new MetaException("Cannot aggregate string column statistics for column " + colName + ": no partition statistics were supplied");
  }

  ColumnStatisticsObj statsObj = null;
  boolean isNDVBitVectorSet = true;
  for (ColumnStatistics cs : css) {
    // Validate every entry (no early exit): the aggregation loop below calls
    // getStatsObjIterator().next() on all of them.
    if (cs.getStatsObjSize() != 1) {
      throw new MetaException("The number of columns should be exactly one in aggrStats, but found " + cs.getStatsObjSize());
    }
    ColumnStatisticsObj cso = cs.getStatsObjIterator().next();
    if (statsObj == null) {
      statsObj = ColumnStatsAggregatorFactory.newColumnStaticsObj(colName, cso.getColType(), cso.getStatsData().getSetField());
    }
    // Once one entry lacks usable bit vectors the answer is settled, so the
    // remaining entries are no longer inspected for bit vectors.
    if (isNDVBitVectorSet
        && (numBitVectors <= 0
            || !cso.getStatsData().getStringStats().isSetBitVectors()
            || cso.getStatsData().getStringStats().getBitVectors().length() == 0)) {
      isNDVBitVectorSet = false;
    }
  }

  // A null estimator means "fall back to the maximum NDV".
  NumDistinctValueEstimator ndvEstimator = null;
  if (doAllPartitionContainStats && isNDVBitVectorSet) {
    ndvEstimator = new NumDistinctValueEstimator(numBitVectors);
  }

  StringColumnStatsData aggregateData = null;
  for (ColumnStatistics cs : css) {
    StringColumnStatsData newData = cs.getStatsObjIterator().next().getStatsData().getStringStats();
    if (ndvEstimator != null) {
      ndvEstimator.mergeEstimators(new NumDistinctValueEstimator(newData.getBitVectors(), ndvEstimator.getnumBitVectors()));
    }
    if (aggregateData == null) {
      aggregateData = newData.deepCopy();
    } else {
      aggregateData.setMaxColLen(Math.max(aggregateData.getMaxColLen(), newData.getMaxColLen()));
      aggregateData.setAvgColLen(Math.max(aggregateData.getAvgColLen(), newData.getAvgColLen()));
      aggregateData.setNumNulls(aggregateData.getNumNulls() + newData.getNumNulls());
      if (ndvEstimator == null) {
        aggregateData.setNumDVs(Math.max(aggregateData.getNumDVs(), newData.getNumDVs()));
      }
    }
  }
  if (ndvEstimator != null) {
    // Replace the copied NDV with the estimate from the merged bit vectors.
    aggregateData.setNumDVs(ndvEstimator.estimateNumDistinctValues());
  }

  ColumnStatisticsData columnStatisticsData = new ColumnStatisticsData();
  columnStatisticsData.setStringStats(aggregateData);
  statsObj.setStatsData(columnStatisticsData);
  return statsObj;
}
use of org.apache.hadoop.hive.metastore.api.ColumnStatisticsData._Fields in project hive by apache.
the class ColumnStatsMergerFactory method getColumnStatsMerger.
/**
 * Selects the {@link ColumnStatsMerger} matching the statistics type carried by
 * the two stats objects.
 *
 * <p>For long, double, string and decimal statistics the bit-vector counts of
 * both objects are compared; when they are equal and positive the returned
 * merger is given a {@link NumDistinctValueEstimator} of that size. Boolean and
 * binary statistics never get an estimator.
 *
 * @param statsObjNew the newly computed statistics
 * @param statsObjOld the previously stored statistics
 * @return a merger for the common statistics type
 * @throws IllegalArgumentException if the statistics type is unset or differs
 *                                  between the two objects
 * @throws RuntimeException         if the statistics type is not one of the
 *                                  handled types
 */
public static ColumnStatsMerger getColumnStatsMerger(ColumnStatisticsObj statsObjNew, ColumnStatisticsObj statsObjOld) {
  _Fields typeNew = statsObjNew.getStatsData().getSetField();
  _Fields typeOld = statsObjOld.getStatsData().getSetField();
  // Both sides must carry the same, set, type. Switching on a null enum would
  // only produce an uninformative NullPointerException.
  if (typeNew == null || typeNew != typeOld) {
    throw new IllegalArgumentException("The column stats types must be set and must match: [" + typeNew + "::" + typeOld + "]");
  }

  ColumnStatsMerger agg;
  int numBitVectors = 0;
  switch (typeNew) {
    case BOOLEAN_STATS:
      agg = new BooleanColumnStatsMerger();
      break;
    case LONG_STATS: {
      agg = new LongColumnStatsMerger();
      int nbvNew = countNumBitVectors(statsObjNew.getStatsData().getLongStats().getBitVectors());
      int nbvOld = countNumBitVectors(statsObjOld.getStatsData().getLongStats().getBitVectors());
      // Bit vectors are only usable when both sides have the same count.
      numBitVectors = nbvNew == nbvOld ? nbvNew : 0;
      break;
    }
    case DOUBLE_STATS: {
      agg = new DoubleColumnStatsMerger();
      int nbvNew = countNumBitVectors(statsObjNew.getStatsData().getDoubleStats().getBitVectors());
      int nbvOld = countNumBitVectors(statsObjOld.getStatsData().getDoubleStats().getBitVectors());
      numBitVectors = nbvNew == nbvOld ? nbvNew : 0;
      break;
    }
    case STRING_STATS: {
      agg = new StringColumnStatsMerger();
      int nbvNew = countNumBitVectors(statsObjNew.getStatsData().getStringStats().getBitVectors());
      int nbvOld = countNumBitVectors(statsObjOld.getStatsData().getStringStats().getBitVectors());
      numBitVectors = nbvNew == nbvOld ? nbvNew : 0;
      break;
    }
    case BINARY_STATS:
      agg = new BinaryColumnStatsMerger();
      break;
    case DECIMAL_STATS: {
      agg = new DecimalColumnStatsMerger();
      int nbvNew = countNumBitVectors(statsObjNew.getStatsData().getDecimalStats().getBitVectors());
      int nbvOld = countNumBitVectors(statsObjOld.getStatsData().getDecimalStats().getBitVectors());
      numBitVectors = nbvNew == nbvOld ? nbvNew : 0;
      break;
    }
    default:
      throw new RuntimeException("Woh, bad. Unknown stats type " + typeNew.toString());
  }
  if (numBitVectors > 0) {
    agg.ndvEstimator = new NumDistinctValueEstimator(numBitVectors);
  }
  return agg;
}
use of org.apache.hadoop.hive.metastore.api.ColumnStatisticsData._Fields in project hive by apache.
the class TestHBaseStoreCached method booleanTableStatistics.
// Due to the way our mock stuff works, we can only insert one column at a time, so we'll test
// each stat type separately. We'll test them together in the integration tests.
/**
 * Stores table-level boolean column statistics and checks that the statistics
 * read back from the store match what was written.
 */
@Test
public void booleanTableStatistics() throws Exception {
  long now = System.currentTimeMillis();
  String dbname = "default";
  String tableName = "statstable";
  String boolcol = "boolcol";
  int startTime = (int) (System.currentTimeMillis() / 1000);

  // Create a table with a single boolean column.
  List<FieldSchema> cols = new ArrayList<FieldSchema>();
  cols.add(new FieldSchema(boolcol, "boolean", "nocomment"));
  SerDeInfo serde = new SerDeInfo("serde", "seriallib", null);
  StorageDescriptor sd = new StorageDescriptor(cols, "file:/tmp", "input", "output", false, 0, serde, null, null, emptyParameters);
  Table table = new Table(tableName, dbname, "me", startTime, startTime, 0, sd, null, emptyParameters, null, null, null);
  store.createTable(table);

  long trues = 37;
  long falses = 12;
  long booleanNulls = 2;

  // Build the table-level statistics to store.
  ColumnStatistics stats = new ColumnStatistics();
  ColumnStatisticsDesc desc = new ColumnStatisticsDesc();
  desc.setLastAnalyzed(now);
  desc.setDbName(dbname);
  desc.setTableName(tableName);
  desc.setIsTblLevel(true);
  stats.setStatsDesc(desc);
  ColumnStatisticsObj obj = new ColumnStatisticsObj();
  obj.setColName(boolcol);
  obj.setColType("boolean");
  ColumnStatisticsData data = new ColumnStatisticsData();
  BooleanColumnStatsData boolData = new BooleanColumnStatsData();
  boolData.setNumTrues(trues);
  boolData.setNumFalses(falses);
  boolData.setNumNulls(booleanNulls);
  data.setBooleanStats(boolData);
  obj.setStatsData(data);
  stats.addToStatsObj(obj);
  store.updateTableColumnStatistics(stats);

  // Read the statistics back and verify the descriptor.
  stats = store.getTableColumnStatistics(dbname, tableName, Arrays.asList(boolcol));
  Assert.assertEquals(now, stats.getStatsDesc().getLastAnalyzed());
  Assert.assertEquals(dbname, stats.getStatsDesc().getDbName());
  Assert.assertEquals(tableName, stats.getStatsDesc().getTableName());
  Assert.assertTrue(stats.getStatsDesc().isIsTblLevel());
  Assert.assertEquals(1, stats.getStatsObjSize());

  // Inspect the object returned by the store, not the one we passed in,
  // so the assertions actually cover what was persisted.
  ColumnStatisticsData colData = stats.getStatsObj().get(0).getStatsData();
  Assert.assertEquals(ColumnStatisticsData._Fields.BOOLEAN_STATS, colData.getSetField());
  boolData = colData.getBooleanStats();
  Assert.assertEquals(trues, boolData.getNumTrues());
  Assert.assertEquals(falses, boolData.getNumFalses());
  Assert.assertEquals(booleanNulls, boolData.getNumNulls());
}
use of org.apache.hadoop.hive.metastore.api.ColumnStatisticsData._Fields in project hive by apache.
the class TestHBaseStoreBitVector method mockStringStats.
/**
 * Builds a string-column statistics object whose numeric values are offset by
 * {@code i}, with a fixed bit-vector string.
 *
 * @param i offset applied to the max length, average length, null count and NDV
 * @return a stats object for {@code STRING_COL} of type {@code STRING_TYPE}
 */
private static ColumnStatisticsObj mockStringStats(int i) {
  // Populate the string-specific payload first.
  StringColumnStatsData stringStats = new StringColumnStatsData();
  stringStats.setMaxColLen(1234 + 10 * i);
  stringStats.setAvgColLen(32.3 + i);
  stringStats.setNumNulls(987 + 10 * i);
  stringStats.setNumDVs(906 + i);
  stringStats.setBitVectors("{0, 1, 2, 3, 4, 5, 6, 7, 8}{0, 1, 3, 4, 5, 6, 7, 8}");

  // Wrap it in the stats-data union.
  ColumnStatisticsData statsData = new ColumnStatisticsData();
  statsData.setStringStats(stringStats);

  // Attach the column identity and the payload.
  ColumnStatisticsObj result = new ColumnStatisticsObj();
  result.setColName(STRING_COL);
  result.setColType(STRING_TYPE);
  result.setStatsData(statsData);
  return result;
}
use of org.apache.hadoop.hive.metastore.api.ColumnStatisticsData._Fields in project hive by apache.
the class TestHBaseStoreBitVector method longPartitionStatistics.
/**
 * Stores partition-level long column statistics for partition
 * {PART_KEY, INT_VAL} and checks that the statistics read back from the store
 * match what was written, including the bit vectors.
 */
@Test
public void longPartitionStatistics() throws Exception {
  createMockTableAndPartition(INT_TYPE, INT_VAL);

  // Expected values: a partition-level descriptor plus the first of the
  // pre-created long-column stats objects.
  ColumnStatisticsDesc expectedDesc = getMockPartColStatsDesc(PART_KEY, INT_VAL);
  ColumnStatisticsObj expectedObj = longColStatsObjs.get(0);
  LongColumnStatsData expectedLong = expectedObj.getStatsData().getLongStats();

  // Write the statistics to the store.
  ColumnStatistics toStore = new ColumnStatistics();
  toStore.setStatsDesc(expectedDesc);
  toStore.addToStatsObj(expectedObj);
  List<String> partitionValues = new ArrayList<String>();
  partitionValues.add(INT_VAL);
  store.updatePartitionColumnStatistics(toStore, partitionValues);

  // Read them back by partition name and column name.
  List<String> partNames = new ArrayList<String>();
  partNames.add(expectedDesc.getPartName());
  List<String> colNames = new ArrayList<String>();
  colNames.add(expectedObj.getColName());
  List<ColumnStatistics> fetchedList = store.getPartitionColumnStatistics(DB, TBL, partNames, colNames);

  // Descriptor checks.
  Assert.assertEquals(1, fetchedList.size());
  ColumnStatistics fetched = fetchedList.get(0);
  ColumnStatisticsDesc fetchedDesc = fetched.getStatsDesc();
  Assert.assertEquals(expectedDesc.getLastAnalyzed(), fetchedDesc.getLastAnalyzed());
  Assert.assertEquals(DB, fetchedDesc.getDbName());
  Assert.assertEquals(TBL, fetchedDesc.getTableName());
  Assert.assertFalse(fetchedDesc.isIsTblLevel());

  // Exactly one stats object, holding long statistics.
  Assert.assertEquals(1, fetched.getStatsObjSize());
  ColumnStatisticsData fetchedData = fetched.getStatsObj().get(0).getStatsData();
  Assert.assertEquals(ColumnStatisticsData._Fields.LONG_STATS, fetchedData.getSetField());

  // Field-by-field comparison of the long statistics.
  LongColumnStatsData fetchedLong = fetchedData.getLongStats();
  Assert.assertEquals(expectedLong.getHighValue(), fetchedLong.getHighValue());
  Assert.assertEquals(expectedLong.getLowValue(), fetchedLong.getLowValue());
  Assert.assertEquals(expectedLong.getNumNulls(), fetchedLong.getNumNulls());
  Assert.assertEquals(expectedLong.getNumDVs(), fetchedLong.getNumDVs());
  Assert.assertEquals(expectedLong.getBitVectors(), fetchedLong.getBitVectors());
}
Aggregations