Use of org.apache.hadoop.hive.metastore.api.StringColumnStatsData in project hive by apache.
In class TestHBaseStore, method stringPartitionStatistics:
@Test
public void stringPartitionStatistics() throws Exception {
  createMockTableAndPartition(STRING_TYPE, STRING_VAL);
  // Write partition-level stats for STRING_COL on partition {PART_KEY, STRING_VAL}.
  // Because of the way the mock implementation works, the table must NOT be
  // created before statistics are set on it.
  ColumnStatisticsDesc partDesc = getMockPartColStatsDesc(PART_KEY, STRING_VAL);
  // One of the pre-created string stats objects is the expected round-trip value.
  ColumnStatisticsObj statsObj = stringColStatsObjs.get(0);
  StringColumnStatsData expected = statsObj.getStatsData().getStringStats();

  // Persist the stats to the DB.
  ColumnStatistics toStore = new ColumnStatistics();
  toStore.setStatsDesc(partDesc);
  toStore.addToStatsObj(statsObj);
  List<String> partVals = new ArrayList<String>();
  partVals.add(STRING_VAL);
  store.updatePartitionColumnStatistics(toStore, partVals);

  // Read the stats back out of the DB.
  List<String> partNames = new ArrayList<String>();
  partNames.add(partDesc.getPartName());
  List<String> colNames = new ArrayList<String>();
  colNames.add(statsObj.getColName());
  List<ColumnStatistics> fromDB = store.getPartitionColumnStatistics(DB, TBL, partNames, colNames);

  // The descriptor round-trips as partition-level stats.
  Assert.assertEquals(1, fromDB.size());
  ColumnStatisticsDesc descFromDB = fromDB.get(0).getStatsDesc();
  Assert.assertEquals(partDesc.getLastAnalyzed(), descFromDB.getLastAnalyzed());
  Assert.assertEquals(DB, descFromDB.getDbName());
  Assert.assertEquals(TBL, descFromDB.getTableName());
  Assert.assertFalse(descFromDB.isIsTblLevel());

  // Exactly one stats object came back, and its union field is STRING_STATS.
  Assert.assertEquals(1, fromDB.get(0).getStatsObjSize());
  ColumnStatisticsData dataFromDB = fromDB.get(0).getStatsObj().get(0).getStatsData();
  Assert.assertEquals(ColumnStatisticsData._Fields.STRING_STATS, dataFromDB.getSetField());

  // The string stats values round-trip.
  StringColumnStatsData actual = dataFromDB.getStringStats();
  Assert.assertEquals(expected.getMaxColLen(), actual.getMaxColLen());
  Assert.assertEquals(expected.getAvgColLen(), actual.getAvgColLen(), 0.01);
  Assert.assertEquals(expected.getNumNulls(), actual.getNumNulls());
  Assert.assertEquals(expected.getNumDVs(), actual.getNumDVs());
}
Use of org.apache.hadoop.hive.metastore.api.StringColumnStatsData in project hive by apache.
In class MetaDataFormatUtils, method formatWithIndentation:
/**
 * Appends one column's name, type, statistics (when available) and comment to
 * {@code tableInfo}, padding the name and type to {@link #ALIGNMENT} characters
 * so the output lines up as a table.
 *
 * @param colName    column name
 * @param colType    column type string
 * @param colComment column comment; may be multi-line (indented per column widths)
 * @param tableInfo  builder receiving the formatted output
 * @param colStats   stats to search for this column; when null, no stats
 *                   columns are emitted at all
 */
private static void formatWithIndentation(String colName, String colType, String colComment, StringBuilder tableInfo, List<ColumnStatisticsObj> colStats) {
  tableInfo.append(String.format("%-" + ALIGNMENT + "s", colName)).append(FIELD_DELIM);
  tableInfo.append(String.format("%-" + ALIGNMENT + "s", colType)).append(FIELD_DELIM);
  if (colStats != null) {
    ColumnStatisticsObj cso = getColumnStatisticsObject(colName, colType, colStats);
    if (cso != null) {
      // The stats data is a thrift union: exactly one of the branches below applies.
      ColumnStatisticsData csd = cso.getStatsData();
      if (csd.isSetBinaryStats()) {
        BinaryColumnStatsData bcsd = csd.getBinaryStats();
        appendColumnStats(tableInfo, "", "", bcsd.getNumNulls(), "", bcsd.getAvgColLen(), bcsd.getMaxColLen(), "", "");
      } else if (csd.isSetStringStats()) {
        StringColumnStatsData scsd = csd.getStringStats();
        appendColumnStats(tableInfo, "", "", scsd.getNumNulls(), scsd.getNumDVs(), scsd.getAvgColLen(), scsd.getMaxColLen(), "", "");
      } else if (csd.isSetBooleanStats()) {
        BooleanColumnStatsData bcsd = csd.getBooleanStats();
        appendColumnStats(tableInfo, "", "", bcsd.getNumNulls(), "", "", "", bcsd.getNumTrues(), bcsd.getNumFalses());
      } else if (csd.isSetDecimalStats()) {
        DecimalColumnStatsData dcsd = csd.getDecimalStats();
        appendColumnStats(tableInfo, convertToString(dcsd.getLowValue()), convertToString(dcsd.getHighValue()), dcsd.getNumNulls(), dcsd.getNumDVs(), "", "", "", "");
      } else if (csd.isSetDoubleStats()) {
        DoubleColumnStatsData dcsd = csd.getDoubleStats();
        appendColumnStats(tableInfo, dcsd.getLowValue(), dcsd.getHighValue(), dcsd.getNumNulls(), dcsd.getNumDVs(), "", "", "", "");
      } else if (csd.isSetLongStats()) {
        LongColumnStatsData lcsd = csd.getLongStats();
        appendColumnStats(tableInfo, lcsd.getLowValue(), lcsd.getHighValue(), lcsd.getNumNulls(), lcsd.getNumDVs(), "", "", "", "");
      } else if (csd.isSetDateStats()) {
        DateColumnStatsData dcsd = csd.getDateStats();
        appendColumnStats(tableInfo, convertToString(dcsd.getLowValue()), convertToString(dcsd.getHighValue()), dcsd.getNumNulls(), dcsd.getNumDVs(), "", "", "", "");
      }
    } else {
      // No stats recorded for this column: emit empty placeholders to keep columns aligned.
      appendColumnStats(tableInfo, "", "", "", "", "", "", "", "");
    }
  }
  // Comments may span multiple lines; continuation lines are indented past the
  // (possibly wider-than-ALIGNMENT) name and type columns.
  int colNameLength = Math.max(ALIGNMENT, colName.length());
  int colTypeLength = Math.max(ALIGNMENT, colType.length());
  indentMultilineValue(colComment, tableInfo, new int[] { colNameLength, colTypeLength }, false);
}
Use of org.apache.hadoop.hive.metastore.api.StringColumnStatsData in project hive by apache.
In class MetaDataFormatUtils, method formatWithoutIndentation:
/**
 * Emits a single column description line without column alignment: name, type,
 * per-type statistics (when present) and the escaped comment, separated by
 * FIELD_DELIM and terminated by LINE_DELIM.
 */
private static void formatWithoutIndentation(String name, String type, String comment, StringBuilder colBuffer, List<ColumnStatisticsObj> colStats) {
  colBuffer.append(name).append(FIELD_DELIM).append(type).append(FIELD_DELIM);
  if (colStats != null) {
    ColumnStatisticsObj statsObj = getColumnStatisticsObject(name, type, colStats);
    if (statsObj == null) {
      // No stats for this column: keep the field positions with empty values.
      appendColumnStatsNoFormatting(colBuffer, "", "", "", "", "", "", "", "");
    } else {
      // The stats data is a thrift union; exactly one branch below is set.
      ColumnStatisticsData statsData = statsObj.getStatsData();
      if (statsData.isSetBinaryStats()) {
        BinaryColumnStatsData binary = statsData.getBinaryStats();
        appendColumnStatsNoFormatting(colBuffer, "", "", binary.getNumNulls(), "", binary.getAvgColLen(), binary.getMaxColLen(), "", "");
      } else if (statsData.isSetStringStats()) {
        StringColumnStatsData str = statsData.getStringStats();
        appendColumnStatsNoFormatting(colBuffer, "", "", str.getNumNulls(), str.getNumDVs(), str.getAvgColLen(), str.getMaxColLen(), "", "");
      } else if (statsData.isSetBooleanStats()) {
        BooleanColumnStatsData bool = statsData.getBooleanStats();
        appendColumnStatsNoFormatting(colBuffer, "", "", bool.getNumNulls(), "", "", "", bool.getNumTrues(), bool.getNumFalses());
      } else if (statsData.isSetDecimalStats()) {
        DecimalColumnStatsData dec = statsData.getDecimalStats();
        appendColumnStatsNoFormatting(colBuffer, convertToString(dec.getLowValue()), convertToString(dec.getHighValue()), dec.getNumNulls(), dec.getNumDVs(), "", "", "", "");
      } else if (statsData.isSetDoubleStats()) {
        DoubleColumnStatsData dbl = statsData.getDoubleStats();
        appendColumnStatsNoFormatting(colBuffer, dbl.getLowValue(), dbl.getHighValue(), dbl.getNumNulls(), dbl.getNumDVs(), "", "", "", "");
      } else if (statsData.isSetLongStats()) {
        LongColumnStatsData lng = statsData.getLongStats();
        appendColumnStatsNoFormatting(colBuffer, lng.getLowValue(), lng.getHighValue(), lng.getNumNulls(), lng.getNumDVs(), "", "", "", "");
      } else if (statsData.isSetDateStats()) {
        DateColumnStatsData date = statsData.getDateStats();
        appendColumnStatsNoFormatting(colBuffer, convertToString(date.getLowValue()), convertToString(date.getHighValue()), date.getNumNulls(), date.getNumDVs(), "", "", "", "");
      }
    }
  }
  colBuffer.append(comment == null ? "" : HiveStringUtils.escapeJava(comment));
  colBuffer.append(LINE_DELIM);
}
Use of org.apache.hadoop.hive.metastore.api.StringColumnStatsData in project hive by apache.
In class StatObjectConverter, method fillColumnStatisticsData:
/**
 * Populates {@code data} (a thrift union) with the column-statistics variant
 * matching {@code colType}, converting raw SQL result objects into thrift
 * stats types. For numeric-like types, the number of distinct values (NDV) is
 * either taken directly from {@code dist} or, when
 * {@code useDensityFunctionForNDVEstimation} is set and low/high/average are
 * available, estimated as (high - low) / avg and clamped to
 * [{@code dist}, {@code sumDist}].
 *
 * @param colType column type name (case-insensitive)
 * @param data union to fill; exactly one variant is set on return
 * @param llow/lhigh long-typed low/high values (integral and timestamp types)
 * @param dlow/dhigh double-typed low/high values
 * @param declow/dechigh decimal low/high values, as BigDecimal or String
 * @param nulls/dist/avglen/maxlen/trues/falses raw aggregate values
 * @param avgLong/avgDouble/avgDecimal average gap values for density estimation
 * @param sumDist upper bound on NDV (sum of per-partition distinct counts)
 * @throws MetaException if a raw value cannot be converted
 */
public static void fillColumnStatisticsData(String colType, ColumnStatisticsData data, Object llow, Object lhigh, Object dlow, Object dhigh, Object declow, Object dechigh, Object nulls, Object dist, Object avglen, Object maxlen, Object trues, Object falses, Object avgLong, Object avgDouble, Object avgDecimal, Object sumDist, boolean useDensityFunctionForNDVEstimation) throws MetaException {
  colType = colType.toLowerCase();
  if (colType.equals("boolean")) {
    BooleanColumnStatsData boolStats = new BooleanColumnStatsData();
    boolStats.setNumFalses(MetaStoreDirectSql.extractSqlLong(falses));
    boolStats.setNumTrues(MetaStoreDirectSql.extractSqlLong(trues));
    boolStats.setNumNulls(MetaStoreDirectSql.extractSqlLong(nulls));
    data.setBooleanStats(boolStats);
  } else if (colType.equals("string") || colType.startsWith("varchar") || colType.startsWith("char")) {
    StringColumnStatsData stringStats = new StringColumnStatsData();
    stringStats.setNumNulls(MetaStoreDirectSql.extractSqlLong(nulls));
    stringStats.setAvgColLen(MetaStoreDirectSql.extractSqlDouble(avglen));
    stringStats.setMaxColLen(MetaStoreDirectSql.extractSqlLong(maxlen));
    stringStats.setNumDVs(MetaStoreDirectSql.extractSqlLong(dist));
    data.setStringStats(stringStats);
  } else if (colType.equals("binary")) {
    BinaryColumnStatsData binaryStats = new BinaryColumnStatsData();
    binaryStats.setNumNulls(MetaStoreDirectSql.extractSqlLong(nulls));
    binaryStats.setAvgColLen(MetaStoreDirectSql.extractSqlDouble(avglen));
    binaryStats.setMaxColLen(MetaStoreDirectSql.extractSqlLong(maxlen));
    data.setBinaryStats(binaryStats);
  } else if (colType.equals("bigint") || colType.equals("int") || colType.equals("smallint") || colType.equals("tinyint") || colType.equals("timestamp")) {
    LongColumnStatsData longStats = new LongColumnStatsData();
    longStats.setNumNulls(MetaStoreDirectSql.extractSqlLong(nulls));
    if (lhigh != null) {
      longStats.setHighValue(MetaStoreDirectSql.extractSqlLong(lhigh));
    }
    if (llow != null) {
      longStats.setLowValue(MetaStoreDirectSql.extractSqlLong(llow));
    }
    long lowerBound = MetaStoreDirectSql.extractSqlLong(dist);
    long higherBound = MetaStoreDirectSql.extractSqlLong(sumDist);
    if (useDensityFunctionForNDVEstimation && lhigh != null && llow != null && avgLong != null && MetaStoreDirectSql.extractSqlDouble(avgLong) != 0.0) {
      // We have an estimation plus a lower and higher bound; use the
      // estimation only when it falls between the bounds.
      long estimation = MetaStoreDirectSql.extractSqlLong((MetaStoreDirectSql.extractSqlLong(lhigh) - MetaStoreDirectSql.extractSqlLong(llow)) / MetaStoreDirectSql.extractSqlDouble(avgLong));
      if (estimation < lowerBound) {
        longStats.setNumDVs(lowerBound);
      } else if (estimation > higherBound) {
        longStats.setNumDVs(higherBound);
      } else {
        longStats.setNumDVs(estimation);
      }
    } else {
      longStats.setNumDVs(lowerBound);
    }
    data.setLongStats(longStats);
  } else if (colType.equals("date")) {
    DateColumnStatsData dateStats = new DateColumnStatsData();
    dateStats.setNumNulls(MetaStoreDirectSql.extractSqlLong(nulls));
    // Date low/high arrive as epoch-day longs and are wrapped in thrift Date.
    if (lhigh != null) {
      dateStats.setHighValue(new Date(MetaStoreDirectSql.extractSqlLong(lhigh)));
    }
    if (llow != null) {
      dateStats.setLowValue(new Date(MetaStoreDirectSql.extractSqlLong(llow)));
    }
    long lowerBound = MetaStoreDirectSql.extractSqlLong(dist);
    long higherBound = MetaStoreDirectSql.extractSqlLong(sumDist);
    if (useDensityFunctionForNDVEstimation && lhigh != null && llow != null && avgLong != null && MetaStoreDirectSql.extractSqlDouble(avgLong) != 0.0) {
      // We have an estimation plus a lower and higher bound; use the
      // estimation only when it falls between the bounds.
      long estimation = MetaStoreDirectSql.extractSqlLong((MetaStoreDirectSql.extractSqlLong(lhigh) - MetaStoreDirectSql.extractSqlLong(llow)) / MetaStoreDirectSql.extractSqlDouble(avgLong));
      if (estimation < lowerBound) {
        dateStats.setNumDVs(lowerBound);
      } else if (estimation > higherBound) {
        dateStats.setNumDVs(higherBound);
      } else {
        dateStats.setNumDVs(estimation);
      }
    } else {
      dateStats.setNumDVs(lowerBound);
    }
    data.setDateStats(dateStats);
  } else if (colType.equals("double") || colType.equals("float")) {
    DoubleColumnStatsData doubleStats = new DoubleColumnStatsData();
    doubleStats.setNumNulls(MetaStoreDirectSql.extractSqlLong(nulls));
    if (dhigh != null) {
      doubleStats.setHighValue(MetaStoreDirectSql.extractSqlDouble(dhigh));
    }
    if (dlow != null) {
      doubleStats.setLowValue(MetaStoreDirectSql.extractSqlDouble(dlow));
    }
    long lowerBound = MetaStoreDirectSql.extractSqlLong(dist);
    long higherBound = MetaStoreDirectSql.extractSqlLong(sumDist);
    if (useDensityFunctionForNDVEstimation && dhigh != null && dlow != null && avgDouble != null && MetaStoreDirectSql.extractSqlDouble(avgDouble) != 0.0) {
      // NOTE(review): this truncates dhigh/dlow to longs before subtracting,
      // unlike the long/date branches — presumably intentional; confirm.
      long estimation = MetaStoreDirectSql.extractSqlLong((MetaStoreDirectSql.extractSqlLong(dhigh) - MetaStoreDirectSql.extractSqlLong(dlow)) / MetaStoreDirectSql.extractSqlDouble(avgDouble));
      if (estimation < lowerBound) {
        doubleStats.setNumDVs(lowerBound);
      } else if (estimation > higherBound) {
        doubleStats.setNumDVs(higherBound);
      } else {
        doubleStats.setNumDVs(estimation);
      }
    } else {
      doubleStats.setNumDVs(lowerBound);
    }
    data.setDoubleStats(doubleStats);
  } else if (colType.startsWith("decimal")) {
    DecimalColumnStatsData decimalStats = new DecimalColumnStatsData();
    decimalStats.setNumNulls(MetaStoreDirectSql.extractSqlLong(nulls));
    // Decimal low/high may arrive as BigDecimal or as their String rendering.
    Decimal low = null;
    Decimal high = null;
    BigDecimal blow = null;
    BigDecimal bhigh = null;
    if (dechigh instanceof BigDecimal) {
      bhigh = (BigDecimal) dechigh;
      high = new Decimal(ByteBuffer.wrap(bhigh.unscaledValue().toByteArray()), (short) bhigh.scale());
    } else if (dechigh instanceof String) {
      bhigh = new BigDecimal((String) dechigh);
      high = createThriftDecimal((String) dechigh);
    }
    decimalStats.setHighValue(high);
    if (declow instanceof BigDecimal) {
      blow = (BigDecimal) declow;
      low = new Decimal(ByteBuffer.wrap(blow.unscaledValue().toByteArray()), (short) blow.scale());
    } else if (declow instanceof String) {
      // BUG FIX: this previously tested "dechigh instanceof String", a
      // copy-paste error from the high-value branch above. That dropped the
      // low value whenever declow was a String but dechigh was not (and could
      // mis-cast declow when only dechigh was a String).
      blow = new BigDecimal((String) declow);
      low = createThriftDecimal((String) declow);
    }
    decimalStats.setLowValue(low);
    long lowerBound = MetaStoreDirectSql.extractSqlLong(dist);
    long higherBound = MetaStoreDirectSql.extractSqlLong(sumDist);
    if (useDensityFunctionForNDVEstimation && dechigh != null && declow != null && avgDecimal != null && MetaStoreDirectSql.extractSqlDouble(avgDecimal) != 0.0) {
      // NOTE(review): floatValue() here loses precision for wide decimals and
      // the outer extractSqlLong wrap is redundant — left as-is to avoid
      // changing estimation results.
      long estimation = MetaStoreDirectSql.extractSqlLong(MetaStoreDirectSql.extractSqlLong(bhigh.subtract(blow).floatValue() / MetaStoreDirectSql.extractSqlDouble(avgDecimal)));
      if (estimation < lowerBound) {
        decimalStats.setNumDVs(lowerBound);
      } else if (estimation > higherBound) {
        decimalStats.setNumDVs(higherBound);
      } else {
        decimalStats.setNumDVs(estimation);
      }
    } else {
      decimalStats.setNumDVs(lowerBound);
    }
    data.setDecimalStats(decimalStats);
  }
}
Use of org.apache.hadoop.hive.metastore.api.StringColumnStatsData in project hive by apache.
In class TestHBaseAggrStatsCacheIntegration, method hit:
// Verifies the aggregate-stats cache hit path: stats are built from raw
// partition stats on the first request, served from memory on the second,
// and re-fetched from HBase after the memory cache is flushed.
@Test
public void hit() throws Exception {
String dbName = "default";
String tableName = "hit";
List<String> partVals1 = Arrays.asList("today");
List<String> partVals2 = Arrays.asList("yesterday");
long now = System.currentTimeMillis();
// Table with two columns (boolean, varchar) partitioned by "ds".
List<FieldSchema> cols = new ArrayList<>();
cols.add(new FieldSchema("col1", "boolean", "nocomment"));
cols.add(new FieldSchema("col2", "varchar", "nocomment"));
SerDeInfo serde = new SerDeInfo("serde", "seriallib", null);
StorageDescriptor sd = new StorageDescriptor(cols, "file:/tmp", "input", "output", false, 0, serde, null, null, Collections.<String, String>emptyMap());
List<FieldSchema> partCols = new ArrayList<>();
partCols.add(new FieldSchema("ds", "string", ""));
Table table = new Table(tableName, dbName, "me", (int) now, (int) now, 0, sd, partCols, Collections.<String, String>emptyMap(), null, null, null);
store.createTable(table);
// Create both partitions and attach identical column stats to each, so the
// aggregate over the two partitions is exactly double the per-partition
// boolean counts (string stats aggregate differently; see checker below).
for (List<String> partVals : Arrays.asList(partVals1, partVals2)) {
StorageDescriptor psd = new StorageDescriptor(sd);
psd.setLocation("file:/tmp/default/hit/ds=" + partVals.get(0));
Partition part = new Partition(partVals, dbName, tableName, (int) now, (int) now, psd, Collections.<String, String>emptyMap());
store.addPartition(part);
ColumnStatistics cs = new ColumnStatistics();
ColumnStatisticsDesc desc = new ColumnStatisticsDesc(false, dbName, tableName);
desc.setLastAnalyzed(now);
desc.setPartName("ds=" + partVals.get(0));
cs.setStatsDesc(desc);
// col1: boolean stats.
ColumnStatisticsObj obj = new ColumnStatisticsObj();
obj.setColName("col1");
obj.setColType("boolean");
ColumnStatisticsData data = new ColumnStatisticsData();
BooleanColumnStatsData bcsd = new BooleanColumnStatsData();
bcsd.setNumFalses(10);
bcsd.setNumTrues(20);
bcsd.setNumNulls(30);
data.setBooleanStats(bcsd);
obj.setStatsData(data);
cs.addToStatsObj(obj);
// col2: string stats.
obj = new ColumnStatisticsObj();
obj.setColName("col2");
obj.setColType("varchar");
data = new ColumnStatisticsData();
StringColumnStatsData scsd = new StringColumnStatsData();
scsd.setAvgColLen(10.3);
scsd.setMaxColLen(2000);
scsd.setNumNulls(3);
scsd.setNumDVs(12342);
data.setStringStats(scsd);
obj.setStatsData(data);
cs.addToStatsObj(obj);
store.updatePartitionColumnStatistics(cs, partVals);
}
// Expected aggregate over the two identical partitions: boolean counts and
// numNulls sum (x2); avgColLen/maxColLen/numDVs take the max across parts.
Checker statChecker = new Checker() {
@Override
public void checkStats(AggrStats aggrStats) throws Exception {
Assert.assertEquals(2, aggrStats.getPartsFound());
Assert.assertEquals(2, aggrStats.getColStatsSize());
ColumnStatisticsObj cso = aggrStats.getColStats().get(0);
Assert.assertEquals("col1", cso.getColName());
Assert.assertEquals("boolean", cso.getColType());
BooleanColumnStatsData bcsd = cso.getStatsData().getBooleanStats();
Assert.assertEquals(20, bcsd.getNumFalses());
Assert.assertEquals(40, bcsd.getNumTrues());
Assert.assertEquals(60, bcsd.getNumNulls());
cso = aggrStats.getColStats().get(1);
Assert.assertEquals("col2", cso.getColName());
Assert.assertEquals("varchar", cso.getColType());
StringColumnStatsData scsd = cso.getStatsData().getStringStats();
Assert.assertEquals(10.3, scsd.getAvgColLen(), 0.1);
Assert.assertEquals(2000, scsd.getMaxColLen());
Assert.assertEquals(6, scsd.getNumNulls());
Assert.assertEquals(12342, scsd.getNumDVs());
}
};
AggrStats aggrStats = store.get_aggr_stats_for(dbName, tableName, Arrays.asList("ds=today", "ds=yesterday"), Arrays.asList("col1", "col2"));
statChecker.checkStats(aggrStats);
// Check that we had to build it from the stats: 2 gets, 2 misses, no hbase hits.
Assert.assertEquals(0, store.backdoor().getStatsCache().hbaseHits.getCnt());
Assert.assertEquals(2, store.backdoor().getStatsCache().totalGets.getCnt());
Assert.assertEquals(2, store.backdoor().getStatsCache().misses.getCnt());
// Call again, this time it should come from memory (miss count unchanged).
// Also, reverse the partition name order this time to assure that we still hit.
aggrStats = store.get_aggr_stats_for(dbName, tableName, Arrays.asList("ds=yesterday", "ds=today"), Arrays.asList("col1", "col2"));
statChecker.checkStats(aggrStats);
Assert.assertEquals(0, store.backdoor().getStatsCache().hbaseHits.getCnt());
Assert.assertEquals(4, store.backdoor().getStatsCache().totalGets.getCnt());
Assert.assertEquals(2, store.backdoor().getStatsCache().misses.getCnt());
store.backdoor().getStatsCache().flushMemory();
// Call again, this time it should come from hbase (hbaseHits goes to 2).
aggrStats = store.get_aggr_stats_for(dbName, tableName, Arrays.asList("ds=today", "ds=yesterday"), Arrays.asList("col1", "col2"));
statChecker.checkStats(aggrStats);
Assert.assertEquals(2, store.backdoor().getStatsCache().hbaseHits.getCnt());
Assert.assertEquals(6, store.backdoor().getStatsCache().totalGets.getCnt());
Assert.assertEquals(2, store.backdoor().getStatsCache().misses.getCnt());
}
Aggregations