Use of org.apache.hadoop.hive.metastore.api.DecimalColumnStatsData in project Hive by Apache.
From the class TestHBaseStore, method decimalPartitionStatistics:
@Test
public void decimalPartitionStatistics() throws Exception {
  createMockTableAndPartition(DECIMAL_TYPE, DECIMAL_VAL);
  // Add partition stats for: DECIMAL_COL and partition: {PART_KEY, DECIMAL_VAL} to DB
  // Because of the way our mock implementation works we actually need to not create the table
  // before we set statistics on it.
  ColumnStatistics stats = new ColumnStatistics();
  // Get a default ColumnStatisticsDesc for partition level stats
  ColumnStatisticsDesc desc = getMockPartColStatsDesc(PART_KEY, DECIMAL_VAL);
  stats.setStatsDesc(desc);
  // Get one of the pre-created ColumnStatisticsObj
  ColumnStatisticsObj obj = decimalColStatsObjs.get(0);
  DecimalColumnStatsData decimalData = obj.getStatsData().getDecimalStats();
  // Add to DB
  stats.addToStatsObj(obj);
  List<String> parVals = new ArrayList<String>();
  parVals.add(DECIMAL_VAL);
  store.updatePartitionColumnStatistics(stats, parVals);
  // Get from DB
  List<String> partNames = new ArrayList<String>();
  partNames.add(desc.getPartName());
  List<String> colNames = new ArrayList<String>();
  colNames.add(obj.getColName());
  List<ColumnStatistics> statsFromDB = store.getPartitionColumnStatistics(DB, TBL, partNames, colNames);
  // Compare ColumnStatisticsDesc
  Assert.assertEquals(1, statsFromDB.size());
  Assert.assertEquals(desc.getLastAnalyzed(), statsFromDB.get(0).getStatsDesc().getLastAnalyzed());
  Assert.assertEquals(DB, statsFromDB.get(0).getStatsDesc().getDbName());
  Assert.assertEquals(TBL, statsFromDB.get(0).getStatsDesc().getTableName());
  Assert.assertFalse(statsFromDB.get(0).getStatsDesc().isIsTblLevel());
  // Compare ColumnStatisticsObj
  Assert.assertEquals(1, statsFromDB.get(0).getStatsObjSize());
  ColumnStatisticsObj objFromDB = statsFromDB.get(0).getStatsObj().get(0);
  ColumnStatisticsData dataFromDB = objFromDB.getStatsData();
  // Compare ColumnStatisticsData
  Assert.assertEquals(ColumnStatisticsData._Fields.DECIMAL_STATS, dataFromDB.getSetField());
  // Compare DecimalColumnStatsData
  DecimalColumnStatsData decimalDataFromDB = dataFromDB.getDecimalStats();
  Assert.assertEquals(decimalData.getHighValue(), decimalDataFromDB.getHighValue());
  Assert.assertEquals(decimalData.getLowValue(), decimalDataFromDB.getLowValue());
  Assert.assertEquals(decimalData.getNumNulls(), decimalDataFromDB.getNumNulls());
  Assert.assertEquals(decimalData.getNumDVs(), decimalDataFromDB.getNumDVs());
}
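ColumnStatisticsData is a Thrift union, so exactly one stats field is populated at a time; the getSetField() assertion above is the standard guard before calling the typed getter. A minimal sketch of that guard as a helper (the method readDecimalStats is hypothetical, not part of Hive):

static DecimalColumnStatsData readDecimalStats(ColumnStatisticsObj obj) {
  ColumnStatisticsData data = obj.getStatsData();
  // getSetField() reports which union member is currently set.
  if (data.getSetField() != ColumnStatisticsData._Fields.DECIMAL_STATS) {
    throw new IllegalArgumentException("expected DECIMAL_STATS, got " + data.getSetField());
  }
  return data.getDecimalStats();
}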
Use of org.apache.hadoop.hive.metastore.api.DecimalColumnStatsData in project Hive by Apache.
From the class TestHBaseStore, method mockDecimalStats:
private static ColumnStatisticsObj mockDecimalStats(int i) {
  // The mock encodes the unscaled value as the ASCII digits of the number; the tests
  // only need the bytes to survive a store/read round trip, not to be numerically decoded.
  Decimal high = new Decimal();
  high.setScale((short) 3);
  String strHigh = String.valueOf(3876 + 100 * i);
  high.setUnscaled(strHigh.getBytes());
  Decimal low = new Decimal();
  low.setScale((short) 3);
  String strLow = String.valueOf(38 + i);
  low.setUnscaled(strLow.getBytes());
  long nulls = 13 + i;
  long dVs = 923947293L + 100 * i;
  ColumnStatisticsObj colStatsObj = new ColumnStatisticsObj();
  colStatsObj.setColName(DECIMAL_COL);
  colStatsObj.setColType(DECIMAL_TYPE);
  ColumnStatisticsData data = new ColumnStatisticsData();
  DecimalColumnStatsData decimalData = new DecimalColumnStatsData();
  decimalData.setHighValue(high);
  decimalData.setLowValue(low);
  decimalData.setNumNulls(nulls);
  decimalData.setNumDVs(dVs);
  data.setDecimalStats(decimalData);
  colStatsObj.setStatsData(data);
  return colStatsObj;
}
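Note that in production code the unscaled bytes of a Thrift Decimal normally carry the two's-complement big-endian form produced by BigInteger.toByteArray() (this appears to be what StatObjectConverter.createThriftDecimal produces, given how HBaseUtils.getDoubleValue reads it back in the tests below), whereas the mock above stores ASCII digit bytes. A hedged sketch of the usual conversions, with helper names of my own:

import java.math.BigDecimal;
import java.math.BigInteger;

// Hypothetical helpers illustrating the assumed Decimal <-> BigDecimal mapping.
static Decimal toThriftDecimal(BigDecimal d) {
  Decimal t = new Decimal();
  t.setScale((short) d.scale());
  t.setUnscaled(d.unscaledValue().toByteArray()); // two's-complement big-endian bytes
  return t;
}

static BigDecimal fromThriftDecimal(Decimal d) {
  return new BigDecimal(new BigInteger(d.getUnscaled()), d.getScale());
}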
Use of org.apache.hadoop.hive.metastore.api.DecimalColumnStatsData in project Hive by Apache.
From the class TestHBaseAggregateStatsExtrapolation, method allPartitionsHaveBitVectorStatusDecimal:
@Test
public void allPartitionsHaveBitVectorStatusDecimal() throws Exception {
  String dbName = "default";
  String tableName = "snp";
  long now = System.currentTimeMillis();
  List<FieldSchema> cols = new ArrayList<>();
  cols.add(new FieldSchema("col1_decimal", "decimal", "nocomment"));
  SerDeInfo serde = new SerDeInfo("serde", "seriallib", null);
  StorageDescriptor sd = new StorageDescriptor(cols, "file:/tmp", "input", "output", false, 0, serde, null, null, Collections.<String, String>emptyMap());
  List<FieldSchema> partCols = new ArrayList<>();
  partCols.add(new FieldSchema("ds", "string", ""));
  Table table = new Table(tableName, dbName, "me", (int) now, (int) now, 0, sd, partCols, Collections.<String, String>emptyMap(), null, null, null);
  store.createTable(table);
  // Create 10 partitions, each with decimal stats on col1_decimal and the same bit vectors.
  List<List<String>> partVals = new ArrayList<>();
  for (int i = 0; i < 10; i++) {
    List<String> partVal = Arrays.asList("" + i);
    partVals.add(partVal);
    StorageDescriptor psd = new StorageDescriptor(sd);
    psd.setLocation("file:/tmp/default/hit/ds=" + partVal);
    Partition part = new Partition(partVal, dbName, tableName, (int) now, (int) now, psd, Collections.<String, String>emptyMap());
    store.addPartition(part);
    ColumnStatistics cs = new ColumnStatistics();
    ColumnStatisticsDesc desc = new ColumnStatisticsDesc(false, dbName, tableName);
    desc.setLastAnalyzed(now);
    desc.setPartName("ds=" + partVal);
    cs.setStatsDesc(desc);
    ColumnStatisticsObj obj = new ColumnStatisticsObj();
    obj.setColName("col1_decimal");
    obj.setColType("decimal");
    ColumnStatisticsData data = new ColumnStatisticsData();
    DecimalColumnStatsData dcsd = new DecimalColumnStatsData();
    dcsd.setHighValue(StatObjectConverter.createThriftDecimal("" + (1000 + i)));
    dcsd.setLowValue(StatObjectConverter.createThriftDecimal("" + (-1000 - i)));
    dcsd.setNumNulls(i);
    dcsd.setNumDVs(10 * i + 1);
    dcsd.setBitVectors(bitVectors);
    data.setDecimalStats(dcsd);
    obj.setStatsData(data);
    cs.addToStatsObj(obj);
    store.updatePartitionColumnStatistics(cs, partVal);
  }
  // Expected aggregate over all 10 partitions.
  Checker statChecker = new Checker() {
    @Override
    public void checkStats(AggrStats aggrStats) throws Exception {
      Assert.assertEquals(10, aggrStats.getPartsFound());
      Assert.assertEquals(1, aggrStats.getColStatsSize());
      ColumnStatisticsObj cso = aggrStats.getColStats().get(0);
      Assert.assertEquals("col1_decimal", cso.getColName());
      Assert.assertEquals("decimal", cso.getColType());
      DecimalColumnStatsData lcsd = cso.getStatsData().getDecimalStats();
      Assert.assertEquals(1009, HBaseUtils.getDoubleValue(lcsd.getHighValue()), 0.01);
      Assert.assertEquals(-1009, HBaseUtils.getDoubleValue(lcsd.getLowValue()), 0.01);
      Assert.assertEquals(45, lcsd.getNumNulls());
      Assert.assertEquals(3, lcsd.getNumDVs());
    }
  };
  List<String> partNames = new ArrayList<>();
  for (int i = 0; i < 10; i++) {
    partNames.add("ds=" + i);
  }
  AggrStats aggrStats = store.get_aggr_stats_for(dbName, tableName, partNames, Arrays.asList("col1_decimal"));
  statChecker.checkStats(aggrStats);
}
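To sanity-check the expected aggregates: partition i writes high = 1000 + i and low = -(1000 + i), so over i = 0..9 the aggregate high is 1009 and the low is -1009; numNulls is the sum 0 + 1 + ... + 9 = 45. The asserted NDV of 3, however, does not come from the per-partition numDVs values (10 * i + 1): since every partition carries the identical bitVectors fixture, the merged bit-vector estimate collapses to the distinct values encoded in that one fixture, which here is presumably three.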
Use of org.apache.hadoop.hive.metastore.api.DecimalColumnStatsData in project Hive by Apache.
From the class ColumnStatsAggregatorFactory, method newColumnStaticsObj:
public static ColumnStatisticsObj newColumnStaticsObj(String colName, String colType, _Fields type) {
  ColumnStatisticsObj cso = new ColumnStatisticsObj();
  ColumnStatisticsData csd = new ColumnStatisticsData();
  cso.setColName(colName);
  cso.setColType(colType);
  switch (type) {
    case BOOLEAN_STATS:
      csd.setBooleanStats(new BooleanColumnStatsData());
      break;
    case LONG_STATS:
      csd.setLongStats(new LongColumnStatsData());
      break;
    case DOUBLE_STATS:
      csd.setDoubleStats(new DoubleColumnStatsData());
      break;
    case STRING_STATS:
      csd.setStringStats(new StringColumnStatsData());
      break;
    case BINARY_STATS:
      csd.setBinaryStats(new BinaryColumnStatsData());
      break;
    case DECIMAL_STATS:
      csd.setDecimalStats(new DecimalColumnStatsData());
      break;
    default:
      throw new RuntimeException("Woh, bad. Unknown stats type!");
  }
  cso.setStatsData(csd);
  return cso;
}
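A minimal usage sketch (column name and type are illustrative): the factory returns a ColumnStatisticsObj whose union member is already selected, ready for an aggregator to accumulate into.

ColumnStatisticsObj cso = ColumnStatsAggregatorFactory.newColumnStaticsObj(
    "col1_decimal", "decimal", ColumnStatisticsData._Fields.DECIMAL_STATS);
// The decimal member exists but is empty; fill it as stats are merged.
DecimalColumnStatsData agg = cso.getStatsData().getDecimalStats();
agg.setNumNulls(0L);
agg.setNumDVs(0L);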
Use of org.apache.hadoop.hive.metastore.api.DecimalColumnStatsData in project Hive by Apache.
From the class DecimalColumnStatsAggregator, method extrapolate:
@Override
public void extrapolate(ColumnStatisticsData extrapolateData, int numParts, int numPartsWithStats, Map<String, Double> adjustedIndexMap, Map<String, ColumnStatisticsData> adjustedStatsMap, double densityAvg) {
  int rightBorderInd = numParts;
  DecimalColumnStatsData extrapolateDecimalData = new DecimalColumnStatsData();
  Map<String, DecimalColumnStatsData> extractedAdjustedStatsMap = new HashMap<>();
  for (Map.Entry<String, ColumnStatisticsData> entry : adjustedStatsMap.entrySet()) {
    extractedAdjustedStatsMap.put(entry.getKey(), entry.getValue().getDecimalStats());
  }
  List<Map.Entry<String, DecimalColumnStatsData>> list = new LinkedList<Map.Entry<String, DecimalColumnStatsData>>(extractedAdjustedStatsMap.entrySet());
  // get the lowValue
  Collections.sort(list, new Comparator<Map.Entry<String, DecimalColumnStatsData>>() {
    public int compare(Map.Entry<String, DecimalColumnStatsData> o1, Map.Entry<String, DecimalColumnStatsData> o2) {
      return o1.getValue().getLowValue().compareTo(o2.getValue().getLowValue());
    }
  });
  double minInd = adjustedIndexMap.get(list.get(0).getKey());
  double maxInd = adjustedIndexMap.get(list.get(list.size() - 1).getKey());
  double lowValue = 0;
  double min = HBaseUtils.getDoubleValue(list.get(0).getValue().getLowValue());
  double max = HBaseUtils.getDoubleValue(list.get(list.size() - 1).getValue().getLowValue());
  if (minInd == maxInd) {
    lowValue = min;
  } else if (minInd < maxInd) {
    // left border is the min
    lowValue = (max - (max - min) * maxInd / (maxInd - minInd));
  } else {
    // right border is the min
    lowValue = (max - (max - min) * (rightBorderInd - maxInd) / (minInd - maxInd));
  }
  // get the highValue
  Collections.sort(list, new Comparator<Map.Entry<String, DecimalColumnStatsData>>() {
    public int compare(Map.Entry<String, DecimalColumnStatsData> o1, Map.Entry<String, DecimalColumnStatsData> o2) {
      return o1.getValue().getHighValue().compareTo(o2.getValue().getHighValue());
    }
  });
  minInd = adjustedIndexMap.get(list.get(0).getKey());
  maxInd = adjustedIndexMap.get(list.get(list.size() - 1).getKey());
  double highValue = 0;
  min = HBaseUtils.getDoubleValue(list.get(0).getValue().getHighValue());
  max = HBaseUtils.getDoubleValue(list.get(list.size() - 1).getValue().getHighValue());
  if (minInd == maxInd) {
    highValue = min;
  } else if (minInd < maxInd) {
    // right border is the max
    highValue = (min + (max - min) * (rightBorderInd - minInd) / (maxInd - minInd));
  } else {
    // left border is the max
    highValue = (min + (max - min) * minInd / (minInd - maxInd));
  }
  // get the #nulls
  long numNulls = 0;
  for (Map.Entry<String, DecimalColumnStatsData> entry : extractedAdjustedStatsMap.entrySet()) {
    numNulls += entry.getValue().getNumNulls();
  }
  // we scale up sumNulls based on the number of partitions
  numNulls = numNulls * numParts / numPartsWithStats;
  // get the ndv
  long ndv = 0;
  long ndvMin = 0;
  long ndvMax = 0;
  Collections.sort(list, new Comparator<Map.Entry<String, DecimalColumnStatsData>>() {
    public int compare(Map.Entry<String, DecimalColumnStatsData> o1, Map.Entry<String, DecimalColumnStatsData> o2) {
      return o1.getValue().getNumDVs() < o2.getValue().getNumDVs() ? -1 : 1;
    }
  });
  long lowerBound = list.get(list.size() - 1).getValue().getNumDVs();
  long higherBound = 0;
  for (Map.Entry<String, DecimalColumnStatsData> entry : list) {
    higherBound += entry.getValue().getNumDVs();
  }
  if (useDensityFunctionForNDVEstimation && densityAvg != 0.0) {
    ndv = (long) ((highValue - lowValue) / densityAvg);
    if (ndv < lowerBound) {
      ndv = lowerBound;
    } else if (ndv > higherBound) {
      ndv = higherBound;
    }
  } else {
    minInd = adjustedIndexMap.get(list.get(0).getKey());
    maxInd = adjustedIndexMap.get(list.get(list.size() - 1).getKey());
    ndvMin = list.get(0).getValue().getNumDVs();
    ndvMax = list.get(list.size() - 1).getValue().getNumDVs();
    if (minInd == maxInd) {
      ndv = ndvMin;
    } else if (minInd < maxInd) {
      // right border is the max
      ndv = (long) (ndvMin + (ndvMax - ndvMin) * (rightBorderInd - minInd) / (maxInd - minInd));
    } else {
      // left border is the max
      ndv = (long) (ndvMin + (ndvMax - ndvMin) * minInd / (minInd - maxInd));
    }
  }
  extrapolateDecimalData.setLowValue(StatObjectConverter.createThriftDecimal(String.valueOf(lowValue)));
  extrapolateDecimalData.setHighValue(StatObjectConverter.createThriftDecimal(String.valueOf(highValue)));
  extrapolateDecimalData.setNumNulls(numNulls);
  extrapolateDecimalData.setNumDVs(ndv);
  extrapolateData.setDecimalStats(extrapolateDecimalData);
}
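A worked example of the low-value branch: suppose numParts = 4 (so rightBorderInd = 4) and only the partitions at adjusted indexes 1 and 3 have stats, with low values 10.0 and 30.0. Sorting by lowValue gives min = 10.0 at minInd = 1 and max = 30.0 at maxInd = 3; since minInd < maxInd the minimum sits toward the left border, so lowValue = max - (max - min) * maxInd / (maxInd - minInd) = 30 - 20 * 3 / 2 = 0.0, i.e. the line through the two known points extended back to index 0. The highValue and NDV branches apply the same linear extrapolation, but toward the right border.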