use of org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj in project hive by apache.
the class DateColumnStatsMergerTest method testMergeNonNullAndNullLowerValuesNewIsNull.
@Test
public void testMergeNonNullAndNullLowerValuesNewIsNull() {
  ColumnStatisticsObj oldObj = new ColumnStatisticsObj();
  createData(oldObj, DATE_2, DATE_2);
  ColumnStatisticsObj newObj;

  newObj = new ColumnStatisticsObj();
  createData(newObj, DATE_3, DATE_3);
  merger.merge(oldObj, newObj);

  newObj = new ColumnStatisticsObj();
  createData(newObj, DATE_1, DATE_1);
  merger.merge(oldObj, newObj);

  Assert.assertEquals(DATE_1, oldObj.getStatsData().getDateStats().getLowValue());
  Assert.assertEquals(DATE_3, oldObj.getStatsData().getDateStats().getHighValue());
}
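The test pins down the merge rule for low/high values: a null side never overwrites a non-null one, and otherwise the smaller low and the larger high win. Below is a minimal sketch of that rule, not the actual DateColumnStatsMerger code, assuming the thrift Date type exposes a daysSinceEpoch constructor and getter; the day values 1, 2, 3 are illustrative stand-ins for the test's DATE_1/DATE_2/DATE_3 constants.

import org.apache.hadoop.hive.metastore.api.Date;

// Illustrative sketch of the low/high merge rule the test above exercises.
public class DateMinMaxMergeSketch {

  // Keep the smaller of two low values, treating null as "no information".
  static Date mergeLowValue(Date oldLow, Date newLow) {
    if (oldLow == null) return newLow;
    if (newLow == null) return oldLow;
    return oldLow.getDaysSinceEpoch() <= newLow.getDaysSinceEpoch() ? oldLow : newLow;
  }

  // Keep the larger of two high values, treating null as "no information".
  static Date mergeHighValue(Date oldHigh, Date newHigh) {
    if (oldHigh == null) return newHigh;
    if (newHigh == null) return oldHigh;
    return oldHigh.getDaysSinceEpoch() >= newHigh.getDaysSinceEpoch() ? oldHigh : newHigh;
  }

  public static void main(String[] args) {
    Date d1 = new Date(1); // stands in for DATE_1
    Date d2 = new Date(2); // stands in for DATE_2
    Date d3 = new Date(3); // stands in for DATE_3
    // Merging {2,2} with {3,3} and then {1,1} yields low = 1, high = 3,
    // matching the assertions in the test above.
    System.out.println(mergeLowValue(mergeLowValue(d2, d3), d1).getDaysSinceEpoch());   // 1
    System.out.println(mergeHighValue(mergeHighValue(d2, d3), d1).getDaysSinceEpoch()); // 3
  }
}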
use of org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj in project hive by apache.
the class DateColumnStatsMergerTest method testMergeNulls.
@Test
public void testMergeNulls() {
  ColumnStatisticsObj oldObj = new ColumnStatisticsObj();
  createData(oldObj, null, null);
  ColumnStatisticsObj newObj;

  newObj = new ColumnStatisticsObj();
  createData(newObj, null, null);
  merger.merge(oldObj, newObj);
  Assert.assertEquals(null, oldObj.getStatsData().getDateStats().getLowValue());
  Assert.assertEquals(null, oldObj.getStatsData().getDateStats().getHighValue());

  newObj = new ColumnStatisticsObj();
  createData(newObj, DATE_1, DATE_3);
  merger.merge(oldObj, newObj);

  newObj = new ColumnStatisticsObj();
  createData(newObj, null, null);
  merger.merge(oldObj, newObj);

  Assert.assertEquals(DATE_1, oldObj.getStatsData().getDateStats().getLowValue());
  Assert.assertEquals(DATE_3, oldObj.getStatsData().getDateStats().getHighValue());
}
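Both tests lean on a createData(...) helper that is not shown on this page. A plausible reconstruction follows, assuming the standard metastore thrift types (DateColumnStatsData inside the ColumnStatisticsData union); the real helper in DateColumnStatsMergerTest may differ, for instance by also populating numNulls and numDVs.

import org.apache.hadoop.hive.metastore.api.ColumnStatisticsData;
import org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj;
import org.apache.hadoop.hive.metastore.api.Date;
import org.apache.hadoop.hive.metastore.api.DateColumnStatsData;

class CreateDataSketch {
  // Hypothetical reconstruction of the test helper: wraps low/high Date values
  // in a DateColumnStatsData and attaches it to the ColumnStatisticsObj.
  static void createData(ColumnStatisticsObj obj, Date lowValue, Date highValue) {
    DateColumnStatsData dateStats = new DateColumnStatsData();
    if (lowValue != null) {
      dateStats.setLowValue(lowValue);
    }
    if (highValue != null) {
      dateStats.setHighValue(highValue);
    }
    ColumnStatisticsData data = new ColumnStatisticsData();
    data.setDateStats(dateStats);
    obj.setStatsData(data);
  }
}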
use of org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj in project hive by apache.
the class MetaStoreDirectSql method makeColumnStats.
private ColumnStatistics makeColumnStats(List<Object[]> list, ColumnStatisticsDesc csd,
    int offset, String engine) throws MetaException {
  ColumnStatistics result = new ColumnStatistics();
  result.setStatsDesc(csd);
  List<ColumnStatisticsObj> csos = new ArrayList<ColumnStatisticsObj>(list.size());
  for (Object[] row : list) {
    // LastAnalyzed is stored per column but thrift has it per several;
    // get the lowest for now as nobody actually uses this field.
    Object laObj = row[offset + 15];
    if (laObj != null && (!csd.isSetLastAnalyzed()
        || csd.getLastAnalyzed() > MetastoreDirectSqlUtils.extractSqlLong(laObj))) {
      csd.setLastAnalyzed(MetastoreDirectSqlUtils.extractSqlLong(laObj));
    }
    csos.add(prepareCSObj(row, offset));
    Deadline.checkTimeout();
  }
  result.setStatsObj(csos);
  result.setEngine(engine);
  return result;
}
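The loop's one subtlety is the lastAnalyzed handling: the table stores it per column, the thrift descriptor holds a single value, and the code resolves the mismatch by keeping the lowest. A minimal sketch isolating just that rule, with plain longs standing in for the SQL rows (the timestamps are illustrative):

import org.apache.hadoop.hive.metastore.api.ColumnStatisticsDesc;

public class LastAnalyzedSketch {
  public static void main(String[] args) {
    ColumnStatisticsDesc csd = new ColumnStatisticsDesc();
    long[] perColumnLastAnalyzed = { 1700000300L, 1700000100L, 1700000200L };
    for (long la : perColumnLastAnalyzed) {
      // Same condition as above: set when unset, or when a lower value shows up.
      if (!csd.isSetLastAnalyzed() || csd.getLastAnalyzed() > la) {
        csd.setLastAnalyzed(la);
      }
    }
    System.out.println(csd.getLastAnalyzed()); // prints 1700000100
  }
}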
use of org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj in project hive by apache.
the class MetaStoreDirectSql method aggrColStatsForPartitions.
public AggrStats aggrColStatsForPartitions(String catName, String dbName, String tableName,
    List<String> partNames, List<String> colNames, String engine,
    boolean useDensityFunctionForNDVEstimation, double ndvTuner, boolean enableBitVector)
    throws MetaException {
  if (colNames.isEmpty() || partNames.isEmpty()) {
    LOG.debug("Columns is empty or partNames is empty : Short-circuiting stats eval");
    // Nothing to aggregate
    return new AggrStats(Collections.<ColumnStatisticsObj>emptyList(), 0);
  }
  long partsFound = 0;
  List<ColumnStatisticsObj> colStatsList;
  // Try to read from the cache first
  if (isAggregateStatsCacheEnabled
      && (partNames.size() < aggrStatsCache.getMaxPartsPerCacheNode())) {
    AggrColStats colStatsAggrCached;
    List<ColumnStatisticsObj> colStatsAggrFromDB;
    int maxPartsPerCacheNode = aggrStatsCache.getMaxPartsPerCacheNode();
    double fpp = aggrStatsCache.getFalsePositiveProbability();
    colStatsList = new ArrayList<ColumnStatisticsObj>();
    // Bloom filter for the new node that we will eventually add to the cache
    BloomFilter bloomFilter = createPartsBloomFilter(maxPartsPerCacheNode, fpp, partNames);
    boolean computePartsFound = true;
    for (String colName : colNames) {
      // Check the cache first
      colStatsAggrCached = aggrStatsCache.get(catName, dbName, tableName, colName, partNames);
      if (colStatsAggrCached != null) {
        colStatsList.add(colStatsAggrCached.getColStats());
        partsFound = colStatsAggrCached.getNumPartsCached();
      } else {
        if (computePartsFound) {
          partsFound = partsFoundForPartitions(catName, dbName, tableName, partNames,
              colNames, engine);
          computePartsFound = false;
        }
        List<String> colNamesForDB = new ArrayList<>();
        colNamesForDB.add(colName);
        // Read aggregated stats for one column
        colStatsAggrFromDB = columnStatisticsObjForPartitions(catName, dbName, tableName,
            partNames, colNamesForDB, engine, partsFound, useDensityFunctionForNDVEstimation,
            ndvTuner, enableBitVector);
        if (!colStatsAggrFromDB.isEmpty()) {
          ColumnStatisticsObj colStatsAggr = colStatsAggrFromDB.get(0);
          colStatsList.add(colStatsAggr);
          // Update the cache to add this new aggregate node
          aggrStatsCache.add(catName, dbName, tableName, colName, partsFound, colStatsAggr,
              bloomFilter);
        }
      }
    }
  } else {
    partsFound = partsFoundForPartitions(catName, dbName, tableName, partNames, colNames, engine);
    colStatsList = columnStatisticsObjForPartitions(catName, dbName, tableName, partNames,
        colNames, engine, partsFound, useDensityFunctionForNDVEstimation, ndvTuner,
        enableBitVector);
  }
  LOG.debug("useDensityFunctionForNDVEstimation = " + useDensityFunctionForNDVEstimation
      + "\npartsFound = " + partsFound + "\nColumnStatisticsObj = "
      + Arrays.toString(colStatsList.toArray()));
  return new AggrStats(colStatsList, partsFound);
}
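The cached branch above is a per-column read-through pattern: look up an aggregated node for each column, and on a miss compute it from the database, return it, and insert it into the cache (tagged with a Bloom filter over the partition names). A generic sketch of that shape follows; the HashMap and method names are illustrative, not the metastore's AggregateStatsCache API.

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.function.Function;

// Generic read-through cache sketch mirroring the per-column loop above.
public class ReadThroughSketch<K, V> {
  private final Map<K, V> cache = new HashMap<>();

  public List<V> getAll(List<K> keys, Function<K, V> loadFromDb) {
    List<V> out = new ArrayList<>(keys.size());
    for (K key : keys) {
      V cached = cache.get(key);
      if (cached == null) {
        cached = loadFromDb.apply(key); // one DB aggregation per missing column
        cache.put(key, cached);         // add the new aggregate node
      }
      out.add(cached);
    }
    return out;
  }
}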
use of org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj in project hive by apache.
the class MetaStoreDirectSql method aggrStatsUseDB.
private List<ColumnStatisticsObj> aggrStatsUseDB(String catName, String dbName, String tableName,
    List<String> partNames, List<String> colNames, String engine, boolean areAllPartsFound,
    boolean useDensityFunctionForNDVEstimation, double ndvTuner) throws MetaException {
  // TODO: all the extrapolation logic should be moved out of this class;
  // only mechanical data retrieval should remain here.
  String commonPrefix = "select \"COLUMN_NAME\", \"COLUMN_TYPE\", "
      + "min(\"LONG_LOW_VALUE\"), max(\"LONG_HIGH_VALUE\"), "
      + "min(\"DOUBLE_LOW_VALUE\"), max(\"DOUBLE_HIGH_VALUE\"), "
      + "min(cast(\"BIG_DECIMAL_LOW_VALUE\" as decimal)), "
      + "max(cast(\"BIG_DECIMAL_HIGH_VALUE\" as decimal)), "
      + "sum(\"NUM_NULLS\"), max(\"NUM_DISTINCTS\"), "
      + "max(\"AVG_COL_LEN\"), max(\"MAX_COL_LEN\"), sum(\"NUM_TRUES\"), sum(\"NUM_FALSES\"), "
      // NDV estimation uses the average densities below
      // and LowerBound (calculated by "max(\"NUM_DISTINCTS\")")
      + "avg((\"LONG_HIGH_VALUE\"-\"LONG_LOW_VALUE\")/cast(\"NUM_DISTINCTS\" as decimal)),"
      + "avg((\"DOUBLE_HIGH_VALUE\"-\"DOUBLE_LOW_VALUE\")/\"NUM_DISTINCTS\"),"
      + "avg((cast(\"BIG_DECIMAL_HIGH_VALUE\" as decimal)-cast(\"BIG_DECIMAL_LOW_VALUE\" as decimal))/\"NUM_DISTINCTS\"),"
      + "sum(\"NUM_DISTINCTS\")"
      + " from " + PART_COL_STATS
      + " where \"CAT_NAME\" = ? and \"DB_NAME\" = ? and \"TABLE_NAME\" = ? ";
  String queryText = null;
  long start = 0;
  long end = 0;
  boolean doTrace = LOG.isDebugEnabled();
  ForwardQueryResult<?> fqr = null;
  // Extrapolation is not needed.
  if (areAllPartsFound) {
    queryText = commonPrefix
        + " and \"COLUMN_NAME\" in (" + makeParams(colNames.size()) + ")"
        + " and \"PARTITION_NAME\" in (" + makeParams(partNames.size()) + ")"
        + " and \"ENGINE\" = ? "
        + " group by \"COLUMN_NAME\", \"COLUMN_TYPE\"";
    start = doTrace ? System.nanoTime() : 0;
    try (QueryWrapper query = new QueryWrapper(pm.newQuery("javax.jdo.query.SQL", queryText))) {
      Object qResult = executeWithArray(query,
          prepareParams(catName, dbName, tableName, partNames, colNames, engine), queryText);
      if (qResult == null) {
        return Collections.emptyList();
      }
      end = doTrace ? System.nanoTime() : 0;
      MetastoreDirectSqlUtils.timingTrace(doTrace, queryText, start, end);
      List<Object[]> list = MetastoreDirectSqlUtils.ensureList(qResult);
      List<ColumnStatisticsObj> colStats = new ArrayList<ColumnStatisticsObj>(list.size());
      for (Object[] row : list) {
        colStats.add(prepareCSObjWithAdjustedNDV(row, 0, useDensityFunctionForNDVEstimation, ndvTuner));
        Deadline.checkTimeout();
      }
      return colStats;
    }
  } else {
    // Extrapolation is needed for some columns.
    // In this case, the stats for at least one column are missing in some partition;
    // we extrapolate that partition from the partitions that do have stats.
    List<ColumnStatisticsObj> colStats = new ArrayList<ColumnStatisticsObj>(colNames.size());
    queryText = "select \"COLUMN_NAME\", \"COLUMN_TYPE\", count(\"PARTITION_NAME\") "
        + " from " + PART_COL_STATS
        + " where \"CAT_NAME\" = ? and \"DB_NAME\" = ? and \"TABLE_NAME\" = ? "
        + " and \"COLUMN_NAME\" in (" + makeParams(colNames.size()) + ")"
        + " and \"PARTITION_NAME\" in (" + makeParams(partNames.size()) + ")"
        + " and \"ENGINE\" = ? "
        + " group by \"COLUMN_NAME\", \"COLUMN_TYPE\"";
    start = doTrace ? System.nanoTime() : 0;
    List<String> noExtraColumnNames = new ArrayList<String>();
    Map<String, String[]> extraColumnNameTypeParts = new HashMap<String, String[]>();
    try (QueryWrapper query = new QueryWrapper(pm.newQuery("javax.jdo.query.SQL", queryText))) {
      Object qResult = executeWithArray(query,
          prepareParams(catName, dbName, tableName, partNames, colNames, engine), queryText);
      end = doTrace ? System.nanoTime() : 0;
      MetastoreDirectSqlUtils.timingTrace(doTrace, queryText, start, end);
      if (qResult == null) {
        return Collections.emptyList();
      }
      List<Object[]> list = MetastoreDirectSqlUtils.ensureList(qResult);
      for (Object[] row : list) {
        String colName = (String) row[0];
        String colType = (String) row[1];
        // Extrapolation is not needed for this column if
        // count("PARTITION_NAME") == partNames.size();
        // extrapolation is not possible for this column if
        // count("PARTITION_NAME") < 2.
        Long count = MetastoreDirectSqlUtils.extractSqlLong(row[2]);
        if (count == partNames.size() || count < 2) {
          noExtraColumnNames.add(colName);
        } else {
          extraColumnNameTypeParts.put(colName, new String[] { colType, String.valueOf(count) });
        }
        Deadline.checkTimeout();
      }
    }
    // Extrapolation is not needed for the columns in noExtraColumnNames.
    List<Object[]> list;
    if (noExtraColumnNames.size() != 0) {
      queryText = commonPrefix
          + " and \"COLUMN_NAME\" in (" + makeParams(noExtraColumnNames.size()) + ")"
          + " and \"PARTITION_NAME\" in (" + makeParams(partNames.size()) + ")"
          + " and \"ENGINE\" = ? "
          + " group by \"COLUMN_NAME\", \"COLUMN_TYPE\"";
      start = doTrace ? System.nanoTime() : 0;
      try (QueryWrapper query = new QueryWrapper(pm.newQuery("javax.jdo.query.SQL", queryText))) {
        Object qResult = executeWithArray(query,
            prepareParams(catName, dbName, tableName, partNames, noExtraColumnNames, engine), queryText);
        if (qResult == null) {
          return Collections.emptyList();
        }
        list = MetastoreDirectSqlUtils.ensureList(qResult);
        for (Object[] row : list) {
          colStats.add(prepareCSObjWithAdjustedNDV(row, 0, useDensityFunctionForNDVEstimation, ndvTuner));
          Deadline.checkTimeout();
        }
        end = doTrace ? System.nanoTime() : 0;
        MetastoreDirectSqlUtils.timingTrace(doTrace, queryText, start, end);
      }
    }
    // Give a sequence number to all the partitions.
    if (extraColumnNameTypeParts.size() != 0) {
      Map<String, Integer> indexMap = new HashMap<String, Integer>();
      for (int index = 0; index < partNames.size(); index++) {
        indexMap.put(partNames.get(index), index);
      }
      // Get the sums for all columns at once to reduce the number of queries.
      Map<String, Map<Integer, Object>> sumMap = new HashMap<String, Map<Integer, Object>>();
      queryText = "select \"COLUMN_NAME\", sum(\"NUM_NULLS\"), sum(\"NUM_TRUES\"), sum(\"NUM_FALSES\"), sum(\"NUM_DISTINCTS\")"
          + " from " + PART_COL_STATS
          + " where \"CAT_NAME\" = ? and \"DB_NAME\" = ? and \"TABLE_NAME\" = ? "
          + " and \"COLUMN_NAME\" in (" + makeParams(extraColumnNameTypeParts.size()) + ")"
          + " and \"PARTITION_NAME\" in (" + makeParams(partNames.size()) + ")"
          + " and \"ENGINE\" = ? "
          + " group by \"COLUMN_NAME\"";
      start = doTrace ? System.nanoTime() : 0;
      try (QueryWrapper query = new QueryWrapper(pm.newQuery("javax.jdo.query.SQL", queryText))) {
        List<String> extraColumnNames = new ArrayList<String>();
        extraColumnNames.addAll(extraColumnNameTypeParts.keySet());
        Object qResult = executeWithArray(query,
            prepareParams(catName, dbName, tableName, partNames, extraColumnNames, engine), queryText);
        if (qResult == null) {
          return Collections.emptyList();
        }
        list = MetastoreDirectSqlUtils.ensureList(qResult);
        // See the indexes for colstats in IExtrapolatePartStatus.
        Integer[] sumIndex = new Integer[] { 6, 10, 11, 15 };
        for (Object[] row : list) {
          Map<Integer, Object> indexToObject = new HashMap<Integer, Object>();
          for (int ind = 1; ind < row.length; ind++) {
            indexToObject.put(sumIndex[ind - 1], row[ind]);
          }
          // row[0] is the column name
          sumMap.put((String) row[0], indexToObject);
          Deadline.checkTimeout();
        }
        end = doTrace ? System.nanoTime() : 0;
        MetastoreDirectSqlUtils.timingTrace(doTrace, queryText, start, end);
      }
      for (Map.Entry<String, String[]> entry : extraColumnNameTypeParts.entrySet()) {
        Object[] row = new Object[IExtrapolatePartStatus.colStatNames.length + 2];
        String colName = entry.getKey();
        String colType = entry.getValue()[0];
        Long sumVal = Long.parseLong(entry.getValue()[1]);
        // Fill in the column name.
        row[0] = colName;
        // Fill in the column type.
        row[1] = colType;
        // Use linear extrapolation; a more sophisticated method can be added in the future.
        IExtrapolatePartStatus extrapolateMethod = new LinearExtrapolatePartStatus();
        // Fill in the column stats.
        Integer[] index = null;
        boolean decimal = false;
        if (colType.toLowerCase().startsWith("decimal")) {
          index = IExtrapolatePartStatus.indexMaps.get("decimal");
          decimal = true;
        } else {
          index = IExtrapolatePartStatus.indexMaps.get(colType.toLowerCase());
        }
        // If the colType is not one of the known types, get all indexes.
        if (index == null) {
          index = IExtrapolatePartStatus.indexMaps.get("default");
        }
        for (int colStatIndex : index) {
          String colStatName = IExtrapolatePartStatus.colStatNames[colStatIndex];
          // If the aggregation type is sum, we do a scale-up.
          if (IExtrapolatePartStatus.aggrTypes[colStatIndex] == IExtrapolatePartStatus.AggrType.Sum) {
            Object o = sumMap.get(colName).get(colStatIndex);
            if (o == null) {
              row[2 + colStatIndex] = null;
            } else {
              Long val = MetastoreDirectSqlUtils.extractSqlLong(o);
              row[2 + colStatIndex] = val / sumVal * (partNames.size());
            }
          } else if (IExtrapolatePartStatus.aggrTypes[colStatIndex] == IExtrapolatePartStatus.AggrType.Min
              || IExtrapolatePartStatus.aggrTypes[colStatIndex] == IExtrapolatePartStatus.AggrType.Max) {
            // If the aggregation type is min/max, read the left/right borders of the ordered values.
            if (!decimal) {
              queryText = "select \"" + colStatName + "\",\"PARTITION_NAME\" from " + PART_COL_STATS
                  + " where \"CAT_NAME\" = ? and \"DB_NAME\" = ? and \"TABLE_NAME\" = ?"
                  + " and \"COLUMN_NAME\" = ?"
                  + " and \"PARTITION_NAME\" in (" + makeParams(partNames.size()) + ")"
                  + " and \"ENGINE\" = ? "
                  + " order by \"" + colStatName + "\"";
            } else {
              queryText = "select \"" + colStatName + "\",\"PARTITION_NAME\" from " + PART_COL_STATS
                  + " where \"CAT_NAME\" = ? and \"DB_NAME\" = ? and \"TABLE_NAME\" = ?"
                  + " and \"COLUMN_NAME\" = ?"
                  + " and \"PARTITION_NAME\" in (" + makeParams(partNames.size()) + ")"
                  + " and \"ENGINE\" = ? "
                  + " order by cast(\"" + colStatName + "\" as decimal)";
            }
            start = doTrace ? System.nanoTime() : 0;
            try (QueryWrapper query = new QueryWrapper(pm.newQuery("javax.jdo.query.SQL", queryText))) {
              Object qResult = executeWithArray(query,
                  prepareParams(catName, dbName, tableName, partNames, Arrays.asList(colName), engine), queryText);
              if (qResult == null) {
                return Collections.emptyList();
              }
              fqr = (ForwardQueryResult<?>) qResult;
              Object[] min = (Object[]) (fqr.get(0));
              Object[] max = (Object[]) (fqr.get(fqr.size() - 1));
              end = doTrace ? System.nanoTime() : 0;
              MetastoreDirectSqlUtils.timingTrace(doTrace, queryText, start, end);
              if (min[0] == null || max[0] == null) {
                row[2 + colStatIndex] = null;
              } else {
                row[2 + colStatIndex] = extrapolateMethod.extrapolate(min, max, colStatIndex, indexMap);
              }
            }
          } else {
            // If the aggregation type is avg, use the average of the existing values.
            queryText = "select "
                + "avg((\"LONG_HIGH_VALUE\"-\"LONG_LOW_VALUE\")/cast(\"NUM_DISTINCTS\" as decimal)),"
                + "avg((\"DOUBLE_HIGH_VALUE\"-\"DOUBLE_LOW_VALUE\")/\"NUM_DISTINCTS\"),"
                + "avg((cast(\"BIG_DECIMAL_HIGH_VALUE\" as decimal)-cast(\"BIG_DECIMAL_LOW_VALUE\" as decimal))/\"NUM_DISTINCTS\")"
                + " from " + PART_COL_STATS
                + " where \"CAT_NAME\" = ? and \"DB_NAME\" = ? and \"TABLE_NAME\" = ?"
                + " and \"COLUMN_NAME\" = ?"
                + " and \"PARTITION_NAME\" in (" + makeParams(partNames.size()) + ")"
                + " and \"ENGINE\" = ? "
                + " group by \"COLUMN_NAME\"";
            start = doTrace ? System.nanoTime() : 0;
            try (QueryWrapper query = new QueryWrapper(pm.newQuery("javax.jdo.query.SQL", queryText))) {
              Object qResult = executeWithArray(query,
                  prepareParams(catName, dbName, tableName, partNames, Arrays.asList(colName), engine), queryText);
              if (qResult == null) {
                return Collections.emptyList();
              }
              fqr = (ForwardQueryResult<?>) qResult;
              Object[] avg = (Object[]) (fqr.get(0));
              // colStatIndex 12, 13, 14 correspond to "AVG_LONG", "AVG_DOUBLE", "AVG_DECIMAL".
              row[2 + colStatIndex] = avg[colStatIndex - 12];
              end = doTrace ? System.nanoTime() : 0;
              MetastoreDirectSqlUtils.timingTrace(doTrace, queryText, start, end);
            }
          }
        }
        colStats.add(prepareCSObjWithAdjustedNDV(row, 0, useDensityFunctionForNDVEstimation, ndvTuner));
        Deadline.checkTimeout();
      }
    }
    return colStats;
  }
}
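For Min/Max stats, the extrapolation path above orders the partitions that do have stats by the stat value, takes the two border rows, and hands them to LinearExtrapolatePartStatus together with the partition sequence numbers. The following is a simplified sketch of the linear idea only, not the project's exact formula: treat the observed min and max as two points over the partition indexes and extend that line to the full partition range. All names and values are illustrative.

public class LinearExtrapolationSketch {
  // minVal observed at partition index minInd, maxVal at maxInd,
  // numParts partitions in total (indexes 0 .. numParts - 1).
  static double extrapolateHigh(double minVal, int minInd, double maxVal, int maxInd, int numParts) {
    if (minInd == maxInd) {
      return maxVal; // a single observed point gives no slope to extend
    }
    double slope = (maxVal - minVal) / (maxInd - minInd);
    // Extend from the observed max out to the far end of the partition range.
    int farEnd = (maxInd > minInd) ? (numParts - 1) : 0;
    return maxVal + slope * (farEnd - maxInd);
  }

  public static void main(String[] args) {
    // 10 partitions; only indexes 2 and 6 have stats, with high values 100 and 180.
    // Slope 20 per partition, extended 3 more partitions: 180 + 3 * 20 = 240.
    System.out.println(extrapolateHigh(100, 2, 180, 6, 10)); // 240.0
  }
}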