use of org.apache.hadoop.hive.metastore.columnstats.cache.DateColumnStatsDataInspector in project hive by apache.
the class ColumnStatsUpdateTask method constructColumnStatsFromInput.
private ColumnStatistics constructColumnStatsFromInput() throws SemanticException, MetaException {
// If we are replicating the stats, we don't need to construct those again.
if (work.getColStats() != null) {
ColumnStatistics colStats = work.getColStats();
LOG.debug("Got stats through replication for " + colStats.getStatsDesc().getDbName() + "." + colStats.getStatsDesc().getTableName());
return colStats;
}
String dbName = work.dbName();
String tableName = work.getTableName();
String partName = work.getPartName();
String colName = work.getColName();
String columnType = work.getColType();
ColumnStatisticsObj statsObj = new ColumnStatisticsObj();
// grammar prohibits more than 1 column so we are guaranteed to have only 1
// element in this lists.
statsObj.setColName(colName);
statsObj.setColType(columnType);
ColumnStatisticsData statsData = new ColumnStatisticsData();
if (columnType.equalsIgnoreCase("long") || columnType.equalsIgnoreCase("tinyint") || columnType.equalsIgnoreCase("smallint") || columnType.equalsIgnoreCase("int") || columnType.equalsIgnoreCase("bigint")) {
LongColumnStatsDataInspector longStats = new LongColumnStatsDataInspector();
longStats.setNumNullsIsSet(false);
longStats.setNumDVsIsSet(false);
longStats.setLowValueIsSet(false);
longStats.setHighValueIsSet(false);
Map<String, String> mapProp = work.getMapProp();
for (Entry<String, String> entry : mapProp.entrySet()) {
String fName = entry.getKey();
String value = entry.getValue();
if (fName.equals("numNulls")) {
longStats.setNumNulls(Long.parseLong(value));
} else if (fName.equals("numDVs")) {
longStats.setNumDVs(Long.parseLong(value));
} else if (fName.equals("lowValue")) {
longStats.setLowValue(Long.parseLong(value));
} else if (fName.equals("highValue")) {
longStats.setHighValue(Long.parseLong(value));
} else {
throw new SemanticException("Unknown stat");
}
}
statsData.setLongStats(longStats);
statsObj.setStatsData(statsData);
} else if (columnType.equalsIgnoreCase("double") || columnType.equalsIgnoreCase("float")) {
DoubleColumnStatsDataInspector doubleStats = new DoubleColumnStatsDataInspector();
doubleStats.setNumNullsIsSet(false);
doubleStats.setNumDVsIsSet(false);
doubleStats.setLowValueIsSet(false);
doubleStats.setHighValueIsSet(false);
Map<String, String> mapProp = work.getMapProp();
for (Entry<String, String> entry : mapProp.entrySet()) {
String fName = entry.getKey();
String value = entry.getValue();
if (fName.equals("numNulls")) {
doubleStats.setNumNulls(Long.parseLong(value));
} else if (fName.equals("numDVs")) {
doubleStats.setNumDVs(Long.parseLong(value));
} else if (fName.equals("lowValue")) {
doubleStats.setLowValue(Double.parseDouble(value));
} else if (fName.equals("highValue")) {
doubleStats.setHighValue(Double.parseDouble(value));
} else {
throw new SemanticException("Unknown stat");
}
}
statsData.setDoubleStats(doubleStats);
statsObj.setStatsData(statsData);
} else if (columnType.equalsIgnoreCase("string") || columnType.toLowerCase().startsWith("char") || columnType.toLowerCase().startsWith("varchar")) {
// char(x),varchar(x) types
StringColumnStatsDataInspector stringStats = new StringColumnStatsDataInspector();
stringStats.setMaxColLenIsSet(false);
stringStats.setAvgColLenIsSet(false);
stringStats.setNumNullsIsSet(false);
stringStats.setNumDVsIsSet(false);
Map<String, String> mapProp = work.getMapProp();
for (Entry<String, String> entry : mapProp.entrySet()) {
String fName = entry.getKey();
String value = entry.getValue();
if (fName.equals("numNulls")) {
stringStats.setNumNulls(Long.parseLong(value));
} else if (fName.equals("numDVs")) {
stringStats.setNumDVs(Long.parseLong(value));
} else if (fName.equals("avgColLen")) {
stringStats.setAvgColLen(Double.parseDouble(value));
} else if (fName.equals("maxColLen")) {
stringStats.setMaxColLen(Long.parseLong(value));
} else {
throw new SemanticException("Unknown stat");
}
}
statsData.setStringStats(stringStats);
statsObj.setStatsData(statsData);
} else if (columnType.equalsIgnoreCase("boolean")) {
BooleanColumnStatsData booleanStats = new BooleanColumnStatsData();
booleanStats.setNumNullsIsSet(false);
booleanStats.setNumTruesIsSet(false);
booleanStats.setNumFalsesIsSet(false);
Map<String, String> mapProp = work.getMapProp();
for (Entry<String, String> entry : mapProp.entrySet()) {
String fName = entry.getKey();
String value = entry.getValue();
if (fName.equals("numNulls")) {
booleanStats.setNumNulls(Long.parseLong(value));
} else if (fName.equals("numTrues")) {
booleanStats.setNumTrues(Long.parseLong(value));
} else if (fName.equals("numFalses")) {
booleanStats.setNumFalses(Long.parseLong(value));
} else {
throw new SemanticException("Unknown stat");
}
}
statsData.setBooleanStats(booleanStats);
statsObj.setStatsData(statsData);
} else if (columnType.equalsIgnoreCase("binary")) {
BinaryColumnStatsData binaryStats = new BinaryColumnStatsData();
binaryStats.setNumNullsIsSet(false);
binaryStats.setAvgColLenIsSet(false);
binaryStats.setMaxColLenIsSet(false);
Map<String, String> mapProp = work.getMapProp();
for (Entry<String, String> entry : mapProp.entrySet()) {
String fName = entry.getKey();
String value = entry.getValue();
if (fName.equals("numNulls")) {
binaryStats.setNumNulls(Long.parseLong(value));
} else if (fName.equals("avgColLen")) {
binaryStats.setAvgColLen(Double.parseDouble(value));
} else if (fName.equals("maxColLen")) {
binaryStats.setMaxColLen(Long.parseLong(value));
} else {
throw new SemanticException("Unknown stat");
}
}
statsData.setBinaryStats(binaryStats);
statsObj.setStatsData(statsData);
} else if (columnType.toLowerCase().startsWith("decimal")) {
// decimal(a,b) type
DecimalColumnStatsDataInspector decimalStats = new DecimalColumnStatsDataInspector();
decimalStats.setNumNullsIsSet(false);
decimalStats.setNumDVsIsSet(false);
decimalStats.setLowValueIsSet(false);
decimalStats.setHighValueIsSet(false);
Map<String, String> mapProp = work.getMapProp();
for (Entry<String, String> entry : mapProp.entrySet()) {
String fName = entry.getKey();
String value = entry.getValue();
if (fName.equals("numNulls")) {
decimalStats.setNumNulls(Long.parseLong(value));
} else if (fName.equals("numDVs")) {
decimalStats.setNumDVs(Long.parseLong(value));
} else if (fName.equals("lowValue")) {
BigDecimal d = new BigDecimal(value);
decimalStats.setLowValue(DecimalUtils.getDecimal(ByteBuffer.wrap(d.unscaledValue().toByteArray()), (short) d.scale()));
} else if (fName.equals("highValue")) {
BigDecimal d = new BigDecimal(value);
decimalStats.setHighValue(DecimalUtils.getDecimal(ByteBuffer.wrap(d.unscaledValue().toByteArray()), (short) d.scale()));
} else {
throw new SemanticException("Unknown stat");
}
}
statsData.setDecimalStats(decimalStats);
statsObj.setStatsData(statsData);
} else if (columnType.equalsIgnoreCase("date")) {
DateColumnStatsDataInspector dateStats = new DateColumnStatsDataInspector();
Map<String, String> mapProp = work.getMapProp();
for (Entry<String, String> entry : mapProp.entrySet()) {
String fName = entry.getKey();
String value = entry.getValue();
if (fName.equals("numNulls")) {
dateStats.setNumNulls(Long.parseLong(value));
} else if (fName.equals("numDVs")) {
dateStats.setNumDVs(Long.parseLong(value));
} else if (fName.equals("lowValue")) {
// Date high/low value is stored as long in stats DB, but allow users to set high/low
// value using either date format (yyyy-mm-dd) or numeric format (days since epoch)
dateStats.setLowValue(readDateValue(value));
} else if (fName.equals("highValue")) {
dateStats.setHighValue(readDateValue(value));
} else {
throw new SemanticException("Unknown stat");
}
}
statsData.setDateStats(dateStats);
statsObj.setStatsData(statsData);
} else if (columnType.equalsIgnoreCase("timestamp")) {
TimestampColumnStatsDataInspector timestampStats = new TimestampColumnStatsDataInspector();
Map<String, String> mapProp = work.getMapProp();
for (Entry<String, String> entry : mapProp.entrySet()) {
String fName = entry.getKey();
String value = entry.getValue();
if (fName.equals("numNulls")) {
timestampStats.setNumNulls(Long.parseLong(value));
} else if (fName.equals("numDVs")) {
timestampStats.setNumDVs(Long.parseLong(value));
} else if (fName.equals("lowValue")) {
timestampStats.setLowValue(readTimestampValue(value));
} else if (fName.equals("highValue")) {
timestampStats.setHighValue(readTimestampValue(value));
} else {
throw new SemanticException("Unknown stat");
}
}
statsData.setTimestampStats(timestampStats);
statsObj.setStatsData(statsData);
} else {
throw new SemanticException("Unsupported type");
}
ColumnStatisticsDesc statsDesc = getColumnStatsDesc(dbName, tableName, partName, partName == null);
ColumnStatistics colStat = new ColumnStatistics();
colStat.setStatsDesc(statsDesc);
colStat.addToStatsObj(statsObj);
colStat.setEngine(Constants.HIVE_ENGINE);
return colStat;
}
use of org.apache.hadoop.hive.metastore.columnstats.cache.DateColumnStatsDataInspector in project hive by apache.
the class StatObjectConverter method getTableColumnStatisticsObj.
public static ColumnStatisticsObj getTableColumnStatisticsObj(MTableColumnStatistics mStatsObj, boolean enableBitVector) {
ColumnStatisticsObj statsObj = new ColumnStatisticsObj();
statsObj.setColType(mStatsObj.getColType());
statsObj.setColName(mStatsObj.getColName());
String colType = mStatsObj.getColType().toLowerCase();
ColumnStatisticsData colStatsData = new ColumnStatisticsData();
if (colType.equals("boolean")) {
BooleanColumnStatsData boolStats = new BooleanColumnStatsData();
boolStats.setNumFalses(mStatsObj.getNumFalses());
boolStats.setNumTrues(mStatsObj.getNumTrues());
boolStats.setNumNulls(mStatsObj.getNumNulls());
colStatsData.setBooleanStats(boolStats);
} else if (colType.equals("string") || colType.startsWith("varchar") || colType.startsWith("char")) {
StringColumnStatsDataInspector stringStats = new StringColumnStatsDataInspector();
stringStats.setNumNulls(mStatsObj.getNumNulls());
stringStats.setAvgColLen(mStatsObj.getAvgColLen());
stringStats.setMaxColLen(mStatsObj.getMaxColLen());
stringStats.setNumDVs(mStatsObj.getNumDVs());
stringStats.setBitVectors((mStatsObj.getBitVector() == null || !enableBitVector) ? null : mStatsObj.getBitVector());
colStatsData.setStringStats(stringStats);
} else if (colType.equals("binary")) {
BinaryColumnStatsData binaryStats = new BinaryColumnStatsData();
binaryStats.setNumNulls(mStatsObj.getNumNulls());
binaryStats.setAvgColLen(mStatsObj.getAvgColLen());
binaryStats.setMaxColLen(mStatsObj.getMaxColLen());
colStatsData.setBinaryStats(binaryStats);
} else if (colType.equals("bigint") || colType.equals("int") || colType.equals("smallint") || colType.equals("tinyint")) {
LongColumnStatsDataInspector longStats = new LongColumnStatsDataInspector();
longStats.setNumNulls(mStatsObj.getNumNulls());
Long longHighValue = mStatsObj.getLongHighValue();
if (longHighValue != null) {
longStats.setHighValue(longHighValue);
}
Long longLowValue = mStatsObj.getLongLowValue();
if (longLowValue != null) {
longStats.setLowValue(longLowValue);
}
longStats.setNumDVs(mStatsObj.getNumDVs());
longStats.setBitVectors((mStatsObj.getBitVector() == null || !enableBitVector) ? null : mStatsObj.getBitVector());
colStatsData.setLongStats(longStats);
} else if (colType.equals("double") || colType.equals("float")) {
DoubleColumnStatsDataInspector doubleStats = new DoubleColumnStatsDataInspector();
doubleStats.setNumNulls(mStatsObj.getNumNulls());
Double doubleHighValue = mStatsObj.getDoubleHighValue();
if (doubleHighValue != null) {
doubleStats.setHighValue(doubleHighValue);
}
Double doubleLowValue = mStatsObj.getDoubleLowValue();
if (doubleLowValue != null) {
doubleStats.setLowValue(doubleLowValue);
}
doubleStats.setNumDVs(mStatsObj.getNumDVs());
doubleStats.setBitVectors((mStatsObj.getBitVector() == null || !enableBitVector) ? null : mStatsObj.getBitVector());
colStatsData.setDoubleStats(doubleStats);
} else if (colType.startsWith("decimal")) {
DecimalColumnStatsDataInspector decimalStats = new DecimalColumnStatsDataInspector();
decimalStats.setNumNulls(mStatsObj.getNumNulls());
String decimalHighValue = mStatsObj.getDecimalHighValue();
if (decimalHighValue != null) {
decimalStats.setHighValue(DecimalUtils.createThriftDecimal(decimalHighValue));
}
String decimalLowValue = mStatsObj.getDecimalLowValue();
if (decimalLowValue != null) {
decimalStats.setLowValue(DecimalUtils.createThriftDecimal(decimalLowValue));
}
decimalStats.setNumDVs(mStatsObj.getNumDVs());
decimalStats.setBitVectors((mStatsObj.getBitVector() == null || !enableBitVector) ? null : mStatsObj.getBitVector());
colStatsData.setDecimalStats(decimalStats);
} else if (colType.equals("date")) {
DateColumnStatsDataInspector dateStats = new DateColumnStatsDataInspector();
dateStats.setNumNulls(mStatsObj.getNumNulls());
Long highValue = mStatsObj.getLongHighValue();
if (highValue != null) {
dateStats.setHighValue(new Date(highValue));
}
Long lowValue = mStatsObj.getLongLowValue();
if (lowValue != null) {
dateStats.setLowValue(new Date(lowValue));
}
dateStats.setNumDVs(mStatsObj.getNumDVs());
dateStats.setBitVectors((mStatsObj.getBitVector() == null || !enableBitVector) ? null : mStatsObj.getBitVector());
colStatsData.setDateStats(dateStats);
} else if (colType.equals("timestamp")) {
TimestampColumnStatsDataInspector timestampStats = new TimestampColumnStatsDataInspector();
timestampStats.setNumNulls(mStatsObj.getNumNulls());
Long highValue = mStatsObj.getLongHighValue();
if (highValue != null) {
timestampStats.setHighValue(new Timestamp(highValue));
}
Long lowValue = mStatsObj.getLongLowValue();
if (lowValue != null) {
timestampStats.setLowValue(new Timestamp(lowValue));
}
timestampStats.setNumDVs(mStatsObj.getNumDVs());
timestampStats.setBitVectors((mStatsObj.getBitVector() == null || !enableBitVector) ? null : mStatsObj.getBitVector());
colStatsData.setTimestampStats(timestampStats);
}
statsObj.setStatsData(colStatsData);
return statsObj;
}
use of org.apache.hadoop.hive.metastore.columnstats.cache.DateColumnStatsDataInspector in project hive by apache.
the class StatObjectConverter method fillColumnStatisticsData.
// DB
public static void fillColumnStatisticsData(String colType, ColumnStatisticsData data, Object llow, Object lhigh, Object dlow, Object dhigh, Object declow, Object dechigh, Object nulls, Object dist, Object avglen, Object maxlen, Object trues, Object falses, Object avgLong, Object avgDouble, Object avgDecimal, Object sumDist, boolean useDensityFunctionForNDVEstimation, double ndvTuner) throws MetaException {
colType = colType.toLowerCase();
if (colType.equals("boolean")) {
BooleanColumnStatsData boolStats = new BooleanColumnStatsData();
boolStats.setNumFalses(MetastoreDirectSqlUtils.extractSqlLong(falses));
boolStats.setNumTrues(MetastoreDirectSqlUtils.extractSqlLong(trues));
boolStats.setNumNulls(MetastoreDirectSqlUtils.extractSqlLong(nulls));
data.setBooleanStats(boolStats);
} else if (colType.equals("string") || colType.startsWith("varchar") || colType.startsWith("char")) {
StringColumnStatsDataInspector stringStats = new StringColumnStatsDataInspector();
stringStats.setNumNulls(MetastoreDirectSqlUtils.extractSqlLong(nulls));
stringStats.setAvgColLen(MetastoreDirectSqlUtils.extractSqlDouble(avglen));
stringStats.setMaxColLen(MetastoreDirectSqlUtils.extractSqlLong(maxlen));
stringStats.setNumDVs(MetastoreDirectSqlUtils.extractSqlLong(dist));
data.setStringStats(stringStats);
} else if (colType.equals("binary")) {
BinaryColumnStatsData binaryStats = new BinaryColumnStatsData();
binaryStats.setNumNulls(MetastoreDirectSqlUtils.extractSqlLong(nulls));
binaryStats.setAvgColLen(MetastoreDirectSqlUtils.extractSqlDouble(avglen));
binaryStats.setMaxColLen(MetastoreDirectSqlUtils.extractSqlLong(maxlen));
data.setBinaryStats(binaryStats);
} else if (colType.equals("bigint") || colType.equals("int") || colType.equals("smallint") || colType.equals("tinyint")) {
LongColumnStatsDataInspector longStats = new LongColumnStatsDataInspector();
longStats.setNumNulls(MetastoreDirectSqlUtils.extractSqlLong(nulls));
if (lhigh != null) {
longStats.setHighValue(MetastoreDirectSqlUtils.extractSqlLong(lhigh));
}
if (llow != null) {
longStats.setLowValue(MetastoreDirectSqlUtils.extractSqlLong(llow));
}
long lowerBound = MetastoreDirectSqlUtils.extractSqlLong(dist);
long higherBound = MetastoreDirectSqlUtils.extractSqlLong(sumDist);
long rangeBound = Long.MAX_VALUE;
if (lhigh != null && llow != null) {
rangeBound = MetastoreDirectSqlUtils.extractSqlLong(lhigh) - MetastoreDirectSqlUtils.extractSqlLong(llow) + 1;
}
long estimation;
if (useDensityFunctionForNDVEstimation && lhigh != null && llow != null && avgLong != null && MetastoreDirectSqlUtils.extractSqlDouble(avgLong) != 0.0) {
// We have estimation, lowerbound and higherbound. We use estimation if
// it is between lowerbound and higherbound.
estimation = MetastoreDirectSqlUtils.extractSqlLong((MetastoreDirectSqlUtils.extractSqlLong(lhigh) - MetastoreDirectSqlUtils.extractSqlLong(llow)) / MetastoreDirectSqlUtils.extractSqlDouble(avgLong));
if (estimation < lowerBound) {
estimation = lowerBound;
} else if (estimation > higherBound) {
estimation = higherBound;
}
} else {
estimation = (long) (lowerBound + (higherBound - lowerBound) * ndvTuner);
}
estimation = Math.min(estimation, rangeBound);
longStats.setNumDVs(estimation);
data.setLongStats(longStats);
} else if (colType.equals("date")) {
DateColumnStatsDataInspector dateStats = new DateColumnStatsDataInspector();
dateStats.setNumNulls(MetastoreDirectSqlUtils.extractSqlLong(nulls));
if (lhigh != null) {
dateStats.setHighValue(new Date(MetastoreDirectSqlUtils.extractSqlLong(lhigh)));
}
if (llow != null) {
dateStats.setLowValue(new Date(MetastoreDirectSqlUtils.extractSqlLong(llow)));
}
long lowerBound = MetastoreDirectSqlUtils.extractSqlLong(dist);
long higherBound = MetastoreDirectSqlUtils.extractSqlLong(sumDist);
long rangeBound = Long.MAX_VALUE;
if (lhigh != null && llow != null) {
rangeBound = MetastoreDirectSqlUtils.extractSqlLong(lhigh) - MetastoreDirectSqlUtils.extractSqlLong(llow) + 1;
}
long estimation;
if (useDensityFunctionForNDVEstimation && lhigh != null && llow != null && avgLong != null && MetastoreDirectSqlUtils.extractSqlDouble(avgLong) != 0.0) {
// We have estimation, lowerbound and higherbound. We use estimation if
// it is between lowerbound and higherbound.
estimation = MetastoreDirectSqlUtils.extractSqlLong((MetastoreDirectSqlUtils.extractSqlLong(lhigh) - MetastoreDirectSqlUtils.extractSqlLong(llow)) / MetastoreDirectSqlUtils.extractSqlDouble(avgLong));
if (estimation < lowerBound) {
estimation = lowerBound;
} else if (estimation > higherBound) {
estimation = higherBound;
}
} else {
estimation = (long) (lowerBound + (higherBound - lowerBound) * ndvTuner);
}
estimation = Math.min(estimation, rangeBound);
dateStats.setNumDVs(estimation);
data.setDateStats(dateStats);
} else if (colType.equals("timestamp")) {
TimestampColumnStatsDataInspector timestampStats = new TimestampColumnStatsDataInspector();
timestampStats.setNumNulls(MetastoreDirectSqlUtils.extractSqlLong(nulls));
if (lhigh != null) {
timestampStats.setHighValue(new Timestamp(MetastoreDirectSqlUtils.extractSqlLong(lhigh)));
}
if (llow != null) {
timestampStats.setLowValue(new Timestamp(MetastoreDirectSqlUtils.extractSqlLong(llow)));
}
long lowerBound = MetastoreDirectSqlUtils.extractSqlLong(dist);
long higherBound = MetastoreDirectSqlUtils.extractSqlLong(sumDist);
long rangeBound = Long.MAX_VALUE;
if (lhigh != null && llow != null) {
rangeBound = MetastoreDirectSqlUtils.extractSqlLong(lhigh) - MetastoreDirectSqlUtils.extractSqlLong(llow) + 1;
}
long estimation;
if (useDensityFunctionForNDVEstimation && lhigh != null && llow != null && avgLong != null && MetastoreDirectSqlUtils.extractSqlDouble(avgLong) != 0.0) {
// We have estimation, lowerbound and higherbound. We use estimation if
// it is between lowerbound and higherbound.
estimation = MetastoreDirectSqlUtils.extractSqlLong((MetastoreDirectSqlUtils.extractSqlLong(lhigh) - MetastoreDirectSqlUtils.extractSqlLong(llow)) / MetastoreDirectSqlUtils.extractSqlDouble(avgLong));
if (estimation < lowerBound) {
estimation = lowerBound;
} else if (estimation > higherBound) {
estimation = higherBound;
}
} else {
estimation = (long) (lowerBound + (higherBound - lowerBound) * ndvTuner);
}
estimation = Math.min(estimation, rangeBound);
timestampStats.setNumDVs(estimation);
data.setTimestampStats(timestampStats);
} else if (colType.equals("double") || colType.equals("float")) {
DoubleColumnStatsDataInspector doubleStats = new DoubleColumnStatsDataInspector();
doubleStats.setNumNulls(MetastoreDirectSqlUtils.extractSqlLong(nulls));
if (dhigh != null) {
doubleStats.setHighValue(MetastoreDirectSqlUtils.extractSqlDouble(dhigh));
}
if (dlow != null) {
doubleStats.setLowValue(MetastoreDirectSqlUtils.extractSqlDouble(dlow));
}
long lowerBound = MetastoreDirectSqlUtils.extractSqlLong(dist);
long higherBound = MetastoreDirectSqlUtils.extractSqlLong(sumDist);
if (useDensityFunctionForNDVEstimation && dhigh != null && dlow != null && avgDouble != null && MetastoreDirectSqlUtils.extractSqlDouble(avgDouble) != 0.0) {
long estimation = MetastoreDirectSqlUtils.extractSqlLong((MetastoreDirectSqlUtils.extractSqlLong(dhigh) - MetastoreDirectSqlUtils.extractSqlLong(dlow)) / MetastoreDirectSqlUtils.extractSqlDouble(avgDouble));
if (estimation < lowerBound) {
doubleStats.setNumDVs(lowerBound);
} else if (estimation > higherBound) {
doubleStats.setNumDVs(higherBound);
} else {
doubleStats.setNumDVs(estimation);
}
} else {
doubleStats.setNumDVs((long) (lowerBound + (higherBound - lowerBound) * ndvTuner));
}
data.setDoubleStats(doubleStats);
} else if (colType.startsWith("decimal")) {
DecimalColumnStatsDataInspector decimalStats = new DecimalColumnStatsDataInspector();
decimalStats.setNumNulls(MetastoreDirectSqlUtils.extractSqlLong(nulls));
Decimal low = null;
Decimal high = null;
BigDecimal blow = null;
BigDecimal bhigh = null;
if (dechigh instanceof BigDecimal) {
bhigh = (BigDecimal) dechigh;
high = DecimalUtils.getDecimal(ByteBuffer.wrap(bhigh.unscaledValue().toByteArray()), (short) bhigh.scale());
} else if (dechigh instanceof String) {
bhigh = new BigDecimal((String) dechigh);
high = DecimalUtils.createThriftDecimal((String) dechigh);
}
decimalStats.setHighValue(high);
if (declow instanceof BigDecimal) {
blow = (BigDecimal) declow;
low = DecimalUtils.getDecimal(ByteBuffer.wrap(blow.unscaledValue().toByteArray()), (short) blow.scale());
} else if (dechigh instanceof String) {
blow = new BigDecimal((String) declow);
low = DecimalUtils.createThriftDecimal((String) declow);
}
decimalStats.setLowValue(low);
long lowerBound = MetastoreDirectSqlUtils.extractSqlLong(dist);
long higherBound = MetastoreDirectSqlUtils.extractSqlLong(sumDist);
if (useDensityFunctionForNDVEstimation && dechigh != null && declow != null && avgDecimal != null && MetastoreDirectSqlUtils.extractSqlDouble(avgDecimal) != 0.0) {
long estimation = MetastoreDirectSqlUtils.extractSqlLong(MetastoreDirectSqlUtils.extractSqlLong(bhigh.subtract(blow).floatValue() / MetastoreDirectSqlUtils.extractSqlDouble(avgDecimal)));
if (estimation < lowerBound) {
decimalStats.setNumDVs(lowerBound);
} else if (estimation > higherBound) {
decimalStats.setNumDVs(higherBound);
} else {
decimalStats.setNumDVs(estimation);
}
} else {
decimalStats.setNumDVs((long) (lowerBound + (higherBound - lowerBound) * ndvTuner));
}
data.setDecimalStats(decimalStats);
}
}
use of org.apache.hadoop.hive.metastore.columnstats.cache.DateColumnStatsDataInspector in project hive by apache.
the class StatObjectConverter method getPartitionColumnStatisticsObj.
public static ColumnStatisticsObj getPartitionColumnStatisticsObj(MPartitionColumnStatistics mStatsObj, boolean enableBitVector) {
ColumnStatisticsObj statsObj = new ColumnStatisticsObj();
statsObj.setColType(mStatsObj.getColType());
statsObj.setColName(mStatsObj.getColName());
String colType = mStatsObj.getColType().toLowerCase();
ColumnStatisticsData colStatsData = new ColumnStatisticsData();
if (colType.equals("boolean")) {
BooleanColumnStatsData boolStats = new BooleanColumnStatsData();
boolStats.setNumFalses(mStatsObj.getNumFalses());
boolStats.setNumTrues(mStatsObj.getNumTrues());
boolStats.setNumNulls(mStatsObj.getNumNulls());
colStatsData.setBooleanStats(boolStats);
} else if (colType.equals("string") || colType.startsWith("varchar") || colType.startsWith("char")) {
StringColumnStatsDataInspector stringStats = new StringColumnStatsDataInspector();
stringStats.setNumNulls(mStatsObj.getNumNulls());
stringStats.setAvgColLen(mStatsObj.getAvgColLen());
stringStats.setMaxColLen(mStatsObj.getMaxColLen());
stringStats.setNumDVs(mStatsObj.getNumDVs());
stringStats.setBitVectors((mStatsObj.getBitVector() == null || !enableBitVector) ? null : mStatsObj.getBitVector());
colStatsData.setStringStats(stringStats);
} else if (colType.equals("binary")) {
BinaryColumnStatsData binaryStats = new BinaryColumnStatsData();
binaryStats.setNumNulls(mStatsObj.getNumNulls());
binaryStats.setAvgColLen(mStatsObj.getAvgColLen());
binaryStats.setMaxColLen(mStatsObj.getMaxColLen());
colStatsData.setBinaryStats(binaryStats);
} else if (colType.equals("tinyint") || colType.equals("smallint") || colType.equals("int") || colType.equals("bigint")) {
LongColumnStatsDataInspector longStats = new LongColumnStatsDataInspector();
longStats.setNumNulls(mStatsObj.getNumNulls());
if (mStatsObj.getLongHighValue() != null) {
longStats.setHighValue(mStatsObj.getLongHighValue());
}
if (mStatsObj.getLongLowValue() != null) {
longStats.setLowValue(mStatsObj.getLongLowValue());
}
longStats.setNumDVs(mStatsObj.getNumDVs());
longStats.setBitVectors((mStatsObj.getBitVector() == null || !enableBitVector) ? null : mStatsObj.getBitVector());
colStatsData.setLongStats(longStats);
} else if (colType.equals("double") || colType.equals("float")) {
DoubleColumnStatsDataInspector doubleStats = new DoubleColumnStatsDataInspector();
doubleStats.setNumNulls(mStatsObj.getNumNulls());
if (mStatsObj.getDoubleHighValue() != null) {
doubleStats.setHighValue(mStatsObj.getDoubleHighValue());
}
if (mStatsObj.getDoubleLowValue() != null) {
doubleStats.setLowValue(mStatsObj.getDoubleLowValue());
}
doubleStats.setNumDVs(mStatsObj.getNumDVs());
doubleStats.setBitVectors((mStatsObj.getBitVector() == null || !enableBitVector) ? null : mStatsObj.getBitVector());
colStatsData.setDoubleStats(doubleStats);
} else if (colType.startsWith("decimal")) {
DecimalColumnStatsDataInspector decimalStats = new DecimalColumnStatsDataInspector();
decimalStats.setNumNulls(mStatsObj.getNumNulls());
if (mStatsObj.getDecimalHighValue() != null) {
decimalStats.setHighValue(DecimalUtils.createThriftDecimal(mStatsObj.getDecimalHighValue()));
}
if (mStatsObj.getDecimalLowValue() != null) {
decimalStats.setLowValue(DecimalUtils.createThriftDecimal(mStatsObj.getDecimalLowValue()));
}
decimalStats.setNumDVs(mStatsObj.getNumDVs());
decimalStats.setBitVectors((mStatsObj.getBitVector() == null || !enableBitVector) ? null : mStatsObj.getBitVector());
colStatsData.setDecimalStats(decimalStats);
} else if (colType.equals("date")) {
DateColumnStatsDataInspector dateStats = new DateColumnStatsDataInspector();
dateStats.setNumNulls(mStatsObj.getNumNulls());
Long highValue = mStatsObj.getLongHighValue();
if (highValue != null) {
dateStats.setHighValue(new Date(highValue));
}
Long lowValue = mStatsObj.getLongLowValue();
if (lowValue != null) {
dateStats.setLowValue(new Date(lowValue));
}
dateStats.setNumDVs(mStatsObj.getNumDVs());
dateStats.setBitVectors((mStatsObj.getBitVector() == null || !enableBitVector) ? null : mStatsObj.getBitVector());
colStatsData.setDateStats(dateStats);
} else if (colType.equals("timestamp")) {
TimestampColumnStatsDataInspector timestampStats = new TimestampColumnStatsDataInspector();
timestampStats.setNumNulls(mStatsObj.getNumNulls());
Long highValue = mStatsObj.getLongHighValue();
if (highValue != null) {
timestampStats.setHighValue(new Timestamp(highValue));
}
Long lowValue = mStatsObj.getLongLowValue();
if (lowValue != null) {
timestampStats.setLowValue(new Timestamp(lowValue));
}
timestampStats.setNumDVs(mStatsObj.getNumDVs());
timestampStats.setBitVectors((mStatsObj.getBitVector() == null || !enableBitVector) ? null : mStatsObj.getBitVector());
colStatsData.setTimestampStats(timestampStats);
}
statsObj.setStatsData(colStatsData);
return statsObj;
}
use of org.apache.hadoop.hive.metastore.columnstats.cache.DateColumnStatsDataInspector in project hive by apache.
the class DateColumnStatsAggregator method aggregate.
@Override
public ColumnStatisticsObj aggregate(List<ColStatsObjWithSourceInfo> colStatsWithSourceInfo, List<String> partNames, boolean areAllPartsFound) throws MetaException {
ColumnStatisticsObj statsObj = null;
String colType = null;
String colName = null;
// check if all the ColumnStatisticsObjs contain stats and all the ndv are
// bitvectors
boolean doAllPartitionContainStats = partNames.size() == colStatsWithSourceInfo.size();
NumDistinctValueEstimator ndvEstimator = null;
for (ColStatsObjWithSourceInfo csp : colStatsWithSourceInfo) {
ColumnStatisticsObj cso = csp.getColStatsObj();
if (statsObj == null) {
colName = cso.getColName();
colType = cso.getColType();
statsObj = ColumnStatsAggregatorFactory.newColumnStaticsObj(colName, colType, cso.getStatsData().getSetField());
LOG.trace("doAllPartitionContainStats for column: {} is: {}", colName, doAllPartitionContainStats);
}
DateColumnStatsDataInspector dateColumnStats = dateInspectorFromStats(cso);
if (dateColumnStats.getNdvEstimator() == null) {
ndvEstimator = null;
break;
} else {
// check if all of the bit vectors can merge
NumDistinctValueEstimator estimator = dateColumnStats.getNdvEstimator();
if (ndvEstimator == null) {
ndvEstimator = estimator;
} else {
if (ndvEstimator.canMerge(estimator)) {
continue;
} else {
ndvEstimator = null;
break;
}
}
}
}
if (ndvEstimator != null) {
ndvEstimator = NumDistinctValueEstimatorFactory.getEmptyNumDistinctValueEstimator(ndvEstimator);
}
LOG.debug("all of the bit vectors can merge for " + colName + " is " + (ndvEstimator != null));
ColumnStatisticsData columnStatisticsData = new ColumnStatisticsData();
if (doAllPartitionContainStats || colStatsWithSourceInfo.size() < 2) {
DateColumnStatsDataInspector aggregateData = null;
long lowerBound = 0;
long higherBound = 0;
double densityAvgSum = 0.0;
for (ColStatsObjWithSourceInfo csp : colStatsWithSourceInfo) {
ColumnStatisticsObj cso = csp.getColStatsObj();
DateColumnStatsDataInspector newData = dateInspectorFromStats(cso);
higherBound += newData.getNumDVs();
if (newData.isSetLowValue() && newData.isSetHighValue()) {
densityAvgSum += (diff(newData.getHighValue(), newData.getLowValue())) / newData.getNumDVs();
}
if (ndvEstimator != null) {
ndvEstimator.mergeEstimators(newData.getNdvEstimator());
}
if (aggregateData == null) {
aggregateData = newData.deepCopy();
} else {
DateColumnStatsMerger merger = new DateColumnStatsMerger();
merger.setLowValue(aggregateData, newData);
merger.setHighValue(aggregateData, newData);
aggregateData.setNumNulls(aggregateData.getNumNulls() + newData.getNumNulls());
aggregateData.setNumDVs(Math.max(aggregateData.getNumDVs(), newData.getNumDVs()));
}
}
if (ndvEstimator != null) {
// if all the ColumnStatisticsObjs contain bitvectors, we do not need to
// use uniform distribution assumption because we can merge bitvectors
// to get a good estimation.
aggregateData.setNumDVs(ndvEstimator.estimateNumDistinctValues());
} else {
long estimation;
if (useDensityFunctionForNDVEstimation) {
// We have estimation, lowerbound and higherbound. We use estimation
// if it is between lowerbound and higherbound.
double densityAvg = densityAvgSum / partNames.size();
estimation = (long) (diff(aggregateData.getHighValue(), aggregateData.getLowValue()) / densityAvg);
if (estimation < lowerBound) {
estimation = lowerBound;
} else if (estimation > higherBound) {
estimation = higherBound;
}
} else {
estimation = (long) (lowerBound + (higherBound - lowerBound) * ndvTuner);
}
aggregateData.setNumDVs(estimation);
}
columnStatisticsData.setDateStats(aggregateData);
} else {
// we need extrapolation
LOG.debug("start extrapolation for " + colName);
Map<String, Integer> indexMap = new HashMap<>();
for (int index = 0; index < partNames.size(); index++) {
indexMap.put(partNames.get(index), index);
}
Map<String, Double> adjustedIndexMap = new HashMap<>();
Map<String, ColumnStatisticsData> adjustedStatsMap = new HashMap<>();
// while we scan the css, we also get the densityAvg, lowerbound and
// higerbound when useDensityFunctionForNDVEstimation is true.
double densityAvgSum = 0.0;
if (ndvEstimator == null) {
// the traditional extrapolation methods.
for (ColStatsObjWithSourceInfo csp : colStatsWithSourceInfo) {
ColumnStatisticsObj cso = csp.getColStatsObj();
String partName = csp.getPartName();
DateColumnStatsData newData = cso.getStatsData().getDateStats();
if (useDensityFunctionForNDVEstimation) {
densityAvgSum += diff(newData.getHighValue(), newData.getLowValue()) / newData.getNumDVs();
}
adjustedIndexMap.put(partName, (double) indexMap.get(partName));
adjustedStatsMap.put(partName, cso.getStatsData());
}
} else {
// we first merge all the adjacent bitvectors that we could merge and
// derive new partition names and index.
StringBuilder pseudoPartName = new StringBuilder();
double pseudoIndexSum = 0;
int length = 0;
int curIndex = -1;
DateColumnStatsDataInspector aggregateData = null;
for (ColStatsObjWithSourceInfo csp : colStatsWithSourceInfo) {
ColumnStatisticsObj cso = csp.getColStatsObj();
String partName = csp.getPartName();
DateColumnStatsDataInspector newData = dateInspectorFromStats(cso);
// already checked it before.
if (indexMap.get(partName) != curIndex) {
// There is bitvector, but it is not adjacent to the previous ones.
if (length > 0) {
// we have to set ndv
adjustedIndexMap.put(pseudoPartName.toString(), pseudoIndexSum / length);
aggregateData.setNumDVs(ndvEstimator.estimateNumDistinctValues());
ColumnStatisticsData csd = new ColumnStatisticsData();
csd.setDateStats(aggregateData);
adjustedStatsMap.put(pseudoPartName.toString(), csd);
if (useDensityFunctionForNDVEstimation) {
densityAvgSum += diff(aggregateData.getHighValue(), aggregateData.getLowValue()) / aggregateData.getNumDVs();
}
// reset everything
pseudoPartName = new StringBuilder();
pseudoIndexSum = 0;
length = 0;
ndvEstimator = NumDistinctValueEstimatorFactory.getEmptyNumDistinctValueEstimator(ndvEstimator);
}
aggregateData = null;
}
curIndex = indexMap.get(partName);
pseudoPartName.append(partName);
pseudoIndexSum += curIndex;
length++;
curIndex++;
if (aggregateData == null) {
aggregateData = newData.deepCopy();
} else {
aggregateData.setLowValue(min(aggregateData.getLowValue(), newData.getLowValue()));
aggregateData.setHighValue(max(aggregateData.getHighValue(), newData.getHighValue()));
aggregateData.setNumNulls(aggregateData.getNumNulls() + newData.getNumNulls());
}
ndvEstimator.mergeEstimators(newData.getNdvEstimator());
}
if (length > 0) {
// we have to set ndv
adjustedIndexMap.put(pseudoPartName.toString(), pseudoIndexSum / length);
aggregateData.setNumDVs(ndvEstimator.estimateNumDistinctValues());
ColumnStatisticsData csd = new ColumnStatisticsData();
csd.setDateStats(aggregateData);
adjustedStatsMap.put(pseudoPartName.toString(), csd);
if (useDensityFunctionForNDVEstimation) {
densityAvgSum += diff(aggregateData.getHighValue(), aggregateData.getLowValue()) / aggregateData.getNumDVs();
}
}
}
extrapolate(columnStatisticsData, partNames.size(), colStatsWithSourceInfo.size(), adjustedIndexMap, adjustedStatsMap, densityAvgSum / adjustedStatsMap.size());
}
LOG.debug("Ndv estimatation for {} is {} # of partitions requested: {} # of partitions found: {}", colName, columnStatisticsData.getDateStats().getNumDVs(), partNames.size(), colStatsWithSourceInfo.size());
statsObj.setStatsData(columnStatisticsData);
return statsObj;
}
Aggregations