Search in sources :

Example 11 with StringColumnStatsDataInspector

use of org.apache.hadoop.hive.metastore.columnstats.cache.StringColumnStatsDataInspector in project hive by apache.

the class ColumnStatsAggregatorFactory method newColumnStaticsObj.

public static ColumnStatisticsObj newColumnStaticsObj(String colName, String colType, _Fields type) {
    ColumnStatisticsObj cso = new ColumnStatisticsObj();
    ColumnStatisticsData csd = new ColumnStatisticsData();
    cso.setColName(colName);
    cso.setColType(colType);
    switch(type) {
        case BOOLEAN_STATS:
            csd.setBooleanStats(new BooleanColumnStatsData());
            break;
        case LONG_STATS:
            csd.setLongStats(new LongColumnStatsDataInspector());
            break;
        case DATE_STATS:
            csd.setDateStats(new DateColumnStatsDataInspector());
            break;
        case TIMESTAMP_STATS:
            csd.setTimestampStats(new TimestampColumnStatsDataInspector());
            break;
        case DOUBLE_STATS:
            csd.setDoubleStats(new DoubleColumnStatsDataInspector());
            break;
        case STRING_STATS:
            csd.setStringStats(new StringColumnStatsDataInspector());
            break;
        case BINARY_STATS:
            csd.setBinaryStats(new BinaryColumnStatsData());
            break;
        case DECIMAL_STATS:
            csd.setDecimalStats(new DecimalColumnStatsDataInspector());
            break;
        default:
            throw new RuntimeException("Woh, bad.  Unknown stats type!");
    }
    cso.setStatsData(csd);
    return cso;
}
Also used : BooleanColumnStatsData(org.apache.hadoop.hive.metastore.api.BooleanColumnStatsData) ColumnStatisticsObj(org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj) DecimalColumnStatsDataInspector(org.apache.hadoop.hive.metastore.columnstats.cache.DecimalColumnStatsDataInspector) DoubleColumnStatsDataInspector(org.apache.hadoop.hive.metastore.columnstats.cache.DoubleColumnStatsDataInspector) LongColumnStatsDataInspector(org.apache.hadoop.hive.metastore.columnstats.cache.LongColumnStatsDataInspector) DateColumnStatsDataInspector(org.apache.hadoop.hive.metastore.columnstats.cache.DateColumnStatsDataInspector) TimestampColumnStatsDataInspector(org.apache.hadoop.hive.metastore.columnstats.cache.TimestampColumnStatsDataInspector) StringColumnStatsDataInspector(org.apache.hadoop.hive.metastore.columnstats.cache.StringColumnStatsDataInspector) ColumnStatisticsData(org.apache.hadoop.hive.metastore.api.ColumnStatisticsData) BinaryColumnStatsData(org.apache.hadoop.hive.metastore.api.BinaryColumnStatsData)

Example 12 with StringColumnStatsDataInspector

use of org.apache.hadoop.hive.metastore.columnstats.cache.StringColumnStatsDataInspector in project hive by apache.

the class StringColumnStatsAggregator method aggregate.

@Override
public ColumnStatisticsObj aggregate(List<ColStatsObjWithSourceInfo> colStatsWithSourceInfo, List<String> partNames, boolean areAllPartsFound) throws MetaException {
    ColumnStatisticsObj statsObj = null;
    String colType = null;
    String colName = null;
    // check if all the ColumnStatisticsObjs contain stats and all the ndv are
    // bitvectors
    boolean doAllPartitionContainStats = partNames.size() == colStatsWithSourceInfo.size();
    NumDistinctValueEstimator ndvEstimator = null;
    for (ColStatsObjWithSourceInfo csp : colStatsWithSourceInfo) {
        ColumnStatisticsObj cso = csp.getColStatsObj();
        if (statsObj == null) {
            colName = cso.getColName();
            colType = cso.getColType();
            statsObj = ColumnStatsAggregatorFactory.newColumnStaticsObj(colName, colType, cso.getStatsData().getSetField());
            LOG.trace("doAllPartitionContainStats for column: {} is: {}", colName, doAllPartitionContainStats);
        }
        StringColumnStatsDataInspector stringColumnStatsData = stringInspectorFromStats(cso);
        if (stringColumnStatsData.getNdvEstimator() == null) {
            ndvEstimator = null;
            break;
        } else {
            // check if all of the bit vectors can merge
            NumDistinctValueEstimator estimator = stringColumnStatsData.getNdvEstimator();
            if (ndvEstimator == null) {
                ndvEstimator = estimator;
            } else {
                if (ndvEstimator.canMerge(estimator)) {
                    continue;
                } else {
                    ndvEstimator = null;
                    break;
                }
            }
        }
    }
    if (ndvEstimator != null) {
        ndvEstimator = NumDistinctValueEstimatorFactory.getEmptyNumDistinctValueEstimator(ndvEstimator);
    }
    LOG.debug("all of the bit vectors can merge for " + colName + " is " + (ndvEstimator != null));
    ColumnStatisticsData columnStatisticsData = new ColumnStatisticsData();
    if (doAllPartitionContainStats || colStatsWithSourceInfo.size() < 2) {
        StringColumnStatsDataInspector aggregateData = null;
        for (ColStatsObjWithSourceInfo csp : colStatsWithSourceInfo) {
            ColumnStatisticsObj cso = csp.getColStatsObj();
            StringColumnStatsDataInspector newData = stringInspectorFromStats(cso);
            if (ndvEstimator != null) {
                ndvEstimator.mergeEstimators(newData.getNdvEstimator());
            }
            if (aggregateData == null) {
                aggregateData = newData.deepCopy();
            } else {
                aggregateData.setMaxColLen(Math.max(aggregateData.getMaxColLen(), newData.getMaxColLen()));
                aggregateData.setAvgColLen(Math.max(aggregateData.getAvgColLen(), newData.getAvgColLen()));
                aggregateData.setNumNulls(aggregateData.getNumNulls() + newData.getNumNulls());
                aggregateData.setNumDVs(Math.max(aggregateData.getNumDVs(), newData.getNumDVs()));
            }
        }
        if (ndvEstimator != null) {
            // if all the ColumnStatisticsObjs contain bitvectors, we do not need to
            // use uniform distribution assumption because we can merge bitvectors
            // to get a good estimation.
            aggregateData.setNumDVs(ndvEstimator.estimateNumDistinctValues());
        } else {
        // aggregateData already has the ndv of the max of all
        }
        columnStatisticsData.setStringStats(aggregateData);
    } else {
        // we need extrapolation
        LOG.debug("start extrapolation for " + colName);
        Map<String, Integer> indexMap = new HashMap<>();
        for (int index = 0; index < partNames.size(); index++) {
            indexMap.put(partNames.get(index), index);
        }
        Map<String, Double> adjustedIndexMap = new HashMap<>();
        Map<String, ColumnStatisticsData> adjustedStatsMap = new HashMap<>();
        if (ndvEstimator == null) {
            // the traditional extrapolation methods.
            for (ColStatsObjWithSourceInfo csp : colStatsWithSourceInfo) {
                ColumnStatisticsObj cso = csp.getColStatsObj();
                String partName = csp.getPartName();
                adjustedIndexMap.put(partName, (double) indexMap.get(partName));
                adjustedStatsMap.put(partName, cso.getStatsData());
            }
        } else {
            // we first merge all the adjacent bitvectors that we could merge and
            // derive new partition names and index.
            StringBuilder pseudoPartName = new StringBuilder();
            double pseudoIndexSum = 0;
            int length = 0;
            int curIndex = -1;
            StringColumnStatsDataInspector aggregateData = null;
            for (ColStatsObjWithSourceInfo csp : colStatsWithSourceInfo) {
                ColumnStatisticsObj cso = csp.getColStatsObj();
                String partName = csp.getPartName();
                StringColumnStatsDataInspector newData = stringInspectorFromStats(cso);
                // already checked it before.
                if (indexMap.get(partName) != curIndex) {
                    // There is bitvector, but it is not adjacent to the previous ones.
                    if (length > 0) {
                        // we have to set ndv
                        adjustedIndexMap.put(pseudoPartName.toString(), pseudoIndexSum / length);
                        aggregateData.setNumDVs(ndvEstimator.estimateNumDistinctValues());
                        ColumnStatisticsData csd = new ColumnStatisticsData();
                        csd.setStringStats(aggregateData);
                        adjustedStatsMap.put(pseudoPartName.toString(), csd);
                        // reset everything
                        pseudoPartName = new StringBuilder();
                        pseudoIndexSum = 0;
                        length = 0;
                        ndvEstimator = NumDistinctValueEstimatorFactory.getEmptyNumDistinctValueEstimator(ndvEstimator);
                    }
                    aggregateData = null;
                }
                curIndex = indexMap.get(partName);
                pseudoPartName.append(partName);
                pseudoIndexSum += curIndex;
                length++;
                curIndex++;
                if (aggregateData == null) {
                    aggregateData = newData.deepCopy();
                } else {
                    aggregateData.setAvgColLen(Math.max(aggregateData.getAvgColLen(), newData.getAvgColLen()));
                    aggregateData.setMaxColLen(Math.max(aggregateData.getMaxColLen(), newData.getMaxColLen()));
                    aggregateData.setNumNulls(aggregateData.getNumNulls() + newData.getNumNulls());
                }
                ndvEstimator.mergeEstimators(newData.getNdvEstimator());
            }
            if (length > 0) {
                // we have to set ndv
                adjustedIndexMap.put(pseudoPartName.toString(), pseudoIndexSum / length);
                aggregateData.setNumDVs(ndvEstimator.estimateNumDistinctValues());
                ColumnStatisticsData csd = new ColumnStatisticsData();
                csd.setStringStats(aggregateData);
                adjustedStatsMap.put(pseudoPartName.toString(), csd);
            }
        }
        extrapolate(columnStatisticsData, partNames.size(), colStatsWithSourceInfo.size(), adjustedIndexMap, adjustedStatsMap, -1);
    }
    LOG.debug("Ndv estimatation for {} is {} # of partitions requested: {} # of partitions found: {}", colName, columnStatisticsData.getStringStats().getNumDVs(), partNames.size(), colStatsWithSourceInfo.size());
    statsObj.setStatsData(columnStatisticsData);
    return statsObj;
}
Also used : ColStatsObjWithSourceInfo(org.apache.hadoop.hive.metastore.utils.MetaStoreServerUtils.ColStatsObjWithSourceInfo) HashMap(java.util.HashMap) NumDistinctValueEstimator(org.apache.hadoop.hive.common.ndv.NumDistinctValueEstimator) ColumnStatisticsObj(org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj) StringColumnStatsDataInspector(org.apache.hadoop.hive.metastore.columnstats.cache.StringColumnStatsDataInspector) ColumnStatisticsData(org.apache.hadoop.hive.metastore.api.ColumnStatisticsData)

Example 13 with StringColumnStatsDataInspector

use of org.apache.hadoop.hive.metastore.columnstats.cache.StringColumnStatsDataInspector in project hive by apache.

the class StatObjectConverter method fillColumnStatisticsData.

// JAVA
public static void fillColumnStatisticsData(String colType, ColumnStatisticsData data, Object llow, Object lhigh, Object dlow, Object dhigh, Object declow, Object dechigh, Object nulls, Object dist, Object bitVector, Object avglen, Object maxlen, Object trues, Object falses) throws MetaException {
    colType = colType.toLowerCase();
    if (colType.equals("boolean")) {
        BooleanColumnStatsData boolStats = new BooleanColumnStatsData();
        boolStats.setNumFalses(MetastoreDirectSqlUtils.extractSqlLong(falses));
        boolStats.setNumTrues(MetastoreDirectSqlUtils.extractSqlLong(trues));
        boolStats.setNumNulls(MetastoreDirectSqlUtils.extractSqlLong(nulls));
        data.setBooleanStats(boolStats);
    } else if (colType.equals("string") || colType.startsWith("varchar") || colType.startsWith("char")) {
        StringColumnStatsDataInspector stringStats = new StringColumnStatsDataInspector();
        stringStats.setNumNulls(MetastoreDirectSqlUtils.extractSqlLong(nulls));
        stringStats.setAvgColLen(MetastoreDirectSqlUtils.extractSqlDouble(avglen));
        stringStats.setMaxColLen(MetastoreDirectSqlUtils.extractSqlLong(maxlen));
        stringStats.setNumDVs(MetastoreDirectSqlUtils.extractSqlLong(dist));
        stringStats.setBitVectors(getBitVector(MetastoreDirectSqlUtils.extractSqlBlob(bitVector)));
        data.setStringStats(stringStats);
    } else if (colType.equals("binary")) {
        BinaryColumnStatsData binaryStats = new BinaryColumnStatsData();
        binaryStats.setNumNulls(MetastoreDirectSqlUtils.extractSqlLong(nulls));
        binaryStats.setAvgColLen(MetastoreDirectSqlUtils.extractSqlDouble(avglen));
        binaryStats.setMaxColLen(MetastoreDirectSqlUtils.extractSqlLong(maxlen));
        data.setBinaryStats(binaryStats);
    } else if (colType.equals("bigint") || colType.equals("int") || colType.equals("smallint") || colType.equals("tinyint")) {
        LongColumnStatsDataInspector longStats = new LongColumnStatsDataInspector();
        longStats.setNumNulls(MetastoreDirectSqlUtils.extractSqlLong(nulls));
        if (lhigh != null) {
            longStats.setHighValue(MetastoreDirectSqlUtils.extractSqlLong(lhigh));
        }
        if (llow != null) {
            longStats.setLowValue(MetastoreDirectSqlUtils.extractSqlLong(llow));
        }
        longStats.setNumDVs(MetastoreDirectSqlUtils.extractSqlLong(dist));
        longStats.setBitVectors(getBitVector(MetastoreDirectSqlUtils.extractSqlBlob(bitVector)));
        data.setLongStats(longStats);
    } else if (colType.equals("double") || colType.equals("float")) {
        DoubleColumnStatsDataInspector doubleStats = new DoubleColumnStatsDataInspector();
        doubleStats.setNumNulls(MetastoreDirectSqlUtils.extractSqlLong(nulls));
        if (dhigh != null) {
            doubleStats.setHighValue(MetastoreDirectSqlUtils.extractSqlDouble(dhigh));
        }
        if (dlow != null) {
            doubleStats.setLowValue(MetastoreDirectSqlUtils.extractSqlDouble(dlow));
        }
        doubleStats.setNumDVs(MetastoreDirectSqlUtils.extractSqlLong(dist));
        doubleStats.setBitVectors(getBitVector(MetastoreDirectSqlUtils.extractSqlBlob(bitVector)));
        data.setDoubleStats(doubleStats);
    } else if (colType.startsWith("decimal")) {
        DecimalColumnStatsDataInspector decimalStats = new DecimalColumnStatsDataInspector();
        decimalStats.setNumNulls(MetastoreDirectSqlUtils.extractSqlLong(nulls));
        if (dechigh != null) {
            decimalStats.setHighValue(DecimalUtils.createThriftDecimal((String) dechigh));
        }
        if (declow != null) {
            decimalStats.setLowValue(DecimalUtils.createThriftDecimal((String) declow));
        }
        decimalStats.setNumDVs(MetastoreDirectSqlUtils.extractSqlLong(dist));
        decimalStats.setBitVectors(getBitVector(MetastoreDirectSqlUtils.extractSqlBlob(bitVector)));
        data.setDecimalStats(decimalStats);
    } else if (colType.equals("date")) {
        DateColumnStatsDataInspector dateStats = new DateColumnStatsDataInspector();
        dateStats.setNumNulls(MetastoreDirectSqlUtils.extractSqlLong(nulls));
        if (lhigh != null) {
            dateStats.setHighValue(new Date(MetastoreDirectSqlUtils.extractSqlLong(lhigh)));
        }
        if (llow != null) {
            dateStats.setLowValue(new Date(MetastoreDirectSqlUtils.extractSqlLong(llow)));
        }
        dateStats.setNumDVs(MetastoreDirectSqlUtils.extractSqlLong(dist));
        dateStats.setBitVectors(getBitVector(MetastoreDirectSqlUtils.extractSqlBlob(bitVector)));
        data.setDateStats(dateStats);
    } else if (colType.equals("timestamp")) {
        TimestampColumnStatsDataInspector timestampStats = new TimestampColumnStatsDataInspector();
        timestampStats.setNumNulls(MetastoreDirectSqlUtils.extractSqlLong(nulls));
        if (lhigh != null) {
            timestampStats.setHighValue(new Timestamp(MetastoreDirectSqlUtils.extractSqlLong(lhigh)));
        }
        if (llow != null) {
            timestampStats.setLowValue(new Timestamp(MetastoreDirectSqlUtils.extractSqlLong(llow)));
        }
        timestampStats.setNumDVs(MetastoreDirectSqlUtils.extractSqlLong(dist));
        timestampStats.setBitVectors(getBitVector(MetastoreDirectSqlUtils.extractSqlBlob(bitVector)));
        data.setTimestampStats(timestampStats);
    }
}
Also used : BooleanColumnStatsData(org.apache.hadoop.hive.metastore.api.BooleanColumnStatsData) DecimalColumnStatsDataInspector(org.apache.hadoop.hive.metastore.columnstats.cache.DecimalColumnStatsDataInspector) DoubleColumnStatsDataInspector(org.apache.hadoop.hive.metastore.columnstats.cache.DoubleColumnStatsDataInspector) LongColumnStatsDataInspector(org.apache.hadoop.hive.metastore.columnstats.cache.LongColumnStatsDataInspector) DateColumnStatsDataInspector(org.apache.hadoop.hive.metastore.columnstats.cache.DateColumnStatsDataInspector) TimestampColumnStatsDataInspector(org.apache.hadoop.hive.metastore.columnstats.cache.TimestampColumnStatsDataInspector) StringColumnStatsDataInspector(org.apache.hadoop.hive.metastore.columnstats.cache.StringColumnStatsDataInspector) Timestamp(org.apache.hadoop.hive.metastore.api.Timestamp) BinaryColumnStatsData(org.apache.hadoop.hive.metastore.api.BinaryColumnStatsData) Date(org.apache.hadoop.hive.metastore.api.Date)

Aggregations

StringColumnStatsDataInspector (org.apache.hadoop.hive.metastore.columnstats.cache.StringColumnStatsDataInspector)13 BooleanColumnStatsData (org.apache.hadoop.hive.metastore.api.BooleanColumnStatsData)10 ColumnStatisticsData (org.apache.hadoop.hive.metastore.api.ColumnStatisticsData)10 LongColumnStatsDataInspector (org.apache.hadoop.hive.metastore.columnstats.cache.LongColumnStatsDataInspector)10 BinaryColumnStatsData (org.apache.hadoop.hive.metastore.api.BinaryColumnStatsData)9 DateColumnStatsDataInspector (org.apache.hadoop.hive.metastore.columnstats.cache.DateColumnStatsDataInspector)9 DecimalColumnStatsDataInspector (org.apache.hadoop.hive.metastore.columnstats.cache.DecimalColumnStatsDataInspector)9 DoubleColumnStatsDataInspector (org.apache.hadoop.hive.metastore.columnstats.cache.DoubleColumnStatsDataInspector)9 TimestampColumnStatsDataInspector (org.apache.hadoop.hive.metastore.columnstats.cache.TimestampColumnStatsDataInspector)8 ColumnStatisticsObj (org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj)7 Date (org.apache.hadoop.hive.metastore.api.Date)4 Timestamp (org.apache.hadoop.hive.metastore.api.Timestamp)4 BigDecimal (java.math.BigDecimal)2 HashMap (java.util.HashMap)2 Map (java.util.Map)2 NumDistinctValueEstimator (org.apache.hadoop.hive.common.ndv.NumDistinctValueEstimator)2 ColumnStatistics (org.apache.hadoop.hive.metastore.api.ColumnStatistics)2 ColumnStatisticsDesc (org.apache.hadoop.hive.metastore.api.ColumnStatisticsDesc)2 PrimitiveObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector)2 StringObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector)2