Search in sources :

Example 1 with StringColumnStatsDataInspector

Use of org.apache.hadoop.hive.metastore.columnstats.cache.StringColumnStatsDataInspector in the Apache Hive project.

Class ColumnStatsUpdateTask, method constructColumnStatsFromInput.

/**
 * Builds the {@link ColumnStatistics} object described by this task's work item.
 *
 * <p>If the work item already carries column statistics (the replication path), those are
 * returned unchanged. Otherwise a single {@link ColumnStatisticsObj} is created for the
 * work item's column, and each entry of the work item's property map is parsed into the
 * statistics variant that matches the column type.
 *
 * <p>Type matching is case-insensitive and locale-independent: the prefix checks for
 * {@code char(n)}, {@code varchar(n)} and {@code decimal(p,s)} lower-case the type with
 * {@link java.util.Locale#ROOT} so that they do not depend on the JVM default locale
 * (for example, a Turkish locale would otherwise lower-case {@code "DECIMAL"} with a
 * dotless i and miss the match).
 *
 * @return the replicated statistics, or newly constructed statistics for one column
 * @throws SemanticException if the property map names a statistic that is not recognised
 *         for the column type, or if the column type itself is not supported
 * @throws MetaException declared by the signature; presumably raised by the helpers called
 *         here (e.g. {@code getColumnStatsDesc}) -- confirm against their definitions
 * @throws NumberFormatException if a property value cannot be parsed as the numeric type
 *         required by the statistic
 */
private ColumnStatistics constructColumnStatsFromInput() throws SemanticException, MetaException {
    // If we are replicating the stats, we don't need to construct those again.
    if (work.getColStats() != null) {
        ColumnStatistics colStats = work.getColStats();
        LOG.debug("Got stats through replication for " + colStats.getStatsDesc().getDbName() + "." + colStats.getStatsDesc().getTableName());
        return colStats;
    }
    String dbName = work.dbName();
    String tableName = work.getTableName();
    String partName = work.getPartName();
    String colName = work.getColName();
    String columnType = work.getColType();
    ColumnStatisticsObj statsObj = new ColumnStatisticsObj();
    // The grammar prohibits more than one column, so we are guaranteed to have exactly
    // one element in this list.
    statsObj.setColName(colName);
    statsObj.setColType(columnType);
    ColumnStatisticsData statsData = new ColumnStatisticsData();
    // Lower-case once with the ROOT locale; used only for the parameterised-type prefix checks.
    String lowerColumnType = columnType.toLowerCase(java.util.Locale.ROOT);
    if (columnType.equalsIgnoreCase("long") || columnType.equalsIgnoreCase("tinyint") || columnType.equalsIgnoreCase("smallint") || columnType.equalsIgnoreCase("int") || columnType.equalsIgnoreCase("bigint")) {
        LongColumnStatsDataInspector longStats = new LongColumnStatsDataInspector();
        // Start with every field marked unset; only the supplied properties get set below.
        longStats.setNumNullsIsSet(false);
        longStats.setNumDVsIsSet(false);
        longStats.setLowValueIsSet(false);
        longStats.setHighValueIsSet(false);
        Map<String, String> mapProp = work.getMapProp();
        for (Entry<String, String> entry : mapProp.entrySet()) {
            String fName = entry.getKey();
            String value = entry.getValue();
            if (fName.equals("numNulls")) {
                longStats.setNumNulls(Long.parseLong(value));
            } else if (fName.equals("numDVs")) {
                longStats.setNumDVs(Long.parseLong(value));
            } else if (fName.equals("lowValue")) {
                longStats.setLowValue(Long.parseLong(value));
            } else if (fName.equals("highValue")) {
                longStats.setHighValue(Long.parseLong(value));
            } else {
                throw new SemanticException("Unknown stat");
            }
        }
        statsData.setLongStats(longStats);
        statsObj.setStatsData(statsData);
    } else if (columnType.equalsIgnoreCase("double") || columnType.equalsIgnoreCase("float")) {
        DoubleColumnStatsDataInspector doubleStats = new DoubleColumnStatsDataInspector();
        doubleStats.setNumNullsIsSet(false);
        doubleStats.setNumDVsIsSet(false);
        doubleStats.setLowValueIsSet(false);
        doubleStats.setHighValueIsSet(false);
        Map<String, String> mapProp = work.getMapProp();
        for (Entry<String, String> entry : mapProp.entrySet()) {
            String fName = entry.getKey();
            String value = entry.getValue();
            if (fName.equals("numNulls")) {
                doubleStats.setNumNulls(Long.parseLong(value));
            } else if (fName.equals("numDVs")) {
                doubleStats.setNumDVs(Long.parseLong(value));
            } else if (fName.equals("lowValue")) {
                doubleStats.setLowValue(Double.parseDouble(value));
            } else if (fName.equals("highValue")) {
                doubleStats.setHighValue(Double.parseDouble(value));
            } else {
                throw new SemanticException("Unknown stat");
            }
        }
        statsData.setDoubleStats(doubleStats);
        statsObj.setStatsData(statsData);
    } else if (columnType.equalsIgnoreCase("string") || lowerColumnType.startsWith("char") || lowerColumnType.startsWith("varchar")) {
        // string, char(x) and varchar(x) types
        StringColumnStatsDataInspector stringStats = new StringColumnStatsDataInspector();
        stringStats.setMaxColLenIsSet(false);
        stringStats.setAvgColLenIsSet(false);
        stringStats.setNumNullsIsSet(false);
        stringStats.setNumDVsIsSet(false);
        Map<String, String> mapProp = work.getMapProp();
        for (Entry<String, String> entry : mapProp.entrySet()) {
            String fName = entry.getKey();
            String value = entry.getValue();
            if (fName.equals("numNulls")) {
                stringStats.setNumNulls(Long.parseLong(value));
            } else if (fName.equals("numDVs")) {
                stringStats.setNumDVs(Long.parseLong(value));
            } else if (fName.equals("avgColLen")) {
                stringStats.setAvgColLen(Double.parseDouble(value));
            } else if (fName.equals("maxColLen")) {
                stringStats.setMaxColLen(Long.parseLong(value));
            } else {
                throw new SemanticException("Unknown stat");
            }
        }
        statsData.setStringStats(stringStats);
        statsObj.setStatsData(statsData);
    } else if (columnType.equalsIgnoreCase("boolean")) {
        BooleanColumnStatsData booleanStats = new BooleanColumnStatsData();
        booleanStats.setNumNullsIsSet(false);
        booleanStats.setNumTruesIsSet(false);
        booleanStats.setNumFalsesIsSet(false);
        Map<String, String> mapProp = work.getMapProp();
        for (Entry<String, String> entry : mapProp.entrySet()) {
            String fName = entry.getKey();
            String value = entry.getValue();
            if (fName.equals("numNulls")) {
                booleanStats.setNumNulls(Long.parseLong(value));
            } else if (fName.equals("numTrues")) {
                booleanStats.setNumTrues(Long.parseLong(value));
            } else if (fName.equals("numFalses")) {
                booleanStats.setNumFalses(Long.parseLong(value));
            } else {
                throw new SemanticException("Unknown stat");
            }
        }
        statsData.setBooleanStats(booleanStats);
        statsObj.setStatsData(statsData);
    } else if (columnType.equalsIgnoreCase("binary")) {
        BinaryColumnStatsData binaryStats = new BinaryColumnStatsData();
        binaryStats.setNumNullsIsSet(false);
        binaryStats.setAvgColLenIsSet(false);
        binaryStats.setMaxColLenIsSet(false);
        Map<String, String> mapProp = work.getMapProp();
        for (Entry<String, String> entry : mapProp.entrySet()) {
            String fName = entry.getKey();
            String value = entry.getValue();
            if (fName.equals("numNulls")) {
                binaryStats.setNumNulls(Long.parseLong(value));
            } else if (fName.equals("avgColLen")) {
                binaryStats.setAvgColLen(Double.parseDouble(value));
            } else if (fName.equals("maxColLen")) {
                binaryStats.setMaxColLen(Long.parseLong(value));
            } else {
                throw new SemanticException("Unknown stat");
            }
        }
        statsData.setBinaryStats(binaryStats);
        statsObj.setStatsData(statsData);
    } else if (lowerColumnType.startsWith("decimal")) {
        // decimal(a,b) type
        DecimalColumnStatsDataInspector decimalStats = new DecimalColumnStatsDataInspector();
        decimalStats.setNumNullsIsSet(false);
        decimalStats.setNumDVsIsSet(false);
        decimalStats.setLowValueIsSet(false);
        decimalStats.setHighValueIsSet(false);
        Map<String, String> mapProp = work.getMapProp();
        for (Entry<String, String> entry : mapProp.entrySet()) {
            String fName = entry.getKey();
            String value = entry.getValue();
            if (fName.equals("numNulls")) {
                decimalStats.setNumNulls(Long.parseLong(value));
            } else if (fName.equals("numDVs")) {
                decimalStats.setNumDVs(Long.parseLong(value));
            } else if (fName.equals("lowValue")) {
                // Pass the decimal on as its unscaled two's-complement bytes plus its scale.
                BigDecimal d = new BigDecimal(value);
                decimalStats.setLowValue(DecimalUtils.getDecimal(ByteBuffer.wrap(d.unscaledValue().toByteArray()), (short) d.scale()));
            } else if (fName.equals("highValue")) {
                BigDecimal d = new BigDecimal(value);
                decimalStats.setHighValue(DecimalUtils.getDecimal(ByteBuffer.wrap(d.unscaledValue().toByteArray()), (short) d.scale()));
            } else {
                throw new SemanticException("Unknown stat");
            }
        }
        statsData.setDecimalStats(decimalStats);
        statsObj.setStatsData(statsData);
    } else if (columnType.equalsIgnoreCase("date")) {
        DateColumnStatsDataInspector dateStats = new DateColumnStatsDataInspector();
        Map<String, String> mapProp = work.getMapProp();
        for (Entry<String, String> entry : mapProp.entrySet()) {
            String fName = entry.getKey();
            String value = entry.getValue();
            if (fName.equals("numNulls")) {
                dateStats.setNumNulls(Long.parseLong(value));
            } else if (fName.equals("numDVs")) {
                dateStats.setNumDVs(Long.parseLong(value));
            } else if (fName.equals("lowValue")) {
                // Date high/low value is stored as long in stats DB, but allow users to set high/low
                // value using either date format (yyyy-mm-dd) or numeric format (days since epoch)
                dateStats.setLowValue(readDateValue(value));
            } else if (fName.equals("highValue")) {
                dateStats.setHighValue(readDateValue(value));
            } else {
                throw new SemanticException("Unknown stat");
            }
        }
        statsData.setDateStats(dateStats);
        statsObj.setStatsData(statsData);
    } else if (columnType.equalsIgnoreCase("timestamp")) {
        TimestampColumnStatsDataInspector timestampStats = new TimestampColumnStatsDataInspector();
        Map<String, String> mapProp = work.getMapProp();
        for (Entry<String, String> entry : mapProp.entrySet()) {
            String fName = entry.getKey();
            String value = entry.getValue();
            if (fName.equals("numNulls")) {
                timestampStats.setNumNulls(Long.parseLong(value));
            } else if (fName.equals("numDVs")) {
                timestampStats.setNumDVs(Long.parseLong(value));
            } else if (fName.equals("lowValue")) {
                timestampStats.setLowValue(readTimestampValue(value));
            } else if (fName.equals("highValue")) {
                timestampStats.setHighValue(readTimestampValue(value));
            } else {
                throw new SemanticException("Unknown stat");
            }
        }
        statsData.setTimestampStats(timestampStats);
        statsObj.setStatsData(statsData);
    } else {
        throw new SemanticException("Unsupported type");
    }
    // A null partition name is passed on as the final (boolean) argument of getColumnStatsDesc.
    ColumnStatisticsDesc statsDesc = getColumnStatsDesc(dbName, tableName, partName, partName == null);
    ColumnStatistics colStat = new ColumnStatistics();
    colStat.setStatsDesc(statsDesc);
    colStat.addToStatsObj(statsObj);
    colStat.setEngine(Constants.HIVE_ENGINE);
    return colStat;
}
Also used : ColumnStatistics(org.apache.hadoop.hive.metastore.api.ColumnStatistics) BooleanColumnStatsData(org.apache.hadoop.hive.metastore.api.BooleanColumnStatsData) DateColumnStatsDataInspector(org.apache.hadoop.hive.metastore.columnstats.cache.DateColumnStatsDataInspector) BinaryColumnStatsData(org.apache.hadoop.hive.metastore.api.BinaryColumnStatsData) BigDecimal(java.math.BigDecimal) ColumnStatisticsObj(org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj) Entry(java.util.Map.Entry) DecimalColumnStatsDataInspector(org.apache.hadoop.hive.metastore.columnstats.cache.DecimalColumnStatsDataInspector) DoubleColumnStatsDataInspector(org.apache.hadoop.hive.metastore.columnstats.cache.DoubleColumnStatsDataInspector) ColumnStatisticsDesc(org.apache.hadoop.hive.metastore.api.ColumnStatisticsDesc) LongColumnStatsDataInspector(org.apache.hadoop.hive.metastore.columnstats.cache.LongColumnStatsDataInspector) TimestampColumnStatsDataInspector(org.apache.hadoop.hive.metastore.columnstats.cache.TimestampColumnStatsDataInspector) Map(java.util.Map) StringColumnStatsDataInspector(org.apache.hadoop.hive.metastore.columnstats.cache.StringColumnStatsDataInspector) ColumnStatisticsData(org.apache.hadoop.hive.metastore.api.ColumnStatisticsData) SemanticException(org.apache.hadoop.hive.ql.parse.SemanticException)

Example 2 with StringColumnStatsDataInspector

Use of org.apache.hadoop.hive.metastore.columnstats.cache.StringColumnStatsDataInspector in the Apache Hive project.

Class StatObjectConverter, method getTableColumnStatisticsObj.

/**
 * Converts a persisted table-level column statistics record into a
 * {@link ColumnStatisticsObj}.
 *
 * <p>The column name and type are copied verbatim. The statistics variant is chosen from
 * the column type, compared case-insensitively using {@link java.util.Locale#ROOT} so the
 * result does not depend on the JVM default locale. If the type matches none of the
 * branches below, the returned object carries a {@link ColumnStatisticsData} with no
 * variant set.
 *
 * <p>High and low values are set only when the stored value is non-null.
 *
 * @param mStatsObj the stored statistics record to convert
 * @param enableBitVector when {@code false}, bit vectors are set to {@code null} even if
 *        the record has one
 * @return a new statistics object for the record's column
 */
public static ColumnStatisticsObj getTableColumnStatisticsObj(MTableColumnStatistics mStatsObj, boolean enableBitVector) {
    ColumnStatisticsObj statsObj = new ColumnStatisticsObj();
    statsObj.setColType(mStatsObj.getColType());
    statsObj.setColName(mStatsObj.getColName());
    // Locale.ROOT keeps e.g. "BIGINT" -> "bigint" stable under locales with special casing rules.
    String colType = mStatsObj.getColType().toLowerCase(java.util.Locale.ROOT);
    ColumnStatisticsData colStatsData = new ColumnStatisticsData();
    if (colType.equals("boolean")) {
        BooleanColumnStatsData boolStats = new BooleanColumnStatsData();
        boolStats.setNumFalses(mStatsObj.getNumFalses());
        boolStats.setNumTrues(mStatsObj.getNumTrues());
        boolStats.setNumNulls(mStatsObj.getNumNulls());
        colStatsData.setBooleanStats(boolStats);
    } else if (colType.equals("string") || colType.startsWith("varchar") || colType.startsWith("char")) {
        StringColumnStatsDataInspector stringStats = new StringColumnStatsDataInspector();
        stringStats.setNumNulls(mStatsObj.getNumNulls());
        stringStats.setAvgColLen(mStatsObj.getAvgColLen());
        stringStats.setMaxColLen(mStatsObj.getMaxColLen());
        stringStats.setNumDVs(mStatsObj.getNumDVs());
        stringStats.setBitVectors((mStatsObj.getBitVector() == null || !enableBitVector) ? null : mStatsObj.getBitVector());
        colStatsData.setStringStats(stringStats);
    } else if (colType.equals("binary")) {
        BinaryColumnStatsData binaryStats = new BinaryColumnStatsData();
        binaryStats.setNumNulls(mStatsObj.getNumNulls());
        binaryStats.setAvgColLen(mStatsObj.getAvgColLen());
        binaryStats.setMaxColLen(mStatsObj.getMaxColLen());
        colStatsData.setBinaryStats(binaryStats);
    } else if (colType.equals("bigint") || colType.equals("int") || colType.equals("smallint") || colType.equals("tinyint")) {
        LongColumnStatsDataInspector longStats = new LongColumnStatsDataInspector();
        longStats.setNumNulls(mStatsObj.getNumNulls());
        Long longHighValue = mStatsObj.getLongHighValue();
        if (longHighValue != null) {
            longStats.setHighValue(longHighValue);
        }
        Long longLowValue = mStatsObj.getLongLowValue();
        if (longLowValue != null) {
            longStats.setLowValue(longLowValue);
        }
        longStats.setNumDVs(mStatsObj.getNumDVs());
        longStats.setBitVectors((mStatsObj.getBitVector() == null || !enableBitVector) ? null : mStatsObj.getBitVector());
        colStatsData.setLongStats(longStats);
    } else if (colType.equals("double") || colType.equals("float")) {
        DoubleColumnStatsDataInspector doubleStats = new DoubleColumnStatsDataInspector();
        doubleStats.setNumNulls(mStatsObj.getNumNulls());
        Double doubleHighValue = mStatsObj.getDoubleHighValue();
        if (doubleHighValue != null) {
            doubleStats.setHighValue(doubleHighValue);
        }
        Double doubleLowValue = mStatsObj.getDoubleLowValue();
        if (doubleLowValue != null) {
            doubleStats.setLowValue(doubleLowValue);
        }
        doubleStats.setNumDVs(mStatsObj.getNumDVs());
        doubleStats.setBitVectors((mStatsObj.getBitVector() == null || !enableBitVector) ? null : mStatsObj.getBitVector());
        colStatsData.setDoubleStats(doubleStats);
    } else if (colType.startsWith("decimal")) {
        DecimalColumnStatsDataInspector decimalStats = new DecimalColumnStatsDataInspector();
        decimalStats.setNumNulls(mStatsObj.getNumNulls());
        // Decimal bounds are read as strings and converted to the Thrift decimal form.
        String decimalHighValue = mStatsObj.getDecimalHighValue();
        if (decimalHighValue != null) {
            decimalStats.setHighValue(DecimalUtils.createThriftDecimal(decimalHighValue));
        }
        String decimalLowValue = mStatsObj.getDecimalLowValue();
        if (decimalLowValue != null) {
            decimalStats.setLowValue(DecimalUtils.createThriftDecimal(decimalLowValue));
        }
        decimalStats.setNumDVs(mStatsObj.getNumDVs());
        decimalStats.setBitVectors((mStatsObj.getBitVector() == null || !enableBitVector) ? null : mStatsObj.getBitVector());
        colStatsData.setDecimalStats(decimalStats);
    } else if (colType.equals("date")) {
        DateColumnStatsDataInspector dateStats = new DateColumnStatsDataInspector();
        dateStats.setNumNulls(mStatsObj.getNumNulls());
        // Date bounds are read from the record's long high/low values.
        Long highValue = mStatsObj.getLongHighValue();
        if (highValue != null) {
            dateStats.setHighValue(new Date(highValue));
        }
        Long lowValue = mStatsObj.getLongLowValue();
        if (lowValue != null) {
            dateStats.setLowValue(new Date(lowValue));
        }
        dateStats.setNumDVs(mStatsObj.getNumDVs());
        dateStats.setBitVectors((mStatsObj.getBitVector() == null || !enableBitVector) ? null : mStatsObj.getBitVector());
        colStatsData.setDateStats(dateStats);
    } else if (colType.equals("timestamp")) {
        TimestampColumnStatsDataInspector timestampStats = new TimestampColumnStatsDataInspector();
        timestampStats.setNumNulls(mStatsObj.getNumNulls());
        // Timestamp bounds are likewise read from the record's long high/low values.
        Long highValue = mStatsObj.getLongHighValue();
        if (highValue != null) {
            timestampStats.setHighValue(new Timestamp(highValue));
        }
        Long lowValue = mStatsObj.getLongLowValue();
        if (lowValue != null) {
            timestampStats.setLowValue(new Timestamp(lowValue));
        }
        timestampStats.setNumDVs(mStatsObj.getNumDVs());
        timestampStats.setBitVectors((mStatsObj.getBitVector() == null || !enableBitVector) ? null : mStatsObj.getBitVector());
        colStatsData.setTimestampStats(timestampStats);
    }
    statsObj.setStatsData(colStatsData);
    return statsObj;
}
Also used : BooleanColumnStatsData(org.apache.hadoop.hive.metastore.api.BooleanColumnStatsData) DateColumnStatsDataInspector(org.apache.hadoop.hive.metastore.columnstats.cache.DateColumnStatsDataInspector) Timestamp(org.apache.hadoop.hive.metastore.api.Timestamp) BinaryColumnStatsData(org.apache.hadoop.hive.metastore.api.BinaryColumnStatsData) Date(org.apache.hadoop.hive.metastore.api.Date) ColumnStatisticsObj(org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj) DecimalColumnStatsDataInspector(org.apache.hadoop.hive.metastore.columnstats.cache.DecimalColumnStatsDataInspector) DoubleColumnStatsDataInspector(org.apache.hadoop.hive.metastore.columnstats.cache.DoubleColumnStatsDataInspector) LongColumnStatsDataInspector(org.apache.hadoop.hive.metastore.columnstats.cache.LongColumnStatsDataInspector) TimestampColumnStatsDataInspector(org.apache.hadoop.hive.metastore.columnstats.cache.TimestampColumnStatsDataInspector) StringColumnStatsDataInspector(org.apache.hadoop.hive.metastore.columnstats.cache.StringColumnStatsDataInspector) ColumnStatisticsData(org.apache.hadoop.hive.metastore.api.ColumnStatisticsData)

Example 3 with StringColumnStatsDataInspector

Use of org.apache.hadoop.hive.metastore.columnstats.cache.StringColumnStatsDataInspector in the Apache Hive project.

Class StatObjectConverter, method fillColumnStatisticsData.

// DB
/**
 * Fills {@code data} with the statistics variant that matches {@code colType}, using raw
 * aggregate values (passed as untyped objects and converted through
 * {@link MetastoreDirectSqlUtils}).
 *
 * <p>The column type is compared case-insensitively using {@link java.util.Locale#ROOT}.
 * If it matches none of the branches, {@code data} is left untouched.
 *
 * <p>For numeric, date, timestamp and decimal columns the number of distinct values (NDV)
 * is estimated between a lower bound ({@code dist}) and an upper bound ({@code sumDist}):
 * either from a density estimate {@code (high - low) / avg} clamped to those bounds, or by
 * interpolating between the bounds with {@code ndvTuner}.
 *
 * @param colType column type name; must not be null
 * @param data statistics union to populate
 * @param llow low value for integral, date and timestamp columns; may be null
 * @param lhigh high value for integral, date and timestamp columns; may be null
 * @param dlow low value for double/float columns; may be null
 * @param dhigh high value for double/float columns; may be null
 * @param declow low value for decimal columns, as {@link BigDecimal} or {@link String}
 * @param dechigh high value for decimal columns, as {@link BigDecimal} or {@link String}
 * @param nulls number of nulls
 * @param dist lower bound for the NDV estimate
 * @param avglen average column length (string and binary columns)
 * @param maxlen maximum column length (string and binary columns)
 * @param trues number of true values (boolean columns)
 * @param falses number of false values (boolean columns)
 * @param avgLong density divisor for integral, date and timestamp columns; may be null
 * @param avgDouble density divisor for double/float columns; may be null
 * @param avgDecimal density divisor for decimal columns; may be null
 * @param sumDist upper bound for the NDV estimate
 * @param useDensityFunctionForNDVEstimation whether to try the density estimate first
 * @param ndvTuner interpolation factor applied between the lower and upper NDV bounds
 * @throws MetaException declared by the signature; presumably raised by the
 *         {@link MetastoreDirectSqlUtils} extractors on unexpected value types -- confirm
 */
public static void fillColumnStatisticsData(String colType, ColumnStatisticsData data, Object llow, Object lhigh, Object dlow, Object dhigh, Object declow, Object dechigh, Object nulls, Object dist, Object avglen, Object maxlen, Object trues, Object falses, Object avgLong, Object avgDouble, Object avgDecimal, Object sumDist, boolean useDensityFunctionForNDVEstimation, double ndvTuner) throws MetaException {
    // Locale.ROOT keeps the type comparison independent of the JVM default locale.
    colType = colType.toLowerCase(java.util.Locale.ROOT);
    if (colType.equals("boolean")) {
        BooleanColumnStatsData boolStats = new BooleanColumnStatsData();
        boolStats.setNumFalses(MetastoreDirectSqlUtils.extractSqlLong(falses));
        boolStats.setNumTrues(MetastoreDirectSqlUtils.extractSqlLong(trues));
        boolStats.setNumNulls(MetastoreDirectSqlUtils.extractSqlLong(nulls));
        data.setBooleanStats(boolStats);
    } else if (colType.equals("string") || colType.startsWith("varchar") || colType.startsWith("char")) {
        StringColumnStatsDataInspector stringStats = new StringColumnStatsDataInspector();
        stringStats.setNumNulls(MetastoreDirectSqlUtils.extractSqlLong(nulls));
        stringStats.setAvgColLen(MetastoreDirectSqlUtils.extractSqlDouble(avglen));
        stringStats.setMaxColLen(MetastoreDirectSqlUtils.extractSqlLong(maxlen));
        stringStats.setNumDVs(MetastoreDirectSqlUtils.extractSqlLong(dist));
        data.setStringStats(stringStats);
    } else if (colType.equals("binary")) {
        BinaryColumnStatsData binaryStats = new BinaryColumnStatsData();
        binaryStats.setNumNulls(MetastoreDirectSqlUtils.extractSqlLong(nulls));
        binaryStats.setAvgColLen(MetastoreDirectSqlUtils.extractSqlDouble(avglen));
        binaryStats.setMaxColLen(MetastoreDirectSqlUtils.extractSqlLong(maxlen));
        data.setBinaryStats(binaryStats);
    } else if (colType.equals("bigint") || colType.equals("int") || colType.equals("smallint") || colType.equals("tinyint")) {
        LongColumnStatsDataInspector longStats = new LongColumnStatsDataInspector();
        longStats.setNumNulls(MetastoreDirectSqlUtils.extractSqlLong(nulls));
        if (lhigh != null) {
            longStats.setHighValue(MetastoreDirectSqlUtils.extractSqlLong(lhigh));
        }
        if (llow != null) {
            longStats.setLowValue(MetastoreDirectSqlUtils.extractSqlLong(llow));
        }
        longStats.setNumDVs(estimateIntegralNdv(llow, lhigh, dist, sumDist, avgLong, useDensityFunctionForNDVEstimation, ndvTuner));
        data.setLongStats(longStats);
    } else if (colType.equals("date")) {
        DateColumnStatsDataInspector dateStats = new DateColumnStatsDataInspector();
        dateStats.setNumNulls(MetastoreDirectSqlUtils.extractSqlLong(nulls));
        if (lhigh != null) {
            dateStats.setHighValue(new Date(MetastoreDirectSqlUtils.extractSqlLong(lhigh)));
        }
        if (llow != null) {
            dateStats.setLowValue(new Date(MetastoreDirectSqlUtils.extractSqlLong(llow)));
        }
        dateStats.setNumDVs(estimateIntegralNdv(llow, lhigh, dist, sumDist, avgLong, useDensityFunctionForNDVEstimation, ndvTuner));
        data.setDateStats(dateStats);
    } else if (colType.equals("timestamp")) {
        TimestampColumnStatsDataInspector timestampStats = new TimestampColumnStatsDataInspector();
        timestampStats.setNumNulls(MetastoreDirectSqlUtils.extractSqlLong(nulls));
        if (lhigh != null) {
            timestampStats.setHighValue(new Timestamp(MetastoreDirectSqlUtils.extractSqlLong(lhigh)));
        }
        if (llow != null) {
            timestampStats.setLowValue(new Timestamp(MetastoreDirectSqlUtils.extractSqlLong(llow)));
        }
        timestampStats.setNumDVs(estimateIntegralNdv(llow, lhigh, dist, sumDist, avgLong, useDensityFunctionForNDVEstimation, ndvTuner));
        data.setTimestampStats(timestampStats);
    } else if (colType.equals("double") || colType.equals("float")) {
        DoubleColumnStatsDataInspector doubleStats = new DoubleColumnStatsDataInspector();
        doubleStats.setNumNulls(MetastoreDirectSqlUtils.extractSqlLong(nulls));
        if (dhigh != null) {
            doubleStats.setHighValue(MetastoreDirectSqlUtils.extractSqlDouble(dhigh));
        }
        if (dlow != null) {
            doubleStats.setLowValue(MetastoreDirectSqlUtils.extractSqlDouble(dlow));
        }
        long lowerBound = MetastoreDirectSqlUtils.extractSqlLong(dist);
        long higherBound = MetastoreDirectSqlUtils.extractSqlLong(sumDist);
        if (useDensityFunctionForNDVEstimation && dhigh != null && dlow != null && avgDouble != null && MetastoreDirectSqlUtils.extractSqlDouble(avgDouble) != 0.0) {
            // NOTE(review): the range is taken from dhigh/dlow converted via extractSqlLong,
            // not extractSqlDouble -- confirm whether dropping the fractional part is intended.
            long estimation = MetastoreDirectSqlUtils.extractSqlLong((MetastoreDirectSqlUtils.extractSqlLong(dhigh) - MetastoreDirectSqlUtils.extractSqlLong(dlow)) / MetastoreDirectSqlUtils.extractSqlDouble(avgDouble));
            if (estimation < lowerBound) {
                doubleStats.setNumDVs(lowerBound);
            } else if (estimation > higherBound) {
                doubleStats.setNumDVs(higherBound);
            } else {
                doubleStats.setNumDVs(estimation);
            }
        } else {
            doubleStats.setNumDVs((long) (lowerBound + (higherBound - lowerBound) * ndvTuner));
        }
        data.setDoubleStats(doubleStats);
    } else if (colType.startsWith("decimal")) {
        DecimalColumnStatsDataInspector decimalStats = new DecimalColumnStatsDataInspector();
        decimalStats.setNumNulls(MetastoreDirectSqlUtils.extractSqlLong(nulls));
        Decimal low = null;
        Decimal high = null;
        BigDecimal blow = null;
        BigDecimal bhigh = null;
        if (dechigh instanceof BigDecimal) {
            bhigh = (BigDecimal) dechigh;
            high = DecimalUtils.getDecimal(ByteBuffer.wrap(bhigh.unscaledValue().toByteArray()), (short) bhigh.scale());
        } else if (dechigh instanceof String) {
            bhigh = new BigDecimal((String) dechigh);
            high = DecimalUtils.createThriftDecimal((String) dechigh);
        }
        decimalStats.setHighValue(high);
        if (declow instanceof BigDecimal) {
            blow = (BigDecimal) declow;
            low = DecimalUtils.getDecimal(ByteBuffer.wrap(blow.unscaledValue().toByteArray()), (short) blow.scale());
        } else if (declow instanceof String) {
            // Must test declow here: testing dechigh would drop a String low value whenever
            // the high value is not a String, and would dereference a null declow otherwise.
            blow = new BigDecimal((String) declow);
            low = DecimalUtils.createThriftDecimal((String) declow);
        }
        decimalStats.setLowValue(low);
        long lowerBound = MetastoreDirectSqlUtils.extractSqlLong(dist);
        long higherBound = MetastoreDirectSqlUtils.extractSqlLong(sumDist);
        // Guard on the parsed values, which are what the density estimate dereferences.
        if (useDensityFunctionForNDVEstimation && bhigh != null && blow != null && avgDecimal != null && MetastoreDirectSqlUtils.extractSqlDouble(avgDecimal) != 0.0) {
            long estimation = MetastoreDirectSqlUtils.extractSqlLong(MetastoreDirectSqlUtils.extractSqlLong(bhigh.subtract(blow).floatValue() / MetastoreDirectSqlUtils.extractSqlDouble(avgDecimal)));
            if (estimation < lowerBound) {
                decimalStats.setNumDVs(lowerBound);
            } else if (estimation > higherBound) {
                decimalStats.setNumDVs(higherBound);
            } else {
                decimalStats.setNumDVs(estimation);
            }
        } else {
            decimalStats.setNumDVs((long) (lowerBound + (higherBound - lowerBound) * ndvTuner));
        }
        data.setDecimalStats(decimalStats);
    }
}

/**
 * Estimates the number of distinct values for columns whose bounds are long values
 * (integral, date and timestamp columns).
 *
 * <p>When the density function is enabled and both bounds and a non-zero {@code avgLong}
 * are available, the estimate is {@code (lhigh - llow) / avgLong} clamped to
 * {@code [dist, sumDist]}; otherwise it is interpolated between {@code dist} and
 * {@code sumDist} using {@code ndvTuner}. In either case the result is capped at
 * {@code lhigh - llow + 1} when both bounds are present.
 */
private static long estimateIntegralNdv(Object llow, Object lhigh, Object dist, Object sumDist, Object avgLong, boolean useDensityFunctionForNDVEstimation, double ndvTuner) throws MetaException {
    long lowerBound = MetastoreDirectSqlUtils.extractSqlLong(dist);
    long higherBound = MetastoreDirectSqlUtils.extractSqlLong(sumDist);
    long rangeBound = Long.MAX_VALUE;
    if (lhigh != null && llow != null) {
        rangeBound = MetastoreDirectSqlUtils.extractSqlLong(lhigh) - MetastoreDirectSqlUtils.extractSqlLong(llow) + 1;
    }
    long estimation;
    if (useDensityFunctionForNDVEstimation && lhigh != null && llow != null && avgLong != null && MetastoreDirectSqlUtils.extractSqlDouble(avgLong) != 0.0) {
        // We have estimation, lowerbound and higherbound. We use estimation if
        // it is between lowerbound and higherbound.
        estimation = MetastoreDirectSqlUtils.extractSqlLong((MetastoreDirectSqlUtils.extractSqlLong(lhigh) - MetastoreDirectSqlUtils.extractSqlLong(llow)) / MetastoreDirectSqlUtils.extractSqlDouble(avgLong));
        if (estimation < lowerBound) {
            estimation = lowerBound;
        } else if (estimation > higherBound) {
            estimation = higherBound;
        }
    } else {
        estimation = (long) (lowerBound + (higherBound - lowerBound) * ndvTuner);
    }
    return Math.min(estimation, rangeBound);
}
Also used : BooleanColumnStatsData(org.apache.hadoop.hive.metastore.api.BooleanColumnStatsData) DateColumnStatsDataInspector(org.apache.hadoop.hive.metastore.columnstats.cache.DateColumnStatsDataInspector) Timestamp(org.apache.hadoop.hive.metastore.api.Timestamp) BinaryColumnStatsData(org.apache.hadoop.hive.metastore.api.BinaryColumnStatsData) Date(org.apache.hadoop.hive.metastore.api.Date) BigDecimal(java.math.BigDecimal) DecimalColumnStatsDataInspector(org.apache.hadoop.hive.metastore.columnstats.cache.DecimalColumnStatsDataInspector) BigDecimal(java.math.BigDecimal) Decimal(org.apache.hadoop.hive.metastore.api.Decimal) DoubleColumnStatsDataInspector(org.apache.hadoop.hive.metastore.columnstats.cache.DoubleColumnStatsDataInspector) LongColumnStatsDataInspector(org.apache.hadoop.hive.metastore.columnstats.cache.LongColumnStatsDataInspector) TimestampColumnStatsDataInspector(org.apache.hadoop.hive.metastore.columnstats.cache.TimestampColumnStatsDataInspector) StringColumnStatsDataInspector(org.apache.hadoop.hive.metastore.columnstats.cache.StringColumnStatsDataInspector)

Example 4 with StringColumnStatsDataInspector

Use of org.apache.hadoop.hive.metastore.columnstats.cache.StringColumnStatsDataInspector in the Apache Hive project.

From the class StatObjectConverter, method getPartitionColumnStatisticsObj.

/**
 * Converts a stored partition column statistics record into a
 * {@link ColumnStatisticsObj}.
 *
 * <p>The column name and type are copied as-is. The lower-cased column type
 * then selects which statistics variant is populated: boolean, string
 * (including {@code varchar*} / {@code char*}), binary, integral
 * ({@code tinyint}, {@code smallint}, {@code int}, {@code bigint}),
 * floating point ({@code double}, {@code float}), {@code decimal*},
 * {@code date} or {@code timestamp}. For any other type no variant is set and
 * the returned object carries an otherwise untouched
 * {@link ColumnStatisticsData}.
 *
 * <p>High/low values are only copied when the stored value is non-null.
 *
 * @param mStatsObj       the stored statistics record to convert
 * @param enableBitVector when {@code false}, bit vectors are set to
 *                        {@code null} on the result even if the record has one
 * @return a new statistics object mirroring {@code mStatsObj}
 */
public static ColumnStatisticsObj getPartitionColumnStatisticsObj(MPartitionColumnStatistics mStatsObj, boolean enableBitVector) {
    ColumnStatisticsObj result = new ColumnStatisticsObj();
    result.setColType(mStatsObj.getColType());
    result.setColName(mStatsObj.getColName());
    // All type comparisons below are done against the lower-cased type name.
    String typeName = mStatsObj.getColType().toLowerCase();
    ColumnStatisticsData payload = new ColumnStatisticsData();
    if ("boolean".equals(typeName)) {
        BooleanColumnStatsData bools = new BooleanColumnStatsData();
        bools.setNumFalses(mStatsObj.getNumFalses());
        bools.setNumTrues(mStatsObj.getNumTrues());
        bools.setNumNulls(mStatsObj.getNumNulls());
        payload.setBooleanStats(bools);
    } else if ("string".equals(typeName) || typeName.startsWith("varchar") || typeName.startsWith("char")) {
        StringColumnStatsDataInspector strings = new StringColumnStatsDataInspector();
        strings.setNumNulls(mStatsObj.getNumNulls());
        strings.setAvgColLen(mStatsObj.getAvgColLen());
        strings.setMaxColLen(mStatsObj.getMaxColLen());
        strings.setNumDVs(mStatsObj.getNumDVs());
        // Bit vector is forwarded only when present and enabled; otherwise null.
        strings.setBitVectors((mStatsObj.getBitVector() != null && enableBitVector) ? mStatsObj.getBitVector() : null);
        payload.setStringStats(strings);
    } else if ("binary".equals(typeName)) {
        BinaryColumnStatsData binaries = new BinaryColumnStatsData();
        binaries.setNumNulls(mStatsObj.getNumNulls());
        binaries.setAvgColLen(mStatsObj.getAvgColLen());
        binaries.setMaxColLen(mStatsObj.getMaxColLen());
        payload.setBinaryStats(binaries);
    } else if ("tinyint".equals(typeName) || "smallint".equals(typeName) || "int".equals(typeName) || "bigint".equals(typeName)) {
        LongColumnStatsDataInspector integrals = new LongColumnStatsDataInspector();
        integrals.setNumNulls(mStatsObj.getNumNulls());
        if (mStatsObj.getLongHighValue() != null) {
            integrals.setHighValue(mStatsObj.getLongHighValue());
        }
        if (mStatsObj.getLongLowValue() != null) {
            integrals.setLowValue(mStatsObj.getLongLowValue());
        }
        integrals.setNumDVs(mStatsObj.getNumDVs());
        integrals.setBitVectors((mStatsObj.getBitVector() != null && enableBitVector) ? mStatsObj.getBitVector() : null);
        payload.setLongStats(integrals);
    } else if ("double".equals(typeName) || "float".equals(typeName)) {
        DoubleColumnStatsDataInspector floats = new DoubleColumnStatsDataInspector();
        floats.setNumNulls(mStatsObj.getNumNulls());
        if (mStatsObj.getDoubleHighValue() != null) {
            floats.setHighValue(mStatsObj.getDoubleHighValue());
        }
        if (mStatsObj.getDoubleLowValue() != null) {
            floats.setLowValue(mStatsObj.getDoubleLowValue());
        }
        floats.setNumDVs(mStatsObj.getNumDVs());
        floats.setBitVectors((mStatsObj.getBitVector() != null && enableBitVector) ? mStatsObj.getBitVector() : null);
        payload.setDoubleStats(floats);
    } else if (typeName.startsWith("decimal")) {
        DecimalColumnStatsDataInspector decimals = new DecimalColumnStatsDataInspector();
        decimals.setNumNulls(mStatsObj.getNumNulls());
        // Stored decimal bounds are converted through DecimalUtils.createThriftDecimal.
        if (mStatsObj.getDecimalHighValue() != null) {
            decimals.setHighValue(DecimalUtils.createThriftDecimal(mStatsObj.getDecimalHighValue()));
        }
        if (mStatsObj.getDecimalLowValue() != null) {
            decimals.setLowValue(DecimalUtils.createThriftDecimal(mStatsObj.getDecimalLowValue()));
        }
        decimals.setNumDVs(mStatsObj.getNumDVs());
        decimals.setBitVectors((mStatsObj.getBitVector() != null && enableBitVector) ? mStatsObj.getBitVector() : null);
        payload.setDecimalStats(decimals);
    } else if ("date".equals(typeName)) {
        DateColumnStatsDataInspector dates = new DateColumnStatsDataInspector();
        dates.setNumNulls(mStatsObj.getNumNulls());
        // Date bounds are read from the record's long high/low fields.
        Long upper = mStatsObj.getLongHighValue();
        if (upper != null) {
            dates.setHighValue(new Date(upper));
        }
        Long lower = mStatsObj.getLongLowValue();
        if (lower != null) {
            dates.setLowValue(new Date(lower));
        }
        dates.setNumDVs(mStatsObj.getNumDVs());
        dates.setBitVectors((mStatsObj.getBitVector() != null && enableBitVector) ? mStatsObj.getBitVector() : null);
        payload.setDateStats(dates);
    } else if ("timestamp".equals(typeName)) {
        TimestampColumnStatsDataInspector timestamps = new TimestampColumnStatsDataInspector();
        timestamps.setNumNulls(mStatsObj.getNumNulls());
        // Timestamp bounds are read from the record's long high/low fields.
        Long upper = mStatsObj.getLongHighValue();
        if (upper != null) {
            timestamps.setHighValue(new Timestamp(upper));
        }
        Long lower = mStatsObj.getLongLowValue();
        if (lower != null) {
            timestamps.setLowValue(new Timestamp(lower));
        }
        timestamps.setNumDVs(mStatsObj.getNumDVs());
        timestamps.setBitVectors((mStatsObj.getBitVector() != null && enableBitVector) ? mStatsObj.getBitVector() : null);
        payload.setTimestampStats(timestamps);
    }
    result.setStatsData(payload);
    return result;
}
Also used : BooleanColumnStatsData(org.apache.hadoop.hive.metastore.api.BooleanColumnStatsData) DateColumnStatsDataInspector(org.apache.hadoop.hive.metastore.columnstats.cache.DateColumnStatsDataInspector) Timestamp(org.apache.hadoop.hive.metastore.api.Timestamp) BinaryColumnStatsData(org.apache.hadoop.hive.metastore.api.BinaryColumnStatsData) Date(org.apache.hadoop.hive.metastore.api.Date) ColumnStatisticsObj(org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj) DecimalColumnStatsDataInspector(org.apache.hadoop.hive.metastore.columnstats.cache.DecimalColumnStatsDataInspector) DoubleColumnStatsDataInspector(org.apache.hadoop.hive.metastore.columnstats.cache.DoubleColumnStatsDataInspector) LongColumnStatsDataInspector(org.apache.hadoop.hive.metastore.columnstats.cache.LongColumnStatsDataInspector) TimestampColumnStatsDataInspector(org.apache.hadoop.hive.metastore.columnstats.cache.TimestampColumnStatsDataInspector) StringColumnStatsDataInspector(org.apache.hadoop.hive.metastore.columnstats.cache.StringColumnStatsDataInspector) ColumnStatisticsData(org.apache.hadoop.hive.metastore.api.ColumnStatisticsData)

Example 5 with StringColumnStatsDataInspector

Use of org.apache.hadoop.hive.metastore.columnstats.cache.StringColumnStatsDataInspector in the Apache Hive project.

From the class StringColumnStatsMerger, method merge.

/**
 * Folds {@code newColStats} into {@code aggregateColStats} for a string column.
 *
 * <p>Max and average column length each become the larger of the two inputs
 * (the average is not re-weighted), and null counts are summed. The number of
 * distinct values is taken from the merged NDV estimators when both sides have
 * one and they can be merged; in every other case it is the larger of the two
 * existing NDV values. The updated string stats are written back onto
 * {@code aggregateColStats}.
 *
 * @param aggregateColStats running aggregate; updated in place
 * @param newColStats       statistics to merge into the aggregate
 */
@Override
public void merge(ColumnStatisticsObj aggregateColStats, ColumnStatisticsObj newColStats) {
    LOG.debug("Merging statistics: [aggregateColStats:{}, newColStats: {}]", aggregateColStats, newColStats);
    StringColumnStatsDataInspector target = stringInspectorFromStats(aggregateColStats);
    StringColumnStatsDataInspector incoming = stringInspectorFromStats(newColStats);

    target.setMaxColLen(Math.max(target.getMaxColLen(), incoming.getMaxColLen()));
    target.setAvgColLen(Math.max(target.getAvgColLen(), incoming.getAvgColLen()));
    target.setNumNulls(target.getNumNulls() + incoming.getNumNulls());

    if (target.getNdvEstimator() != null && incoming.getNdvEstimator() != null) {
        NumDistinctValueEstimator targetEstimator = target.getNdvEstimator();
        NumDistinctValueEstimator incomingEstimator = incoming.getNdvEstimator();
        final long mergedNdv;
        if (!targetEstimator.canMerge(incomingEstimator)) {
            // Estimators cannot be merged: fall back to the larger NDV.
            mergedNdv = Math.max(target.getNumDVs(), incoming.getNumDVs());
        } else {
            targetEstimator.mergeEstimators(incomingEstimator);
            mergedNdv = targetEstimator.estimateNumDistinctValues();
            target.setNdvEstimator(targetEstimator);
        }
        // Logged before the update so the message shows the pre-merge aggregate NDV.
        LOG.debug("Use bitvector to merge column {}'s ndvs of {} and {} to be {}", aggregateColStats.getColName(), target.getNumDVs(), incoming.getNumDVs(), mergedNdv);
        target.setNumDVs(mergedNdv);
    } else {
        // At least one side has no estimator: keep the larger NDV.
        long largerNdv = Math.max(target.getNumDVs(), incoming.getNumDVs());
        target.setNumDVs(largerNdv);
    }
    aggregateColStats.getStatsData().setStringStats(target);
}
Also used : StringColumnStatsDataInspector(org.apache.hadoop.hive.metastore.columnstats.cache.StringColumnStatsDataInspector) NumDistinctValueEstimator(org.apache.hadoop.hive.common.ndv.NumDistinctValueEstimator)

Aggregations

StringColumnStatsDataInspector (org.apache.hadoop.hive.metastore.columnstats.cache.StringColumnStatsDataInspector): 13; BooleanColumnStatsData (org.apache.hadoop.hive.metastore.api.BooleanColumnStatsData): 10; ColumnStatisticsData (org.apache.hadoop.hive.metastore.api.ColumnStatisticsData): 10; LongColumnStatsDataInspector (org.apache.hadoop.hive.metastore.columnstats.cache.LongColumnStatsDataInspector): 10; BinaryColumnStatsData (org.apache.hadoop.hive.metastore.api.BinaryColumnStatsData): 9; DateColumnStatsDataInspector (org.apache.hadoop.hive.metastore.columnstats.cache.DateColumnStatsDataInspector): 9; DecimalColumnStatsDataInspector (org.apache.hadoop.hive.metastore.columnstats.cache.DecimalColumnStatsDataInspector): 9; DoubleColumnStatsDataInspector (org.apache.hadoop.hive.metastore.columnstats.cache.DoubleColumnStatsDataInspector): 9; TimestampColumnStatsDataInspector (org.apache.hadoop.hive.metastore.columnstats.cache.TimestampColumnStatsDataInspector): 8; ColumnStatisticsObj (org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj): 7; Date (org.apache.hadoop.hive.metastore.api.Date): 4; Timestamp (org.apache.hadoop.hive.metastore.api.Timestamp): 4; BigDecimal (java.math.BigDecimal): 2; HashMap (java.util.HashMap): 2; Map (java.util.Map): 2; NumDistinctValueEstimator (org.apache.hadoop.hive.common.ndv.NumDistinctValueEstimator): 2; ColumnStatistics (org.apache.hadoop.hive.metastore.api.ColumnStatistics): 2; ColumnStatisticsDesc (org.apache.hadoop.hive.metastore.api.ColumnStatisticsDesc): 2; PrimitiveObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector): 2; StringObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector): 2