Search in sources :

Example 6 with DateColumnStatsDataInspector

use of org.apache.hadoop.hive.metastore.columnstats.cache.DateColumnStatsDataInspector in project hive by apache.

the class ColumnStatsMergerFactory method newColumnStaticsObj.

public static ColumnStatisticsObj newColumnStaticsObj(final String colName, final String colType, final _Fields type) {
    final ColumnStatisticsObj cso = new ColumnStatisticsObj();
    final ColumnStatisticsData csd = new ColumnStatisticsData();
    Objects.requireNonNull(colName, "Column name cannot be null");
    Objects.requireNonNull(colType, "Column type cannot be null");
    Objects.requireNonNull(type, "Field type cannot be null");
    switch(type) {
        case BOOLEAN_STATS:
            csd.setBooleanStats(new BooleanColumnStatsData());
            break;
        case LONG_STATS:
            csd.setLongStats(new LongColumnStatsDataInspector());
            break;
        case DOUBLE_STATS:
            csd.setDoubleStats(new DoubleColumnStatsDataInspector());
            break;
        case STRING_STATS:
            csd.setStringStats(new StringColumnStatsDataInspector());
            break;
        case BINARY_STATS:
            csd.setBinaryStats(new BinaryColumnStatsData());
            break;
        case DECIMAL_STATS:
            csd.setDecimalStats(new DecimalColumnStatsDataInspector());
            break;
        case DATE_STATS:
            csd.setDateStats(new DateColumnStatsDataInspector());
            break;
        case TIMESTAMP_STATS:
            csd.setTimestampStats(new TimestampColumnStatsDataInspector());
            break;
        default:
            throw new IllegalArgumentException("Unknown stats type: " + type);
    }
    cso.setColName(colName);
    cso.setColType(colType);
    cso.setStatsData(csd);
    return cso;
}
Also used : BooleanColumnStatsData(org.apache.hadoop.hive.metastore.api.BooleanColumnStatsData) ColumnStatisticsObj(org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj) DecimalColumnStatsDataInspector(org.apache.hadoop.hive.metastore.columnstats.cache.DecimalColumnStatsDataInspector) DoubleColumnStatsDataInspector(org.apache.hadoop.hive.metastore.columnstats.cache.DoubleColumnStatsDataInspector) LongColumnStatsDataInspector(org.apache.hadoop.hive.metastore.columnstats.cache.LongColumnStatsDataInspector) DateColumnStatsDataInspector(org.apache.hadoop.hive.metastore.columnstats.cache.DateColumnStatsDataInspector) TimestampColumnStatsDataInspector(org.apache.hadoop.hive.metastore.columnstats.cache.TimestampColumnStatsDataInspector) StringColumnStatsDataInspector(org.apache.hadoop.hive.metastore.columnstats.cache.StringColumnStatsDataInspector) ColumnStatisticsData(org.apache.hadoop.hive.metastore.api.ColumnStatisticsData) BinaryColumnStatsData(org.apache.hadoop.hive.metastore.api.BinaryColumnStatsData)

Example 7 with DateColumnStatsDataInspector

use of org.apache.hadoop.hive.metastore.columnstats.cache.DateColumnStatsDataInspector in project hive by apache.

the class ColumnStatisticsObjTranslator method unpackPrimitiveObject.

private static void unpackPrimitiveObject(ObjectInspector oi, Object o, String fieldName, ColumnStatisticsObj statsObj) throws UnsupportedDoubleException {
    if (o == null) {
        return;
    }
    // First infer the type of object
    if (fieldName.equals("columntype")) {
        PrimitiveObjectInspector poi = (PrimitiveObjectInspector) oi;
        String s = ((StringObjectInspector) poi).getPrimitiveJavaObject(o);
        ColumnStatisticsData statsData = new ColumnStatisticsData();
        if (s.equalsIgnoreCase("long")) {
            LongColumnStatsDataInspector longStats = new LongColumnStatsDataInspector();
            statsData.setLongStats(longStats);
            statsObj.setStatsData(statsData);
        } else if (s.equalsIgnoreCase("double")) {
            DoubleColumnStatsDataInspector doubleStats = new DoubleColumnStatsDataInspector();
            statsData.setDoubleStats(doubleStats);
            statsObj.setStatsData(statsData);
        } else if (s.equalsIgnoreCase("string")) {
            StringColumnStatsDataInspector stringStats = new StringColumnStatsDataInspector();
            statsData.setStringStats(stringStats);
            statsObj.setStatsData(statsData);
        } else if (s.equalsIgnoreCase("boolean")) {
            BooleanColumnStatsData booleanStats = new BooleanColumnStatsData();
            statsData.setBooleanStats(booleanStats);
            statsObj.setStatsData(statsData);
        } else if (s.equalsIgnoreCase("binary")) {
            BinaryColumnStatsData binaryStats = new BinaryColumnStatsData();
            statsData.setBinaryStats(binaryStats);
            statsObj.setStatsData(statsData);
        } else if (s.equalsIgnoreCase("decimal")) {
            DecimalColumnStatsDataInspector decimalStats = new DecimalColumnStatsDataInspector();
            statsData.setDecimalStats(decimalStats);
            statsObj.setStatsData(statsData);
        } else if (s.equalsIgnoreCase("date")) {
            DateColumnStatsDataInspector dateStats = new DateColumnStatsDataInspector();
            statsData.setDateStats(dateStats);
            statsObj.setStatsData(statsData);
        }
    } else {
        // invoke the right unpack method depending on data type of the column
        if (statsObj.getStatsData().isSetBooleanStats()) {
            unpackBooleanStats(oi, o, fieldName, statsObj);
        } else if (statsObj.getStatsData().isSetLongStats()) {
            unpackLongStats(oi, o, fieldName, statsObj);
        } else if (statsObj.getStatsData().isSetDoubleStats()) {
            unpackDoubleStats(oi, o, fieldName, statsObj);
        } else if (statsObj.getStatsData().isSetStringStats()) {
            unpackStringStats(oi, o, fieldName, statsObj);
        } else if (statsObj.getStatsData().isSetBinaryStats()) {
            unpackBinaryStats(oi, o, fieldName, statsObj);
        } else if (statsObj.getStatsData().isSetDecimalStats()) {
            unpackDecimalStats(oi, o, fieldName, statsObj);
        } else if (statsObj.getStatsData().isSetDateStats()) {
            unpackDateStats(oi, o, fieldName, statsObj);
        }
    }
}
Also used : BooleanColumnStatsData(org.apache.hadoop.hive.metastore.api.BooleanColumnStatsData) DecimalColumnStatsDataInspector(org.apache.hadoop.hive.metastore.columnstats.cache.DecimalColumnStatsDataInspector) DoubleColumnStatsDataInspector(org.apache.hadoop.hive.metastore.columnstats.cache.DoubleColumnStatsDataInspector) LongColumnStatsDataInspector(org.apache.hadoop.hive.metastore.columnstats.cache.LongColumnStatsDataInspector) PrimitiveObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector) DateColumnStatsDataInspector(org.apache.hadoop.hive.metastore.columnstats.cache.DateColumnStatsDataInspector) StringObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector) StringColumnStatsDataInspector(org.apache.hadoop.hive.metastore.columnstats.cache.StringColumnStatsDataInspector) ColumnStatisticsData(org.apache.hadoop.hive.metastore.api.ColumnStatisticsData) BinaryColumnStatsData(org.apache.hadoop.hive.metastore.api.BinaryColumnStatsData)

Example 8 with DateColumnStatsDataInspector

use of org.apache.hadoop.hive.metastore.columnstats.cache.DateColumnStatsDataInspector in project hive by apache.

the class ColumnStatisticsObjTranslator method unpackPrimitiveObject.

private static void unpackPrimitiveObject(ObjectInspector oi, Object o, ColumnStatsField csf, ColumnStatisticsObj statsObj) throws UnsupportedDoubleException {
    if (o == null) {
        return;
    }
    // First infer the type of object
    if (csf == ColumnStatsField.COLUMN_STATS_TYPE) {
        PrimitiveObjectInspector poi = (PrimitiveObjectInspector) oi;
        String s = ((StringObjectInspector) poi).getPrimitiveJavaObject(o);
        ColumnStatisticsData statsData = new ColumnStatisticsData();
        if (s.equalsIgnoreCase(ColumnStatsType.LONG.toString())) {
            LongColumnStatsDataInspector longStats = new LongColumnStatsDataInspector();
            statsData.setLongStats(longStats);
            statsObj.setStatsData(statsData);
        } else if (s.equalsIgnoreCase(ColumnStatsType.DOUBLE.toString())) {
            DoubleColumnStatsDataInspector doubleStats = new DoubleColumnStatsDataInspector();
            statsData.setDoubleStats(doubleStats);
            statsObj.setStatsData(statsData);
        } else if (s.equalsIgnoreCase(ColumnStatsType.STRING.toString())) {
            StringColumnStatsDataInspector stringStats = new StringColumnStatsDataInspector();
            statsData.setStringStats(stringStats);
            statsObj.setStatsData(statsData);
        } else if (s.equalsIgnoreCase(ColumnStatsType.BOOLEAN.toString())) {
            BooleanColumnStatsData booleanStats = new BooleanColumnStatsData();
            statsData.setBooleanStats(booleanStats);
            statsObj.setStatsData(statsData);
        } else if (s.equalsIgnoreCase(ColumnStatsType.BINARY.toString())) {
            BinaryColumnStatsData binaryStats = new BinaryColumnStatsData();
            statsData.setBinaryStats(binaryStats);
            statsObj.setStatsData(statsData);
        } else if (s.equalsIgnoreCase(ColumnStatsType.DECIMAL.toString())) {
            DecimalColumnStatsDataInspector decimalStats = new DecimalColumnStatsDataInspector();
            statsData.setDecimalStats(decimalStats);
            statsObj.setStatsData(statsData);
        } else if (s.equalsIgnoreCase(ColumnStatsType.DATE.toString())) {
            DateColumnStatsDataInspector dateStats = new DateColumnStatsDataInspector();
            statsData.setDateStats(dateStats);
            statsObj.setStatsData(statsData);
        } else if (s.equalsIgnoreCase(ColumnStatsType.TIMESTAMP.toString())) {
            TimestampColumnStatsDataInspector timestampStats = new TimestampColumnStatsDataInspector();
            statsData.setTimestampStats(timestampStats);
            statsObj.setStatsData(statsData);
        }
    } else {
        // invoke the right unpack method depending on data type of the column
        if (statsObj.getStatsData().isSetBooleanStats()) {
            unpackBooleanStats(oi, o, csf, statsObj);
        } else if (statsObj.getStatsData().isSetLongStats()) {
            unpackLongStats(oi, o, csf, statsObj);
        } else if (statsObj.getStatsData().isSetDoubleStats()) {
            unpackDoubleStats(oi, o, csf, statsObj);
        } else if (statsObj.getStatsData().isSetStringStats()) {
            unpackStringStats(oi, o, csf, statsObj);
        } else if (statsObj.getStatsData().isSetBinaryStats()) {
            unpackBinaryStats(oi, o, csf, statsObj);
        } else if (statsObj.getStatsData().isSetDecimalStats()) {
            unpackDecimalStats(oi, o, csf, statsObj);
        } else if (statsObj.getStatsData().isSetDateStats()) {
            unpackDateStats(oi, o, csf, statsObj);
        } else if (statsObj.getStatsData().isSetTimestampStats()) {
            unpackTimestampStats(oi, o, csf, statsObj);
        }
    }
}
Also used : BooleanColumnStatsData(org.apache.hadoop.hive.metastore.api.BooleanColumnStatsData) DecimalColumnStatsDataInspector(org.apache.hadoop.hive.metastore.columnstats.cache.DecimalColumnStatsDataInspector) DoubleColumnStatsDataInspector(org.apache.hadoop.hive.metastore.columnstats.cache.DoubleColumnStatsDataInspector) LongColumnStatsDataInspector(org.apache.hadoop.hive.metastore.columnstats.cache.LongColumnStatsDataInspector) PrimitiveObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector) DateColumnStatsDataInspector(org.apache.hadoop.hive.metastore.columnstats.cache.DateColumnStatsDataInspector) TimestampColumnStatsDataInspector(org.apache.hadoop.hive.metastore.columnstats.cache.TimestampColumnStatsDataInspector) StringObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector) StringColumnStatsDataInspector(org.apache.hadoop.hive.metastore.columnstats.cache.StringColumnStatsDataInspector) ColumnStatisticsData(org.apache.hadoop.hive.metastore.api.ColumnStatisticsData) BinaryColumnStatsData(org.apache.hadoop.hive.metastore.api.BinaryColumnStatsData)

Example 9 with DateColumnStatsDataInspector

use of org.apache.hadoop.hive.metastore.columnstats.cache.DateColumnStatsDataInspector in project hive by apache.

the class ColumnStatsAggregatorFactory method newColumnStaticsObj.

public static ColumnStatisticsObj newColumnStaticsObj(String colName, String colType, _Fields type) {
    ColumnStatisticsObj cso = new ColumnStatisticsObj();
    ColumnStatisticsData csd = new ColumnStatisticsData();
    cso.setColName(colName);
    cso.setColType(colType);
    switch(type) {
        case BOOLEAN_STATS:
            csd.setBooleanStats(new BooleanColumnStatsData());
            break;
        case LONG_STATS:
            csd.setLongStats(new LongColumnStatsDataInspector());
            break;
        case DATE_STATS:
            csd.setDateStats(new DateColumnStatsDataInspector());
            break;
        case TIMESTAMP_STATS:
            csd.setTimestampStats(new TimestampColumnStatsDataInspector());
            break;
        case DOUBLE_STATS:
            csd.setDoubleStats(new DoubleColumnStatsDataInspector());
            break;
        case STRING_STATS:
            csd.setStringStats(new StringColumnStatsDataInspector());
            break;
        case BINARY_STATS:
            csd.setBinaryStats(new BinaryColumnStatsData());
            break;
        case DECIMAL_STATS:
            csd.setDecimalStats(new DecimalColumnStatsDataInspector());
            break;
        default:
            throw new RuntimeException("Woh, bad.  Unknown stats type!");
    }
    cso.setStatsData(csd);
    return cso;
}
Also used : BooleanColumnStatsData(org.apache.hadoop.hive.metastore.api.BooleanColumnStatsData) ColumnStatisticsObj(org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj) DecimalColumnStatsDataInspector(org.apache.hadoop.hive.metastore.columnstats.cache.DecimalColumnStatsDataInspector) DoubleColumnStatsDataInspector(org.apache.hadoop.hive.metastore.columnstats.cache.DoubleColumnStatsDataInspector) LongColumnStatsDataInspector(org.apache.hadoop.hive.metastore.columnstats.cache.LongColumnStatsDataInspector) DateColumnStatsDataInspector(org.apache.hadoop.hive.metastore.columnstats.cache.DateColumnStatsDataInspector) TimestampColumnStatsDataInspector(org.apache.hadoop.hive.metastore.columnstats.cache.TimestampColumnStatsDataInspector) StringColumnStatsDataInspector(org.apache.hadoop.hive.metastore.columnstats.cache.StringColumnStatsDataInspector) ColumnStatisticsData(org.apache.hadoop.hive.metastore.api.ColumnStatisticsData) BinaryColumnStatsData(org.apache.hadoop.hive.metastore.api.BinaryColumnStatsData)

Example 10 with DateColumnStatsDataInspector

use of org.apache.hadoop.hive.metastore.columnstats.cache.DateColumnStatsDataInspector in project hive by apache.

the class DateColumnStatsAggregator method extrapolate.

@Override
public void extrapolate(ColumnStatisticsData extrapolateData, int numParts, int numPartsWithStats, Map<String, Double> adjustedIndexMap, Map<String, ColumnStatisticsData> adjustedStatsMap, double densityAvg) {
    int rightBorderInd = numParts;
    DateColumnStatsDataInspector extrapolateDateData = new DateColumnStatsDataInspector();
    Map<String, DateColumnStatsData> extractedAdjustedStatsMap = new HashMap<>();
    for (Map.Entry<String, ColumnStatisticsData> entry : adjustedStatsMap.entrySet()) {
        extractedAdjustedStatsMap.put(entry.getKey(), entry.getValue().getDateStats());
    }
    List<Map.Entry<String, DateColumnStatsData>> list = new LinkedList<>(extractedAdjustedStatsMap.entrySet());
    // get the lowValue
    Collections.sort(list, new Comparator<Map.Entry<String, DateColumnStatsData>>() {

        @Override
        public int compare(Map.Entry<String, DateColumnStatsData> o1, Map.Entry<String, DateColumnStatsData> o2) {
            return o1.getValue().getLowValue().compareTo(o2.getValue().getLowValue());
        }
    });
    double minInd = adjustedIndexMap.get(list.get(0).getKey());
    double maxInd = adjustedIndexMap.get(list.get(list.size() - 1).getKey());
    long lowValue = 0;
    long min = list.get(0).getValue().getLowValue().getDaysSinceEpoch();
    long max = list.get(list.size() - 1).getValue().getLowValue().getDaysSinceEpoch();
    if (minInd == maxInd) {
        lowValue = min;
    } else if (minInd < maxInd) {
        // left border is the min
        lowValue = (long) (max - (max - min) * maxInd / (maxInd - minInd));
    } else {
        // right border is the min
        lowValue = (long) (max - (max - min) * (rightBorderInd - maxInd) / (minInd - maxInd));
    }
    // get the highValue
    Collections.sort(list, new Comparator<Map.Entry<String, DateColumnStatsData>>() {

        @Override
        public int compare(Map.Entry<String, DateColumnStatsData> o1, Map.Entry<String, DateColumnStatsData> o2) {
            return o1.getValue().getHighValue().compareTo(o2.getValue().getHighValue());
        }
    });
    minInd = adjustedIndexMap.get(list.get(0).getKey());
    maxInd = adjustedIndexMap.get(list.get(list.size() - 1).getKey());
    long highValue = 0;
    min = list.get(0).getValue().getHighValue().getDaysSinceEpoch();
    max = list.get(list.size() - 1).getValue().getHighValue().getDaysSinceEpoch();
    if (minInd == maxInd) {
        highValue = min;
    } else if (minInd < maxInd) {
        // right border is the max
        highValue = (long) (min + (max - min) * (rightBorderInd - minInd) / (maxInd - minInd));
    } else {
        // left border is the max
        highValue = (long) (min + (max - min) * minInd / (minInd - maxInd));
    }
    // get the #nulls
    long numNulls = 0;
    for (Map.Entry<String, DateColumnStatsData> entry : extractedAdjustedStatsMap.entrySet()) {
        numNulls += entry.getValue().getNumNulls();
    }
    // we scale up sumNulls based on the number of partitions
    numNulls = numNulls * numParts / numPartsWithStats;
    // get the ndv
    long ndv = 0;
    Collections.sort(list, new Comparator<Map.Entry<String, DateColumnStatsData>>() {

        @Override
        public int compare(Map.Entry<String, DateColumnStatsData> o1, Map.Entry<String, DateColumnStatsData> o2) {
            return Long.compare(o1.getValue().getNumDVs(), o2.getValue().getNumDVs());
        }
    });
    long lowerBound = list.get(list.size() - 1).getValue().getNumDVs();
    long higherBound = 0;
    for (Map.Entry<String, DateColumnStatsData> entry : list) {
        higherBound += entry.getValue().getNumDVs();
    }
    if (useDensityFunctionForNDVEstimation && densityAvg != 0.0) {
        ndv = (long) ((highValue - lowValue) / densityAvg);
        if (ndv < lowerBound) {
            ndv = lowerBound;
        } else if (ndv > higherBound) {
            ndv = higherBound;
        }
    } else {
        minInd = adjustedIndexMap.get(list.get(0).getKey());
        maxInd = adjustedIndexMap.get(list.get(list.size() - 1).getKey());
        min = list.get(0).getValue().getNumDVs();
        max = list.get(list.size() - 1).getValue().getNumDVs();
        if (minInd == maxInd) {
            ndv = min;
        } else if (minInd < maxInd) {
            // right border is the max
            ndv = (long) (min + (max - min) * (rightBorderInd - minInd) / (maxInd - minInd));
        } else {
            // left border is the max
            ndv = (long) (min + (max - min) * minInd / (minInd - maxInd));
        }
    }
    extrapolateDateData.setLowValue(new Date(lowValue));
    extrapolateDateData.setHighValue(new Date(highValue));
    extrapolateDateData.setNumNulls(numNulls);
    extrapolateDateData.setNumDVs(ndv);
    extrapolateData.setDateStats(extrapolateDateData);
}
Also used : DateColumnStatsData(org.apache.hadoop.hive.metastore.api.DateColumnStatsData) HashMap(java.util.HashMap) DateColumnStatsDataInspector(org.apache.hadoop.hive.metastore.columnstats.cache.DateColumnStatsDataInspector) LinkedList(java.util.LinkedList) Date(org.apache.hadoop.hive.metastore.api.Date) HashMap(java.util.HashMap) Map(java.util.Map) ColumnStatisticsData(org.apache.hadoop.hive.metastore.api.ColumnStatisticsData)

Aggregations

DateColumnStatsDataInspector (org.apache.hadoop.hive.metastore.columnstats.cache.DateColumnStatsDataInspector)13 ColumnStatisticsData (org.apache.hadoop.hive.metastore.api.ColumnStatisticsData)10 BinaryColumnStatsData (org.apache.hadoop.hive.metastore.api.BinaryColumnStatsData)9 BooleanColumnStatsData (org.apache.hadoop.hive.metastore.api.BooleanColumnStatsData)9 DecimalColumnStatsDataInspector (org.apache.hadoop.hive.metastore.columnstats.cache.DecimalColumnStatsDataInspector)9 DoubleColumnStatsDataInspector (org.apache.hadoop.hive.metastore.columnstats.cache.DoubleColumnStatsDataInspector)9 LongColumnStatsDataInspector (org.apache.hadoop.hive.metastore.columnstats.cache.LongColumnStatsDataInspector)9 StringColumnStatsDataInspector (org.apache.hadoop.hive.metastore.columnstats.cache.StringColumnStatsDataInspector)9 TimestampColumnStatsDataInspector (org.apache.hadoop.hive.metastore.columnstats.cache.TimestampColumnStatsDataInspector)8 ColumnStatisticsObj (org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj)6 Date (org.apache.hadoop.hive.metastore.api.Date)5 Timestamp (org.apache.hadoop.hive.metastore.api.Timestamp)4 BigDecimal (java.math.BigDecimal)2 HashMap (java.util.HashMap)2 Map (java.util.Map)2 NumDistinctValueEstimator (org.apache.hadoop.hive.common.ndv.NumDistinctValueEstimator)2 DateColumnStatsData (org.apache.hadoop.hive.metastore.api.DateColumnStatsData)2 PrimitiveObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector)2 StringObjectInspector (org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector)2 LinkedList (java.util.LinkedList)1