
Example 46 with ColumnStatisticsData

Use of org.apache.hadoop.hive.metastore.api.ColumnStatisticsData in project hive by apache.

The class TestHiveMetaStore, method testColumnStatistics.

@Test
public void testColumnStatistics() throws Throwable {
    String dbName = "columnstatstestdb";
    String tblName = "tbl";
    String typeName = "Person";
    String tblOwner = "testowner";
    int lastAccessed = 6796;
    try {
        cleanUp(dbName, tblName, typeName);
        Database db = new Database();
        db.setName(dbName);
        client.createDatabase(db);
        createTableForTestFilter(dbName, tblName, tblOwner, lastAccessed, true);
        // Create a ColumnStatistics Obj
        String[] colName = new String[] { "income", "name" };
        double lowValue = 50000.21;
        double highValue = 1200000.4525;
        long numNulls = 3;
        long numDVs = 22;
        double avgColLen = 50.30;
        long maxColLen = 102;
        String[] colType = new String[] { "double", "string" };
        boolean isTblLevel = true;
        String partName = null;
        List<ColumnStatisticsObj> statsObjs = new ArrayList<>();
        ColumnStatisticsDesc statsDesc = new ColumnStatisticsDesc();
        statsDesc.setDbName(dbName);
        statsDesc.setTableName(tblName);
        statsDesc.setIsTblLevel(isTblLevel);
        statsDesc.setPartName(partName);
        ColumnStatisticsObj statsObj = new ColumnStatisticsObj();
        statsObj.setColName(colName[0]);
        statsObj.setColType(colType[0]);
        ColumnStatisticsData statsData = new ColumnStatisticsData();
        DoubleColumnStatsData numericStats = new DoubleColumnStatsData();
        statsData.setDoubleStats(numericStats);
        statsData.getDoubleStats().setHighValue(highValue);
        statsData.getDoubleStats().setLowValue(lowValue);
        statsData.getDoubleStats().setNumDVs(numDVs);
        statsData.getDoubleStats().setNumNulls(numNulls);
        statsObj.setStatsData(statsData);
        statsObjs.add(statsObj);
        statsObj = new ColumnStatisticsObj();
        statsObj.setColName(colName[1]);
        statsObj.setColType(colType[1]);
        statsData = new ColumnStatisticsData();
        StringColumnStatsData stringStats = new StringColumnStatsData();
        statsData.setStringStats(stringStats);
        statsData.getStringStats().setAvgColLen(avgColLen);
        statsData.getStringStats().setMaxColLen(maxColLen);
        statsData.getStringStats().setNumDVs(numDVs);
        statsData.getStringStats().setNumNulls(numNulls);
        statsObj.setStatsData(statsData);
        statsObjs.add(statsObj);
        ColumnStatistics colStats = new ColumnStatistics();
        colStats.setStatsDesc(statsDesc);
        colStats.setStatsObj(statsObjs);
        // write stats objs persistently
        client.updateTableColumnStatistics(colStats);
        // retrieve the stats obj that was just written
        ColumnStatisticsObj colStats2 = client.getTableColumnStatistics(dbName, tblName, Lists.newArrayList(colName[0])).get(0);
        // compare stats obj to ensure what we get is what we wrote
        assertNotNull(colStats2);
        assertEquals(colStats2.getColName(), colName[0]);
        assertEquals(colStats2.getStatsData().getDoubleStats().getLowValue(), lowValue, 0.01);
        assertEquals(colStats2.getStatsData().getDoubleStats().getHighValue(), highValue, 0.01);
        assertEquals(colStats2.getStatsData().getDoubleStats().getNumNulls(), numNulls);
        assertEquals(colStats2.getStatsData().getDoubleStats().getNumDVs(), numDVs);
        // test deleting column stats; if no column name is passed, all column stats associated with the
        // table are deleted
        boolean status = client.deleteTableColumnStatistics(dbName, tblName, null);
        assertTrue(status);
        // try to query stats for a column for which no stats exist
        assertTrue(client.getTableColumnStatistics(dbName, tblName, Lists.newArrayList(colName[1])).isEmpty());
        colStats.setStatsDesc(statsDesc);
        colStats.setStatsObj(statsObjs);
        // update table level column stats
        client.updateTableColumnStatistics(colStats);
        // query column stats for column whose stats were updated in the previous call
        colStats2 = client.getTableColumnStatistics(dbName, tblName, Lists.newArrayList(colName[0])).get(0);
        // partition level column statistics test
        // create a table with multiple partitions
        cleanUp(dbName, tblName, typeName);
        List<List<String>> values = new ArrayList<>();
        values.add(makeVals("2008-07-01 14:13:12", "14"));
        values.add(makeVals("2008-07-01 14:13:12", "15"));
        values.add(makeVals("2008-07-02 14:13:12", "15"));
        values.add(makeVals("2008-07-03 14:13:12", "151"));
        createMultiPartitionTableSchema(dbName, tblName, typeName, values);
        List<String> partitions = client.listPartitionNames(dbName, tblName, (short) -1);
        partName = partitions.get(0);
        isTblLevel = false;
        // create a new columnstatistics desc to represent partition level column stats
        statsDesc = new ColumnStatisticsDesc();
        statsDesc.setDbName(dbName);
        statsDesc.setTableName(tblName);
        statsDesc.setPartName(partName);
        statsDesc.setIsTblLevel(isTblLevel);
        colStats = new ColumnStatistics();
        colStats.setStatsDesc(statsDesc);
        colStats.setStatsObj(statsObjs);
        client.updatePartitionColumnStatistics(colStats);
        colStats2 = client.getPartitionColumnStatistics(dbName, tblName, Lists.newArrayList(partName), Lists.newArrayList(colName[1])).get(partName).get(0);
        // compare stats obj to ensure what we get is what we wrote
        assertNotNull(colStats2);
        assertEquals(colStats.getStatsDesc().getPartName(), partName);
        assertEquals(colStats2.getColName(), colName[1]);
        assertEquals(colStats2.getStatsData().getStringStats().getMaxColLen(), maxColLen);
        assertEquals(colStats2.getStatsData().getStringStats().getAvgColLen(), avgColLen, 0.01);
        assertEquals(colStats2.getStatsData().getStringStats().getNumNulls(), numNulls);
        assertEquals(colStats2.getStatsData().getStringStats().getNumDVs(), numDVs);
        // test stats deletion at partition level
        client.deletePartitionColumnStatistics(dbName, tblName, partName, colName[1]);
        colStats2 = client.getPartitionColumnStatistics(dbName, tblName, Lists.newArrayList(partName), Lists.newArrayList(colName[0])).get(partName).get(0);
        // test getting stats for a column for which no stats exist
        assertTrue(client.getPartitionColumnStatistics(dbName, tblName, Lists.newArrayList(partName), Lists.newArrayList(colName[1])).isEmpty());
    } catch (Exception e) {
        System.err.println(StringUtils.stringifyException(e));
        System.err.println("testColumnStatistics() failed.");
        throw e;
    } finally {
        cleanUp(dbName, tblName, typeName);
    }
}
Also used: ColumnStatistics (org.apache.hadoop.hive.metastore.api.ColumnStatistics), ArrayList (java.util.ArrayList), StringColumnStatsData (org.apache.hadoop.hive.metastore.api.StringColumnStatsData), MetaException (org.apache.hadoop.hive.metastore.api.MetaException), InvalidOperationException (org.apache.hadoop.hive.metastore.api.InvalidOperationException), ConfigValSecurityException (org.apache.hadoop.hive.metastore.api.ConfigValSecurityException), SQLException (java.sql.SQLException), UnknownDBException (org.apache.hadoop.hive.metastore.api.UnknownDBException), TException (org.apache.thrift.TException), IOException (java.io.IOException), InvalidObjectException (org.apache.hadoop.hive.metastore.api.InvalidObjectException), NoSuchObjectException (org.apache.hadoop.hive.metastore.api.NoSuchObjectException), ColumnStatisticsObj (org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj), DoubleColumnStatsData (org.apache.hadoop.hive.metastore.api.DoubleColumnStatsData), ColumnStatisticsDesc (org.apache.hadoop.hive.metastore.api.ColumnStatisticsDesc), Database (org.apache.hadoop.hive.metastore.api.Database), List (java.util.List), ColumnStatisticsData (org.apache.hadoop.hive.metastore.api.ColumnStatisticsData), Test (org.junit.Test)
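
The test above treats ColumnStatisticsData as a Thrift union: exactly one of its *Stats branches may be set at a time, and getSetField() reports which branch is populated. Below is a minimal standalone sketch of that union behavior; the class name and main method are illustrative, not part of the Hive tests.

import org.apache.hadoop.hive.metastore.api.ColumnStatisticsData;
import org.apache.hadoop.hive.metastore.api.LongColumnStatsData;

public class ColumnStatsUnionSketch {
    public static void main(String[] args) {
        // populate one branch of the union
        LongColumnStatsData longStats = new LongColumnStatsData();
        longStats.setLowValue(1);
        longStats.setHighValue(100);
        longStats.setNumNulls(0);
        longStats.setNumDVs(42);

        ColumnStatisticsData data = new ColumnStatisticsData();
        data.setLongStats(longStats);

        // the union tracks which branch was set
        System.out.println(data.isSetLongStats());   // true
        System.out.println(data.isSetDoubleStats()); // false
        System.out.println(data.getSetField());      // LONG_STATS
    }
}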

Example 47 with ColumnStatisticsData

Use of org.apache.hadoop.hive.metastore.api.ColumnStatisticsData in project hive by apache.

The class BooleanColumnStatsAggregator, method aggregate.

@Override
public ColumnStatisticsObj aggregate(List<ColStatsObjWithSourceInfo> colStatsWithSourceInfo, List<String> partNames, boolean areAllPartsFound) throws MetaException {
    ColumnStatisticsObj statsObj = null;
    String colType = null;
    String colName = null;
    BooleanColumnStatsData aggregateData = null;
    for (ColStatsObjWithSourceInfo csp : colStatsWithSourceInfo) {
        ColumnStatisticsObj cso = csp.getColStatsObj();
        if (statsObj == null) {
            colName = cso.getColName();
            colType = cso.getColType();
            statsObj = ColumnStatsAggregatorFactory.newColumnStaticsObj(colName, colType, cso.getStatsData().getSetField());
        }
        BooleanColumnStatsData newData = cso.getStatsData().getBooleanStats();
        if (aggregateData == null) {
            aggregateData = newData.deepCopy();
        } else {
            aggregateData.setNumTrues(aggregateData.getNumTrues() + newData.getNumTrues());
            aggregateData.setNumFalses(aggregateData.getNumFalses() + newData.getNumFalses());
            aggregateData.setNumNulls(aggregateData.getNumNulls() + newData.getNumNulls());
        }
    }
    ColumnStatisticsData columnStatisticsData = new ColumnStatisticsData();
    columnStatisticsData.setBooleanStats(aggregateData);
    statsObj.setStatsData(columnStatisticsData);
    return statsObj;
}
Also used: BooleanColumnStatsData (org.apache.hadoop.hive.metastore.api.BooleanColumnStatsData), ColumnStatisticsObj (org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj), ColStatsObjWithSourceInfo (org.apache.hadoop.hive.metastore.utils.MetaStoreUtils.ColStatsObjWithSourceInfo), ColumnStatisticsData (org.apache.hadoop.hive.metastore.api.ColumnStatisticsData)
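
The aggregator merges per-partition boolean stats by plain addition of the three counters. A hedged standalone illustration of that merge step, using made-up values for two hypothetical partitions:

import org.apache.hadoop.hive.metastore.api.BooleanColumnStatsData;

public class BooleanMergeSketch {
    public static void main(String[] args) {
        // stats from two hypothetical partitions (numTrues, numFalses, numNulls)
        BooleanColumnStatsData p1 = new BooleanColumnStatsData(10, 5, 1);
        BooleanColumnStatsData p2 = new BooleanColumnStatsData(7, 3, 2);

        // the same merge the aggregate() loop performs
        BooleanColumnStatsData merged = p1.deepCopy();
        merged.setNumTrues(merged.getNumTrues() + p2.getNumTrues());
        merged.setNumFalses(merged.getNumFalses() + p2.getNumFalses());
        merged.setNumNulls(merged.getNumNulls() + p2.getNumNulls());

        System.out.println(merged.getNumTrues());  // 17
        System.out.println(merged.getNumFalses()); // 8
        System.out.println(merged.getNumNulls());  // 3
    }
}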

Example 48 with ColumnStatisticsData

Use of org.apache.hadoop.hive.metastore.api.ColumnStatisticsData in project hive by apache.

The class DateColumnStatsAggregator, method aggregate.

@Override
public ColumnStatisticsObj aggregate(List<ColStatsObjWithSourceInfo> colStatsWithSourceInfo, List<String> partNames, boolean areAllPartsFound) throws MetaException {
    ColumnStatisticsObj statsObj = null;
    String colType = null;
    String colName = null;
    // check whether all the ColumnStatisticsObjs contain stats and whether all the
    // NDV estimators are bit vectors
    boolean doAllPartitionContainStats = partNames.size() == colStatsWithSourceInfo.size();
    NumDistinctValueEstimator ndvEstimator = null;
    for (ColStatsObjWithSourceInfo csp : colStatsWithSourceInfo) {
        ColumnStatisticsObj cso = csp.getColStatsObj();
        if (statsObj == null) {
            colName = cso.getColName();
            colType = cso.getColType();
            statsObj = ColumnStatsAggregatorFactory.newColumnStaticsObj(colName, colType, cso.getStatsData().getSetField());
            LOG.trace("doAllPartitionContainStats for column: {} is: {}", colName, doAllPartitionContainStats);
        }
        DateColumnStatsDataInspector dateColumnStats = (DateColumnStatsDataInspector) cso.getStatsData().getDateStats();
        if (dateColumnStats.getNdvEstimator() == null) {
            ndvEstimator = null;
            break;
        } else {
            // check if all of the bit vectors can merge
            NumDistinctValueEstimator estimator = dateColumnStats.getNdvEstimator();
            if (ndvEstimator == null) {
                ndvEstimator = estimator;
            } else {
                if (ndvEstimator.canMerge(estimator)) {
                    continue;
                } else {
                    ndvEstimator = null;
                    break;
                }
            }
        }
    }
    if (ndvEstimator != null) {
        ndvEstimator = NumDistinctValueEstimatorFactory.getEmptyNumDistinctValueEstimator(ndvEstimator);
    }
    LOG.debug("all of the bit vectors can merge for " + colName + " is " + (ndvEstimator != null));
    ColumnStatisticsData columnStatisticsData = new ColumnStatisticsData();
    if (doAllPartitionContainStats || colStatsWithSourceInfo.size() < 2) {
        DateColumnStatsDataInspector aggregateData = null;
        long lowerBound = 0;
        long higherBound = 0;
        double densityAvgSum = 0.0;
        for (ColStatsObjWithSourceInfo csp : colStatsWithSourceInfo) {
            ColumnStatisticsObj cso = csp.getColStatsObj();
            DateColumnStatsDataInspector newData = (DateColumnStatsDataInspector) cso.getStatsData().getDateStats();
            lowerBound = Math.max(lowerBound, newData.getNumDVs());
            higherBound += newData.getNumDVs();
            densityAvgSum += (diff(newData.getHighValue(), newData.getLowValue())) / newData.getNumDVs();
            if (ndvEstimator != null) {
                ndvEstimator.mergeEstimators(newData.getNdvEstimator());
            }
            if (aggregateData == null) {
                aggregateData = newData.deepCopy();
            } else {
                aggregateData.setLowValue(min(aggregateData.getLowValue(), newData.getLowValue()));
                aggregateData.setHighValue(max(aggregateData.getHighValue(), newData.getHighValue()));
                aggregateData.setNumNulls(aggregateData.getNumNulls() + newData.getNumNulls());
                aggregateData.setNumDVs(Math.max(aggregateData.getNumDVs(), newData.getNumDVs()));
            }
        }
        if (ndvEstimator != null) {
            // if all the ColumnStatisticsObjs contain bitvectors, we do not need to
            // use uniform distribution assumption because we can merge bitvectors
            // to get a good estimation.
            aggregateData.setNumDVs(ndvEstimator.estimateNumDistinctValues());
        } else {
            long estimation;
            if (useDensityFunctionForNDVEstimation) {
                // We have estimation, lowerbound and higherbound. We use estimation
                // if it is between lowerbound and higherbound.
                double densityAvg = densityAvgSum / partNames.size();
                estimation = (long) (diff(aggregateData.getHighValue(), aggregateData.getLowValue()) / densityAvg);
                if (estimation < lowerBound) {
                    estimation = lowerBound;
                } else if (estimation > higherBound) {
                    estimation = higherBound;
                }
            } else {
                estimation = (long) (lowerBound + (higherBound - lowerBound) * ndvTuner);
            }
            aggregateData.setNumDVs(estimation);
        }
        columnStatisticsData.setDateStats(aggregateData);
    } else {
        // we need extrapolation
        LOG.debug("start extrapolation for " + colName);
        Map<String, Integer> indexMap = new HashMap<>();
        for (int index = 0; index < partNames.size(); index++) {
            indexMap.put(partNames.get(index), index);
        }
        Map<String, Double> adjustedIndexMap = new HashMap<>();
        Map<String, ColumnStatisticsData> adjustedStatsMap = new HashMap<>();
        // while we scan the column stats, we also get the densityAvg, lowerBound and
        // higherBound when useDensityFunctionForNDVEstimation is true.
        double densityAvgSum = 0.0;
        if (ndvEstimator == null) {
            // the traditional extrapolation methods.
            for (ColStatsObjWithSourceInfo csp : colStatsWithSourceInfo) {
                ColumnStatisticsObj cso = csp.getColStatsObj();
                String partName = csp.getPartName();
                DateColumnStatsData newData = cso.getStatsData().getDateStats();
                if (useDensityFunctionForNDVEstimation) {
                    densityAvgSum += diff(newData.getHighValue(), newData.getLowValue()) / newData.getNumDVs();
                }
                adjustedIndexMap.put(partName, (double) indexMap.get(partName));
                adjustedStatsMap.put(partName, cso.getStatsData());
            }
        } else {
            // we first merge all the adjacent bitvectors that we could merge and
            // derive new partition names and index.
            StringBuilder pseudoPartName = new StringBuilder();
            double pseudoIndexSum = 0;
            int length = 0;
            int curIndex = -1;
            DateColumnStatsDataInspector aggregateData = null;
            for (ColStatsObjWithSourceInfo csp : colStatsWithSourceInfo) {
                ColumnStatisticsObj cso = csp.getColStatsObj();
                String partName = csp.getPartName();
                DateColumnStatsDataInspector newData = (DateColumnStatsDataInspector) cso.getStatsData().getDateStats();
                // newData is guaranteed to carry a bit vector here; that was already checked above.
                if (indexMap.get(partName) != curIndex) {
                    // There is bitvector, but it is not adjacent to the previous ones.
                    if (length > 0) {
                        // we have to set ndv
                        adjustedIndexMap.put(pseudoPartName.toString(), pseudoIndexSum / length);
                        aggregateData.setNumDVs(ndvEstimator.estimateNumDistinctValues());
                        ColumnStatisticsData csd = new ColumnStatisticsData();
                        csd.setDateStats(aggregateData);
                        adjustedStatsMap.put(pseudoPartName.toString(), csd);
                        if (useDensityFunctionForNDVEstimation) {
                            densityAvgSum += diff(aggregateData.getHighValue(), aggregateData.getLowValue()) / aggregateData.getNumDVs();
                        }
                        // reset everything
                        pseudoPartName = new StringBuilder();
                        pseudoIndexSum = 0;
                        length = 0;
                        ndvEstimator = NumDistinctValueEstimatorFactory.getEmptyNumDistinctValueEstimator(ndvEstimator);
                    }
                    aggregateData = null;
                }
                curIndex = indexMap.get(partName);
                pseudoPartName.append(partName);
                pseudoIndexSum += curIndex;
                length++;
                curIndex++;
                if (aggregateData == null) {
                    aggregateData = newData.deepCopy();
                } else {
                    aggregateData.setLowValue(min(aggregateData.getLowValue(), newData.getLowValue()));
                    aggregateData.setHighValue(max(aggregateData.getHighValue(), newData.getHighValue()));
                    aggregateData.setNumNulls(aggregateData.getNumNulls() + newData.getNumNulls());
                }
                ndvEstimator.mergeEstimators(newData.getNdvEstimator());
            }
            if (length > 0) {
                // we have to set ndv
                adjustedIndexMap.put(pseudoPartName.toString(), pseudoIndexSum / length);
                aggregateData.setNumDVs(ndvEstimator.estimateNumDistinctValues());
                ColumnStatisticsData csd = new ColumnStatisticsData();
                csd.setDateStats(aggregateData);
                adjustedStatsMap.put(pseudoPartName.toString(), csd);
                if (useDensityFunctionForNDVEstimation) {
                    densityAvgSum += diff(aggregateData.getHighValue(), aggregateData.getLowValue()) / aggregateData.getNumDVs();
                }
            }
        }
        extrapolate(columnStatisticsData, partNames.size(), colStatsWithSourceInfo.size(), adjustedIndexMap, adjustedStatsMap, densityAvgSum / adjustedStatsMap.size());
    }
    LOG.debug("Ndv estimatation for {} is {} # of partitions requested: {} # of partitions found: {}", colName, columnStatisticsData.getDateStats().getNumDVs(), partNames.size(), colStatsWithSourceInfo.size());
    statsObj.setStatsData(columnStatisticsData);
    return statsObj;
}
Also used : ColStatsObjWithSourceInfo(org.apache.hadoop.hive.metastore.utils.MetaStoreUtils.ColStatsObjWithSourceInfo) DateColumnStatsData(org.apache.hadoop.hive.metastore.api.DateColumnStatsData) HashMap(java.util.HashMap) DateColumnStatsDataInspector(org.apache.hadoop.hive.metastore.columnstats.cache.DateColumnStatsDataInspector) NumDistinctValueEstimator(org.apache.hadoop.hive.common.ndv.NumDistinctValueEstimator) ColumnStatisticsObj(org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj) ColumnStatisticsData(org.apache.hadoop.hive.metastore.api.ColumnStatisticsData)
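
When the per-partition bit vectors cannot be merged, the aggregator above falls back to either a density-based estimate (clamped between the largest single-partition NDV and the sum of all per-partition NDVs) or a linear blend controlled by ndvTuner. A hedged numeric sketch of the density path, with illustrative values only:

public class NdvDensitySketch {
    public static void main(String[] args) {
        long lowerBound = 40;    // max NDV seen in any one partition
        long higherBound = 100;  // sum of NDVs over all partitions
        double densityAvg = 0.5; // average of (high - low) / ndv per partition
        double range = 30.0;     // diff(aggregate highValue, aggregate lowValue)

        // density-based estimate, clamped to [lowerBound, higherBound]
        long estimation = (long) (range / densityAvg);
        if (estimation < lowerBound) {
            estimation = lowerBound;
        } else if (estimation > higherBound) {
            estimation = higherBound;
        }
        System.out.println(estimation); // 60, used as the aggregate numDVs
    }
}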

Example 49 with ColumnStatisticsData

Use of org.apache.hadoop.hive.metastore.api.ColumnStatisticsData in project hive by apache.

The class DecimalColumnStatsAggregator, method extrapolate.

@Override
public void extrapolate(ColumnStatisticsData extrapolateData, int numParts, int numPartsWithStats, Map<String, Double> adjustedIndexMap, Map<String, ColumnStatisticsData> adjustedStatsMap, double densityAvg) {
    int rightBorderInd = numParts;
    DecimalColumnStatsDataInspector extrapolateDecimalData = new DecimalColumnStatsDataInspector();
    Map<String, DecimalColumnStatsData> extractedAdjustedStatsMap = new HashMap<>();
    for (Map.Entry<String, ColumnStatisticsData> entry : adjustedStatsMap.entrySet()) {
        extractedAdjustedStatsMap.put(entry.getKey(), entry.getValue().getDecimalStats());
    }
    List<Map.Entry<String, DecimalColumnStatsData>> list = new LinkedList<>(extractedAdjustedStatsMap.entrySet());
    // get the lowValue
    Collections.sort(list, new Comparator<Map.Entry<String, DecimalColumnStatsData>>() {

        @Override
        public int compare(Map.Entry<String, DecimalColumnStatsData> o1, Map.Entry<String, DecimalColumnStatsData> o2) {
            return o1.getValue().getLowValue().compareTo(o2.getValue().getLowValue());
        }
    });
    double minInd = adjustedIndexMap.get(list.get(0).getKey());
    double maxInd = adjustedIndexMap.get(list.get(list.size() - 1).getKey());
    double lowValue = 0;
    double min = MetaStoreUtils.decimalToDouble(list.get(0).getValue().getLowValue());
    double max = MetaStoreUtils.decimalToDouble(list.get(list.size() - 1).getValue().getLowValue());
    if (minInd == maxInd) {
        lowValue = min;
    } else if (minInd < maxInd) {
        // left border is the min
        lowValue = (max - (max - min) * maxInd / (maxInd - minInd));
    } else {
        // right border is the min
        lowValue = (max - (max - min) * (rightBorderInd - maxInd) / (minInd - maxInd));
    }
    // get the highValue
    Collections.sort(list, new Comparator<Map.Entry<String, DecimalColumnStatsData>>() {

        @Override
        public int compare(Map.Entry<String, DecimalColumnStatsData> o1, Map.Entry<String, DecimalColumnStatsData> o2) {
            return o1.getValue().getHighValue().compareTo(o2.getValue().getHighValue());
        }
    });
    minInd = adjustedIndexMap.get(list.get(0).getKey());
    maxInd = adjustedIndexMap.get(list.get(list.size() - 1).getKey());
    double highValue = 0;
    min = MetaStoreUtils.decimalToDouble(list.get(0).getValue().getHighValue());
    max = MetaStoreUtils.decimalToDouble(list.get(list.size() - 1).getValue().getHighValue());
    if (minInd == maxInd) {
        highValue = min;
    } else if (minInd < maxInd) {
        // right border is the max
        highValue = (min + (max - min) * (rightBorderInd - minInd) / (maxInd - minInd));
    } else {
        // left border is the max
        highValue = (min + (max - min) * minInd / (minInd - maxInd));
    }
    // get the #nulls
    long numNulls = 0;
    for (Map.Entry<String, DecimalColumnStatsData> entry : extractedAdjustedStatsMap.entrySet()) {
        numNulls += entry.getValue().getNumNulls();
    }
    // we scale up numNulls based on the total number of partitions
    numNulls = numNulls * numParts / numPartsWithStats;
    // get the ndv
    long ndv = 0;
    long ndvMin = 0;
    long ndvMax = 0;
    Collections.sort(list, new Comparator<Map.Entry<String, DecimalColumnStatsData>>() {

        @Override
        public int compare(Map.Entry<String, DecimalColumnStatsData> o1, Map.Entry<String, DecimalColumnStatsData> o2) {
            return Long.compare(o1.getValue().getNumDVs(), o2.getValue().getNumDVs());
        }
    });
    long lowerBound = list.get(list.size() - 1).getValue().getNumDVs();
    long higherBound = 0;
    for (Map.Entry<String, DecimalColumnStatsData> entry : list) {
        higherBound += entry.getValue().getNumDVs();
    }
    if (useDensityFunctionForNDVEstimation && densityAvg != 0.0) {
        ndv = (long) ((highValue - lowValue) / densityAvg);
        if (ndv < lowerBound) {
            ndv = lowerBound;
        } else if (ndv > higherBound) {
            ndv = higherBound;
        }
    } else {
        minInd = adjustedIndexMap.get(list.get(0).getKey());
        maxInd = adjustedIndexMap.get(list.get(list.size() - 1).getKey());
        ndvMin = list.get(0).getValue().getNumDVs();
        ndvMax = list.get(list.size() - 1).getValue().getNumDVs();
        if (minInd == maxInd) {
            ndv = ndvMin;
        } else if (minInd < maxInd) {
            // right border is the max
            ndv = (long) (ndvMin + (ndvMax - ndvMin) * (rightBorderInd - minInd) / (maxInd - minInd));
        } else {
            // left border is the max
            ndv = (long) (ndvMin + (ndvMax - ndvMin) * minInd / (minInd - maxInd));
        }
    }
    extrapolateDecimalData.setLowValue(StatObjectConverter.createThriftDecimal(String.valueOf(lowValue)));
    extrapolateDecimalData.setHighValue(StatObjectConverter.createThriftDecimal(String.valueOf(highValue)));
    extrapolateDecimalData.setNumNulls(numNulls);
    extrapolateDecimalData.setNumDVs(ndv);
    extrapolateData.setDecimalStats(extrapolateDecimalData);
}
Also used: HashMap (java.util.HashMap), LinkedList (java.util.LinkedList), DecimalColumnStatsData (org.apache.hadoop.hive.metastore.api.DecimalColumnStatsData), DecimalColumnStatsDataInspector (org.apache.hadoop.hive.metastore.columnstats.cache.DecimalColumnStatsDataInspector), Map (java.util.Map), ColumnStatisticsData (org.apache.hadoop.hive.metastore.api.ColumnStatisticsData)
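
The extrapolation above is linear: the partitions are sorted by the statistic of interest, the two extremes are paired with their adjusted partition indexes, and the line through them is extended to the table's full partition range. A worked example with made-up numbers (not taken from the Hive source):

public class ExtrapolationSketch {
    public static void main(String[] args) {
        int rightBorderInd = 4;            // numParts: partition indexes run 0..3
        double minInd = 0.0, maxInd = 3.0; // adjusted indexes of the two extremes
        double min = 20.0, max = 50.0;     // smallest / largest observed highValue

        // minInd < maxInd, so the right border holds the maximum
        double highValue = min + (max - min) * (rightBorderInd - minInd) / (maxInd - minInd);
        System.out.println(highValue); // 60.0, extrapolated past the observed 50.0
    }
}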

Example 50 with ColumnStatisticsData

Use of org.apache.hadoop.hive.metastore.api.ColumnStatisticsData in project hive by apache.

The class StringColumnStatsAggregator, method extrapolate.

@Override
public void extrapolate(ColumnStatisticsData extrapolateData, int numParts, int numPartsWithStats, Map<String, Double> adjustedIndexMap, Map<String, ColumnStatisticsData> adjustedStatsMap, double densityAvg) {
    int rightBorderInd = numParts;
    StringColumnStatsDataInspector extrapolateStringData = new StringColumnStatsDataInspector();
    Map<String, StringColumnStatsData> extractedAdjustedStatsMap = new HashMap<>();
    for (Map.Entry<String, ColumnStatisticsData> entry : adjustedStatsMap.entrySet()) {
        extractedAdjustedStatsMap.put(entry.getKey(), entry.getValue().getStringStats());
    }
    List<Map.Entry<String, StringColumnStatsData>> list = new LinkedList<>(extractedAdjustedStatsMap.entrySet());
    // get the avgLen
    Collections.sort(list, new Comparator<Map.Entry<String, StringColumnStatsData>>() {

        @Override
        public int compare(Map.Entry<String, StringColumnStatsData> o1, Map.Entry<String, StringColumnStatsData> o2) {
            return Double.compare(o1.getValue().getAvgColLen(), o2.getValue().getAvgColLen());
        }
    });
    double minInd = adjustedIndexMap.get(list.get(0).getKey());
    double maxInd = adjustedIndexMap.get(list.get(list.size() - 1).getKey());
    double avgColLen = 0;
    double min = list.get(0).getValue().getAvgColLen();
    double max = list.get(list.size() - 1).getValue().getAvgColLen();
    if (minInd == maxInd) {
        avgColLen = min;
    } else if (minInd < maxInd) {
        // right border is the max
        avgColLen = (min + (max - min) * (rightBorderInd - minInd) / (maxInd - minInd));
    } else {
        // left border is the max
        avgColLen = (min + (max - min) * minInd / (minInd - maxInd));
    }
    // get the maxLen
    Collections.sort(list, new Comparator<Map.Entry<String, StringColumnStatsData>>() {

        @Override
        public int compare(Map.Entry<String, StringColumnStatsData> o1, Map.Entry<String, StringColumnStatsData> o2) {
            return Long.compare(o1.getValue().getMaxColLen(), o2.getValue().getMaxColLen());
        }
    });
    minInd = adjustedIndexMap.get(list.get(0).getKey());
    maxInd = adjustedIndexMap.get(list.get(list.size() - 1).getKey());
    double maxColLen = 0;
    // the list is now sorted by maxColLen, so read the max lengths here
    min = list.get(0).getValue().getMaxColLen();
    max = list.get(list.size() - 1).getValue().getMaxColLen();
    if (minInd == maxInd) {
        maxColLen = min;
    } else if (minInd < maxInd) {
        // right border is the max
        maxColLen = (min + (max - min) * (rightBorderInd - minInd) / (maxInd - minInd));
    } else {
        // left border is the max
        maxColLen = (min + (max - min) * minInd / (minInd - maxInd));
    }
    // get the #nulls
    long numNulls = 0;
    for (Map.Entry<String, StringColumnStatsData> entry : extractedAdjustedStatsMap.entrySet()) {
        numNulls += entry.getValue().getNumNulls();
    }
    // we scale up numNulls based on the total number of partitions
    numNulls = numNulls * numParts / numPartsWithStats;
    // get the ndv
    long ndv = 0;
    Collections.sort(list, new Comparator<Map.Entry<String, StringColumnStatsData>>() {

        @Override
        public int compare(Map.Entry<String, StringColumnStatsData> o1, Map.Entry<String, StringColumnStatsData> o2) {
            return Long.compare(o1.getValue().getNumDVs(), o2.getValue().getNumDVs());
        }
    });
    minInd = adjustedIndexMap.get(list.get(0).getKey());
    maxInd = adjustedIndexMap.get(list.get(list.size() - 1).getKey());
    min = list.get(0).getValue().getNumDVs();
    max = list.get(list.size() - 1).getValue().getNumDVs();
    if (minInd == maxInd) {
        ndv = (long) min;
    } else if (minInd < maxInd) {
        // right border is the max
        ndv = (long) (min + (max - min) * (rightBorderInd - minInd) / (maxInd - minInd));
    } else {
        // left border is the max
        ndv = (long) (min + (max - min) * minInd / (minInd - maxInd));
    }
    extrapolateStringData.setAvgColLen(avgColLen);
    extrapolateStringData.setMaxColLen((long) maxColLen);
    extrapolateStringData.setNumNulls(numNulls);
    extrapolateStringData.setNumDVs(ndv);
    extrapolateData.setStringStats(extrapolateStringData);
}
Also used: HashMap (java.util.HashMap), StringColumnStatsData (org.apache.hadoop.hive.metastore.api.StringColumnStatsData), LinkedList (java.util.LinkedList), StringColumnStatsDataInspector (org.apache.hadoop.hive.metastore.columnstats.cache.StringColumnStatsDataInspector), Map (java.util.Map), ColumnStatisticsData (org.apache.hadoop.hive.metastore.api.ColumnStatisticsData)
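
Both extrapolate methods scale the summed null count from the partitions that actually have stats up to the full partition count. A minimal sketch of that scale-up, with illustrative values:

public class NullScaleSketch {
    public static void main(String[] args) {
        long numNulls = 30;        // nulls summed over partitions with stats
        int numParts = 10;         // total partitions in the table
        int numPartsWithStats = 6; // partitions that contributed stats

        // linear scale-up, as in the extrapolate methods above
        numNulls = numNulls * numParts / numPartsWithStats;
        System.out.println(numNulls); // 50
    }
}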

Aggregations

ColumnStatisticsData (org.apache.hadoop.hive.metastore.api.ColumnStatisticsData): 108
ColumnStatisticsObj (org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj): 95
ColumnStatistics (org.apache.hadoop.hive.metastore.api.ColumnStatistics): 62
ColumnStatisticsDesc (org.apache.hadoop.hive.metastore.api.ColumnStatisticsDesc): 56
Test (org.junit.Test): 53
ArrayList (java.util.ArrayList): 47
LongColumnStatsData (org.apache.hadoop.hive.metastore.api.LongColumnStatsData): 35
FieldSchema (org.apache.hadoop.hive.metastore.api.FieldSchema): 34
Table (org.apache.hadoop.hive.metastore.api.Table): 33
StorageDescriptor (org.apache.hadoop.hive.metastore.api.StorageDescriptor): 32
SerDeInfo (org.apache.hadoop.hive.metastore.api.SerDeInfo): 31
BooleanColumnStatsData (org.apache.hadoop.hive.metastore.api.BooleanColumnStatsData): 30
Partition (org.apache.hadoop.hive.metastore.api.Partition): 30
AggrStats (org.apache.hadoop.hive.metastore.api.AggrStats): 29
DoubleColumnStatsData (org.apache.hadoop.hive.metastore.api.DoubleColumnStatsData): 27
StringColumnStatsData (org.apache.hadoop.hive.metastore.api.StringColumnStatsData): 25
DecimalColumnStatsData (org.apache.hadoop.hive.metastore.api.DecimalColumnStatsData): 23
BinaryColumnStatsData (org.apache.hadoop.hive.metastore.api.BinaryColumnStatsData): 22
HashMap (java.util.HashMap): 20
List (java.util.List): 18