
Example 46 with ColumnStatistics

Use of org.apache.hadoop.hive.metastore.api.ColumnStatistics in project hive by apache.

From the class DecimalColumnStatsAggregator, the aggregate method:

@Override
public ColumnStatisticsObj aggregate(String colName, List<String> partNames, List<ColumnStatistics> css) throws MetaException {
    ColumnStatisticsObj statsObj = null;
    // check if all the ColumnStatisticsObjs contain stats and all the ndv are
    // bitvectors
    boolean doAllPartitionContainStats = partNames.size() == css.size();
    boolean isNDVBitVectorSet = true;
    String colType = null;
    for (ColumnStatistics cs : css) {
        if (cs.getStatsObjSize() != 1) {
            throw new MetaException("The number of columns should be exactly one in aggrStats, but found " + cs.getStatsObjSize());
        }
        ColumnStatisticsObj cso = cs.getStatsObjIterator().next();
        if (statsObj == null) {
            colType = cso.getColType();
            statsObj = ColumnStatsAggregatorFactory.newColumnStaticsObj(colName, colType, cso.getStatsData().getSetField());
        }
        if (numBitVectors <= 0 || !cso.getStatsData().getDecimalStats().isSetBitVectors() || cso.getStatsData().getDecimalStats().getBitVectors().length() == 0) {
            isNDVBitVectorSet = false;
            break;
        }
    }
    ColumnStatisticsData columnStatisticsData = new ColumnStatisticsData();
    if (doAllPartitionContainStats || css.size() < 2) {
        DecimalColumnStatsData aggregateData = null;
        long lowerBound = 0;
        long higherBound = 0;
        double densityAvgSum = 0.0;
        NumDistinctValueEstimator ndvEstimator = null;
        if (isNDVBitVectorSet) {
            ndvEstimator = new NumDistinctValueEstimator(numBitVectors);
        }
        for (ColumnStatistics cs : css) {
            ColumnStatisticsObj cso = cs.getStatsObjIterator().next();
            DecimalColumnStatsData newData = cso.getStatsData().getDecimalStats();
            if (useDensityFunctionForNDVEstimation) {
                lowerBound = Math.max(lowerBound, newData.getNumDVs());
                higherBound += newData.getNumDVs();
                densityAvgSum += (HBaseUtils.getDoubleValue(newData.getHighValue()) - HBaseUtils.getDoubleValue(newData.getLowValue())) / newData.getNumDVs();
            }
            if (isNDVBitVectorSet) {
                ndvEstimator.mergeEstimators(new NumDistinctValueEstimator(newData.getBitVectors(), ndvEstimator.getnumBitVectors()));
            }
            if (aggregateData == null) {
                aggregateData = newData.deepCopy();
            } else {
                // Decimal values are compared via their double representation;
                // keep the smaller low value and the larger high value.
                if (HBaseUtils.getDoubleValue(newData.getLowValue()) < HBaseUtils.getDoubleValue(aggregateData.getLowValue())) {
                    aggregateData.setLowValue(newData.getLowValue());
                }
                if (HBaseUtils.getDoubleValue(newData.getHighValue()) > HBaseUtils.getDoubleValue(aggregateData.getHighValue())) {
                    aggregateData.setHighValue(newData.getHighValue());
                }
                aggregateData.setNumNulls(aggregateData.getNumNulls() + newData.getNumNulls());
                aggregateData.setNumDVs(Math.max(aggregateData.getNumDVs(), newData.getNumDVs()));
            }
        }
        if (isNDVBitVectorSet) {
            // if all the ColumnStatisticsObjs contain bitvectors, we do not need to
            // use uniform distribution assumption because we can merge bitvectors
            // to get a good estimation.
            aggregateData.setNumDVs(ndvEstimator.estimateNumDistinctValues());
        } else {
            if (useDensityFunctionForNDVEstimation) {
                // We have estimation, lowerbound and higherbound. We use estimation
                // if it is between lowerbound and higherbound.
                double densityAvg = densityAvgSum / partNames.size();
                long estimation = (long) ((HBaseUtils.getDoubleValue(aggregateData.getHighValue()) - HBaseUtils.getDoubleValue(aggregateData.getLowValue())) / densityAvg);
                if (estimation < lowerBound) {
                    aggregateData.setNumDVs(lowerBound);
                } else if (estimation > higherBound) {
                    aggregateData.setNumDVs(higherBound);
                } else {
                    aggregateData.setNumDVs(estimation);
                }
            } else {
            // Without useDensityFunctionForNDVEstimation, fall back to the
            // default estimate: the max over all partitions, which has
            // already been set above.
            }
        }
        columnStatisticsData.setDecimalStats(aggregateData);
    } else {
        // we need extrapolation
        Map<String, Integer> indexMap = new HashMap<String, Integer>();
        for (int index = 0; index < partNames.size(); index++) {
            indexMap.put(partNames.get(index), index);
        }
        Map<String, Double> adjustedIndexMap = new HashMap<String, Double>();
        Map<String, ColumnStatisticsData> adjustedStatsMap = new HashMap<String, ColumnStatisticsData>();
        // while we scan the css, we also compute the densityAvg, lowerbound and
        // higherbound when useDensityFunctionForNDVEstimation is true.
        double densityAvgSum = 0.0;
        if (!isNDVBitVectorSet) {
            // the traditional extrapolation methods.
            for (ColumnStatistics cs : css) {
                String partName = cs.getStatsDesc().getPartName();
                ColumnStatisticsObj cso = cs.getStatsObjIterator().next();
                DecimalColumnStatsData newData = cso.getStatsData().getDecimalStats();
                if (useDensityFunctionForNDVEstimation) {
                    densityAvgSum += (HBaseUtils.getDoubleValue(newData.getHighValue()) - HBaseUtils.getDoubleValue(newData.getLowValue())) / newData.getNumDVs();
                }
                adjustedIndexMap.put(partName, (double) indexMap.get(partName));
                adjustedStatsMap.put(partName, cso.getStatsData());
            }
        } else {
            // we first merge all the adjacent bitvectors that we could merge and
            // derive new partition names and index.
            NumDistinctValueEstimator ndvEstimator = new NumDistinctValueEstimator(numBitVectors);
            StringBuilder pseudoPartName = new StringBuilder();
            double pseudoIndexSum = 0;
            int length = 0;
            int curIndex = -1;
            DecimalColumnStatsData aggregateData = null;
            for (ColumnStatistics cs : css) {
                String partName = cs.getStatsDesc().getPartName();
                ColumnStatisticsObj cso = cs.getStatsObjIterator().next();
                DecimalColumnStatsData newData = cso.getStatsData().getDecimalStats();
                // bit vectors are guaranteed to be set here; that was checked above.
                if (indexMap.get(partName) != curIndex) {
                    // There is a bitvector, but it is not adjacent to the previous ones.
                    if (length > 0) {
                        // we have to set ndv
                        adjustedIndexMap.put(pseudoPartName.toString(), pseudoIndexSum / length);
                        aggregateData.setNumDVs(ndvEstimator.estimateNumDistinctValues());
                        ColumnStatisticsData csd = new ColumnStatisticsData();
                        csd.setDecimalStats(aggregateData);
                        adjustedStatsMap.put(pseudoPartName.toString(), csd);
                        if (useDensityFunctionForNDVEstimation) {
                            densityAvgSum += (HBaseUtils.getDoubleValue(aggregateData.getHighValue()) - HBaseUtils.getDoubleValue(aggregateData.getLowValue())) / aggregateData.getNumDVs();
                        }
                        // reset everything
                        pseudoPartName = new StringBuilder();
                        pseudoIndexSum = 0;
                        length = 0;
                    }
                    aggregateData = null;
                }
                curIndex = indexMap.get(partName);
                pseudoPartName.append(partName);
                pseudoIndexSum += curIndex;
                length++;
                curIndex++;
                if (aggregateData == null) {
                    aggregateData = newData.deepCopy();
                } else {
                    // Keep the smaller low value and the larger high value (compared as doubles).
                    if (HBaseUtils.getDoubleValue(newData.getLowValue()) < HBaseUtils.getDoubleValue(aggregateData.getLowValue())) {
                        aggregateData.setLowValue(newData.getLowValue());
                    }
                    if (HBaseUtils.getDoubleValue(newData.getHighValue()) > HBaseUtils.getDoubleValue(aggregateData.getHighValue())) {
                        aggregateData.setHighValue(newData.getHighValue());
                    }
                    aggregateData.setNumNulls(aggregateData.getNumNulls() + newData.getNumNulls());
                }
                ndvEstimator.mergeEstimators(new NumDistinctValueEstimator(newData.getBitVectors(), ndvEstimator.getnumBitVectors()));
            }
            if (length > 0) {
                // we have to set ndv
                adjustedIndexMap.put(pseudoPartName.toString(), pseudoIndexSum / length);
                aggregateData.setNumDVs(ndvEstimator.estimateNumDistinctValues());
                ColumnStatisticsData csd = new ColumnStatisticsData();
                csd.setDecimalStats(aggregateData);
                adjustedStatsMap.put(pseudoPartName.toString(), csd);
                if (useDensityFunctionForNDVEstimation) {
                    densityAvgSum += (HBaseUtils.getDoubleValue(aggregateData.getHighValue()) - HBaseUtils.getDoubleValue(aggregateData.getLowValue())) / aggregateData.getNumDVs();
                }
            }
        }
        extrapolate(columnStatisticsData, partNames.size(), css.size(), adjustedIndexMap, adjustedStatsMap, densityAvgSum / adjustedStatsMap.size());
    }
    statsObj.setStatsData(columnStatisticsData);
    return statsObj;
}
Also used : ColumnStatistics(org.apache.hadoop.hive.metastore.api.ColumnStatistics) HashMap(java.util.HashMap) NumDistinctValueEstimator(org.apache.hadoop.hive.metastore.NumDistinctValueEstimator) ColumnStatisticsObj(org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj) DecimalColumnStatsData(org.apache.hadoop.hive.metastore.api.DecimalColumnStatsData) ColumnStatisticsData(org.apache.hadoop.hive.metastore.api.ColumnStatisticsData) MetaException(org.apache.hadoop.hive.metastore.api.MetaException)
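
The density-function branch in the aggregator above estimates NDV by averaging the per-partition densities (high - low) / ndv, dividing the aggregated value range by that average, and clamping the result to the [lowerBound, higherBound] window gathered during the scan. A minimal standalone sketch of that step, using hypothetical parameter names in place of the aggregator's local variables:

// Sketch of the density-based NDV estimate, mirroring the logic above.
// densityAvgSum is the sum of per-partition (high - low) / ndv values;
// lowerBound is the max per-partition NDV, higherBound is their sum.
static long estimateNdvByDensity(double aggLow, double aggHigh,
                                 double densityAvgSum, int numPartitions,
                                 long lowerBound, long higherBound) {
    double densityAvg = densityAvgSum / numPartitions;
    long estimation = (long) ((aggHigh - aggLow) / densityAvg);
    if (estimation < lowerBound) {
        return lowerBound;
    }
    if (estimation > higherBound) {
        return higherBound;
    }
    return estimation;
}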

Example 47 with ColumnStatistics

Use of org.apache.hadoop.hive.metastore.api.ColumnStatistics in project hive by apache.

From the class DoubleColumnStatsAggregator, the aggregate method:

@Override
public ColumnStatisticsObj aggregate(String colName, List<String> partNames, List<ColumnStatistics> css) throws MetaException {
    ColumnStatisticsObj statsObj = null;
    // check if all the ColumnStatisticsObjs contain stats and all the ndv are
    // bitvectors
    boolean doAllPartitionContainStats = partNames.size() == css.size();
    boolean isNDVBitVectorSet = true;
    String colType = null;
    for (ColumnStatistics cs : css) {
        if (cs.getStatsObjSize() != 1) {
            throw new MetaException("The number of columns should be exactly one in aggrStats, but found " + cs.getStatsObjSize());
        }
        ColumnStatisticsObj cso = cs.getStatsObjIterator().next();
        if (statsObj == null) {
            colType = cso.getColType();
            statsObj = ColumnStatsAggregatorFactory.newColumnStaticsObj(colName, colType, cso.getStatsData().getSetField());
        }
        if (numBitVectors <= 0 || !cso.getStatsData().getDoubleStats().isSetBitVectors() || cso.getStatsData().getDoubleStats().getBitVectors().length() == 0) {
            isNDVBitVectorSet = false;
            break;
        }
    }
    ColumnStatisticsData columnStatisticsData = new ColumnStatisticsData();
    if (doAllPartitionContainStats || css.size() < 2) {
        DoubleColumnStatsData aggregateData = null;
        long lowerBound = 0;
        long higherBound = 0;
        double densityAvgSum = 0.0;
        NumDistinctValueEstimator ndvEstimator = null;
        if (isNDVBitVectorSet) {
            ndvEstimator = new NumDistinctValueEstimator(numBitVectors);
        }
        for (ColumnStatistics cs : css) {
            ColumnStatisticsObj cso = cs.getStatsObjIterator().next();
            DoubleColumnStatsData newData = cso.getStatsData().getDoubleStats();
            if (useDensityFunctionForNDVEstimation) {
                lowerBound = Math.max(lowerBound, newData.getNumDVs());
                higherBound += newData.getNumDVs();
                densityAvgSum += (newData.getHighValue() - newData.getLowValue()) / newData.getNumDVs();
            }
            if (isNDVBitVectorSet) {
                ndvEstimator.mergeEstimators(new NumDistinctValueEstimator(newData.getBitVectors(), ndvEstimator.getnumBitVectors()));
            }
            if (aggregateData == null) {
                aggregateData = newData.deepCopy();
            } else {
                aggregateData.setLowValue(Math.min(aggregateData.getLowValue(), newData.getLowValue()));
                aggregateData.setHighValue(Math.max(aggregateData.getHighValue(), newData.getHighValue()));
                aggregateData.setNumNulls(aggregateData.getNumNulls() + newData.getNumNulls());
                aggregateData.setNumDVs(Math.max(aggregateData.getNumDVs(), newData.getNumDVs()));
            }
        }
        if (isNDVBitVectorSet) {
            // if all the ColumnStatisticsObjs contain bitvectors, we do not need to
            // use uniform distribution assumption because we can merge bitvectors
            // to get a good estimation.
            aggregateData.setNumDVs(ndvEstimator.estimateNumDistinctValues());
        } else {
            if (useDensityFunctionForNDVEstimation) {
                // We have estimation, lowerbound and higherbound. We use estimation
                // if it is between lowerbound and higherbound.
                double densityAvg = densityAvgSum / partNames.size();
                long estimation = (long) ((aggregateData.getHighValue() - aggregateData.getLowValue()) / densityAvg);
                if (estimation < lowerBound) {
                    aggregateData.setNumDVs(lowerBound);
                } else if (estimation > higherBound) {
                    aggregateData.setNumDVs(higherBound);
                } else {
                    aggregateData.setNumDVs(estimation);
                }
            } else {
            // Without useDensityFunctionForNDVEstimation, fall back to the
            // default estimate: the max over all partitions, which has
            // already been set above.
            }
        }
        columnStatisticsData.setDoubleStats(aggregateData);
    } else {
        // we need extrapolation
        Map<String, Integer> indexMap = new HashMap<String, Integer>();
        for (int index = 0; index < partNames.size(); index++) {
            indexMap.put(partNames.get(index), index);
        }
        Map<String, Double> adjustedIndexMap = new HashMap<String, Double>();
        Map<String, ColumnStatisticsData> adjustedStatsMap = new HashMap<String, ColumnStatisticsData>();
        // while we scan the css, we also compute the densityAvg, lowerbound and
        // higherbound when useDensityFunctionForNDVEstimation is true.
        double densityAvgSum = 0.0;
        if (!isNDVBitVectorSet) {
            // the traditional extrapolation methods.
            for (ColumnStatistics cs : css) {
                String partName = cs.getStatsDesc().getPartName();
                ColumnStatisticsObj cso = cs.getStatsObjIterator().next();
                DoubleColumnStatsData newData = cso.getStatsData().getDoubleStats();
                if (useDensityFunctionForNDVEstimation) {
                    densityAvgSum += (newData.getHighValue() - newData.getLowValue()) / newData.getNumDVs();
                }
                adjustedIndexMap.put(partName, (double) indexMap.get(partName));
                adjustedStatsMap.put(partName, cso.getStatsData());
            }
        } else {
            // we first merge all the adjacent bitvectors that we could merge and
            // derive new partition names and index.
            NumDistinctValueEstimator ndvEstimator = new NumDistinctValueEstimator(numBitVectors);
            StringBuilder pseudoPartName = new StringBuilder();
            double pseudoIndexSum = 0;
            int length = 0;
            int curIndex = -1;
            DoubleColumnStatsData aggregateData = null;
            for (ColumnStatistics cs : css) {
                String partName = cs.getStatsDesc().getPartName();
                ColumnStatisticsObj cso = cs.getStatsObjIterator().next();
                DoubleColumnStatsData newData = cso.getStatsData().getDoubleStats();
                // bit vectors are guaranteed to be set here; that was checked above.
                if (indexMap.get(partName) != curIndex) {
                    // There is a bitvector, but it is not adjacent to the previous ones.
                    if (length > 0) {
                        // we have to set ndv
                        adjustedIndexMap.put(pseudoPartName.toString(), pseudoIndexSum / length);
                        aggregateData.setNumDVs(ndvEstimator.estimateNumDistinctValues());
                        ColumnStatisticsData csd = new ColumnStatisticsData();
                        csd.setDoubleStats(aggregateData);
                        adjustedStatsMap.put(pseudoPartName.toString(), csd);
                        if (useDensityFunctionForNDVEstimation) {
                            densityAvgSum += (aggregateData.getHighValue() - aggregateData.getLowValue()) / aggregateData.getNumDVs();
                        }
                        // reset everything
                        pseudoPartName = new StringBuilder();
                        pseudoIndexSum = 0;
                        length = 0;
                    }
                    aggregateData = null;
                }
                curIndex = indexMap.get(partName);
                pseudoPartName.append(partName);
                pseudoIndexSum += curIndex;
                length++;
                curIndex++;
                if (aggregateData == null) {
                    aggregateData = newData.deepCopy();
                } else {
                    aggregateData.setLowValue(Math.min(aggregateData.getLowValue(), newData.getLowValue()));
                    aggregateData.setHighValue(Math.max(aggregateData.getHighValue(), newData.getHighValue()));
                    aggregateData.setNumNulls(aggregateData.getNumNulls() + newData.getNumNulls());
                }
                ndvEstimator.mergeEstimators(new NumDistinctValueEstimator(newData.getBitVectors(), ndvEstimator.getnumBitVectors()));
            }
            if (length > 0) {
                // we have to set ndv
                adjustedIndexMap.put(pseudoPartName.toString(), pseudoIndexSum / length);
                aggregateData.setNumDVs(ndvEstimator.estimateNumDistinctValues());
                ColumnStatisticsData csd = new ColumnStatisticsData();
                csd.setDoubleStats(aggregateData);
                adjustedStatsMap.put(pseudoPartName.toString(), csd);
                if (useDensityFunctionForNDVEstimation) {
                    densityAvgSum += (aggregateData.getHighValue() - aggregateData.getLowValue()) / aggregateData.getNumDVs();
                }
            }
        }
        extrapolate(columnStatisticsData, partNames.size(), css.size(), adjustedIndexMap, adjustedStatsMap, densityAvgSum / adjustedStatsMap.size());
    }
    statsObj.setStatsData(columnStatisticsData);
    return statsObj;
}
Also used : ColumnStatistics(org.apache.hadoop.hive.metastore.api.ColumnStatistics) HashMap(java.util.HashMap) NumDistinctValueEstimator(org.apache.hadoop.hive.metastore.NumDistinctValueEstimator) ColumnStatisticsObj(org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj) DoubleColumnStatsData(org.apache.hadoop.hive.metastore.api.DoubleColumnStatsData) ColumnStatisticsData(org.apache.hadoop.hive.metastore.api.ColumnStatisticsData) MetaException(org.apache.hadoop.hive.metastore.api.MetaException)
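
When every partition carries a bit vector, both aggregators above prefer merging NumDistinctValueEstimator sketches over the max-of-NDVs fallback, because a merged sketch accounts for value overlap between partitions. A hedged sketch of that merge path, assuming the constructor and method signatures that appear in the listing (this estimator class belongs to the older metastore code base):

import java.util.List;
import org.apache.hadoop.hive.metastore.NumDistinctValueEstimator;

class NdvBitVectorMergeSketch {
    // Merge per-partition FM-sketch bit vectors into a single NDV estimate.
    static long mergeNdv(List<String> partitionBitVectors, int numBitVectors) {
        NumDistinctValueEstimator merged = new NumDistinctValueEstimator(numBitVectors);
        for (String bitVectors : partitionBitVectors) {
            merged.mergeEstimators(new NumDistinctValueEstimator(bitVectors, numBitVectors));
        }
        return merged.estimateNumDistinctValues();
    }
}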

Example 48 with ColumnStatistics

Use of org.apache.hadoop.hive.metastore.api.ColumnStatistics in project hive by apache.

From the class StringColumnStatsAggregator, the aggregate method:

@Override
public ColumnStatisticsObj aggregate(String colName, List<String> partNames, List<ColumnStatistics> css) throws MetaException {
    ColumnStatisticsObj statsObj = null;
    // check if all the ColumnStatisticsObjs contain stats and all the ndv are
    // bitvectors. Only when both of the conditions are true, we merge bit
    // vectors. Otherwise, just use the maximum function.
    boolean doAllPartitionContainStats = partNames.size() == css.size();
    boolean isNDVBitVectorSet = true;
    String colType = null;
    for (ColumnStatistics cs : css) {
        if (cs.getStatsObjSize() != 1) {
            throw new MetaException("The number of columns should be exactly one in aggrStats, but found " + cs.getStatsObjSize());
        }
        ColumnStatisticsObj cso = cs.getStatsObjIterator().next();
        if (statsObj == null) {
            colType = cso.getColType();
            statsObj = ColumnStatsAggregatorFactory.newColumnStaticsObj(colName, colType, cso.getStatsData().getSetField());
        }
        if (numBitVectors <= 0 || !cso.getStatsData().getStringStats().isSetBitVectors() || cso.getStatsData().getStringStats().getBitVectors().length() == 0) {
            isNDVBitVectorSet = false;
            break;
        }
    }
    ColumnStatisticsData columnStatisticsData = new ColumnStatisticsData();
    if (doAllPartitionContainStats && isNDVBitVectorSet) {
        StringColumnStatsData aggregateData = null;
        NumDistinctValueEstimator ndvEstimator = new NumDistinctValueEstimator(numBitVectors);
        for (ColumnStatistics cs : css) {
            ColumnStatisticsObj cso = cs.getStatsObjIterator().next();
            StringColumnStatsData newData = cso.getStatsData().getStringStats();
            ndvEstimator.mergeEstimators(new NumDistinctValueEstimator(newData.getBitVectors(), ndvEstimator.getnumBitVectors()));
            if (aggregateData == null) {
                aggregateData = newData.deepCopy();
            } else {
                aggregateData.setMaxColLen(Math.max(aggregateData.getMaxColLen(), newData.getMaxColLen()));
                aggregateData.setAvgColLen(Math.max(aggregateData.getAvgColLen(), newData.getAvgColLen()));
                aggregateData.setNumNulls(aggregateData.getNumNulls() + newData.getNumNulls());
            }
        }
        aggregateData.setNumDVs(ndvEstimator.estimateNumDistinctValues());
        columnStatisticsData.setStringStats(aggregateData);
    } else {
        StringColumnStatsData aggregateData = null;
        for (ColumnStatistics cs : css) {
            ColumnStatisticsObj cso = cs.getStatsObjIterator().next();
            StringColumnStatsData newData = cso.getStatsData().getStringStats();
            if (aggregateData == null) {
                aggregateData = newData.deepCopy();
            } else {
                aggregateData.setMaxColLen(Math.max(aggregateData.getMaxColLen(), newData.getMaxColLen()));
                aggregateData.setAvgColLen(Math.max(aggregateData.getAvgColLen(), newData.getAvgColLen()));
                aggregateData.setNumNulls(aggregateData.getNumNulls() + newData.getNumNulls());
                aggregateData.setNumDVs(Math.max(aggregateData.getNumDVs(), newData.getNumDVs()));
            }
        }
        columnStatisticsData.setStringStats(aggregateData);
    }
    statsObj.setStatsData(columnStatisticsData);
    return statsObj;
}
Also used : ColumnStatistics(org.apache.hadoop.hive.metastore.api.ColumnStatistics) ColumnStatisticsObj(org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj) StringColumnStatsData(org.apache.hadoop.hive.metastore.api.StringColumnStatsData) NumDistinctValueEstimator(org.apache.hadoop.hive.metastore.NumDistinctValueEstimator) ColumnStatisticsData(org.apache.hadoop.hive.metastore.api.ColumnStatisticsData) MetaException(org.apache.hadoop.hive.metastore.api.MetaException)
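
When bit vectors are unavailable, the string aggregator falls back to a purely field-wise merge: column lengths and NDVs are combined with max, and null counts are summed. A minimal sketch of just that fallback, reusing the Thrift getters and setters that appear in the code above:

import java.util.List;
import org.apache.hadoop.hive.metastore.api.StringColumnStatsData;

class StringStatsMergeSketch {
    // Field-wise merge used when no NDV bit vectors are available.
    static StringColumnStatsData merge(List<StringColumnStatsData> perPartition) {
        StringColumnStatsData merged = null;
        for (StringColumnStatsData d : perPartition) {
            if (merged == null) {
                merged = d.deepCopy();
            } else {
                merged.setMaxColLen(Math.max(merged.getMaxColLen(), d.getMaxColLen()));
                merged.setAvgColLen(Math.max(merged.getAvgColLen(), d.getAvgColLen()));
                merged.setNumNulls(merged.getNumNulls() + d.getNumNulls());
                merged.setNumDVs(Math.max(merged.getNumDVs(), d.getNumDVs()));
            }
        }
        return merged;
    }
}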

Example 49 with ColumnStatistics

Use of org.apache.hadoop.hive.metastore.api.ColumnStatistics in project hive by apache.

From the class ColumnStatsTask, the constructColumnStatsFromPackedRows method:

private List<ColumnStatistics> constructColumnStatsFromPackedRows(Hive db) throws HiveException, MetaException, IOException {
    String currentDb = SessionState.get().getCurrentDatabase();
    String tableName = work.getColStats().getTableName();
    String partName = null;
    List<String> colName = work.getColStats().getColName();
    List<String> colType = work.getColStats().getColType();
    boolean isTblLevel = work.getColStats().isTblLevel();
    List<ColumnStatistics> stats = new ArrayList<ColumnStatistics>();
    InspectableObject packedRow;
    Table tbl = db.getTable(currentDb, tableName);
    while ((packedRow = ftOp.getNextRow()) != null) {
        if (packedRow.oi.getCategory() != ObjectInspector.Category.STRUCT) {
            throw new HiveException("Unexpected object type encountered while unpacking row");
        }
        List<ColumnStatisticsObj> statsObjs = new ArrayList<ColumnStatisticsObj>();
        StructObjectInspector soi = (StructObjectInspector) packedRow.oi;
        List<? extends StructField> fields = soi.getAllStructFieldRefs();
        List<Object> list = soi.getStructFieldsDataAsList(packedRow.o);
        List<FieldSchema> partColSchema = tbl.getPartCols();
        // Partition columns are appended at the end; we only care about the stats columns.
        int numOfStatCols = isTblLevel ? fields.size() : fields.size() - partColSchema.size();
        for (int i = 0; i < numOfStatCols; i++) {
            // Get the field objectInspector, fieldName and the field object.
            ObjectInspector foi = fields.get(i).getFieldObjectInspector();
            Object f = (list == null ? null : list.get(i));
            String fieldName = fields.get(i).getFieldName();
            ColumnStatisticsObj statsObj = new ColumnStatisticsObj();
            statsObj.setColName(colName.get(i));
            statsObj.setColType(colType.get(i));
            unpackStructObject(foi, f, fieldName, statsObj);
            statsObjs.add(statsObj);
        }
        if (!isTblLevel) {
            List<String> partVals = new ArrayList<String>();
            // Iterate over partition columns to figure out partition name
            for (int i = fields.size() - partColSchema.size(); i < fields.size(); i++) {
                Object partVal = ((PrimitiveObjectInspector) fields.get(i).getFieldObjectInspector()).getPrimitiveJavaObject(list.get(i));
                // partVal could be null for the default partition
                partVals.add(partVal == null ? this.conf.getVar(ConfVars.DEFAULTPARTITIONNAME) : partVal.toString());
            }
            partName = Warehouse.makePartName(partColSchema, partVals);
        }
        String[] names = Utilities.getDbTableName(currentDb, tableName);
        ColumnStatisticsDesc statsDesc = getColumnStatsDesc(names[0], names[1], partName, isTblLevel);
        ColumnStatistics colStats = new ColumnStatistics();
        colStats.setStatsDesc(statsDesc);
        colStats.setStatsObj(statsObjs);
        stats.add(colStats);
    }
    ftOp.clearFetchContext();
    return stats;
}
Also used : ColumnStatistics(org.apache.hadoop.hive.metastore.api.ColumnStatistics) HiveDecimalObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.HiveDecimalObjectInspector) ObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector) StructObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector) StringObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.StringObjectInspector) DateObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.DateObjectInspector) PrimitiveObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector) LongObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.LongObjectInspector) DoubleObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.primitive.DoubleObjectInspector) Table(org.apache.hadoop.hive.ql.metadata.Table) HiveException(org.apache.hadoop.hive.ql.metadata.HiveException) FieldSchema(org.apache.hadoop.hive.metastore.api.FieldSchema) ArrayList(java.util.ArrayList) InspectableObject(org.apache.hadoop.hive.serde2.objectinspector.InspectableObject) ColumnStatisticsObj(org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj) ColumnStatisticsDesc(org.apache.hadoop.hive.metastore.api.ColumnStatisticsDesc) InspectableObject(org.apache.hadoop.hive.serde2.objectinspector.InspectableObject) PrimitiveObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector) StructObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector)
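
Stripped of the row unpacking, the method above boils down to assembling the Thrift objects of the metastore API: a ColumnStatisticsDesc identifying the table or partition, one ColumnStatisticsObj per column, and a ColumnStatistics wrapping both. A hand-rolled, table-level sketch, assuming the standard Thrift-generated setters and using an illustrative database, table, and column:

import java.util.Collections;
import org.apache.hadoop.hive.metastore.api.ColumnStatistics;
import org.apache.hadoop.hive.metastore.api.ColumnStatisticsData;
import org.apache.hadoop.hive.metastore.api.ColumnStatisticsDesc;
import org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj;
import org.apache.hadoop.hive.metastore.api.LongColumnStatsData;

class ColumnStatsBuildSketch {
    static ColumnStatistics buildTableLevelStats() {
        ColumnStatisticsDesc desc = new ColumnStatisticsDesc();
        desc.setDbName("default");          // hypothetical database
        desc.setTableName("example_tbl");   // hypothetical table
        desc.setIsTblLevel(true);           // table-level stats, so no partition name

        LongColumnStatsData longStats = new LongColumnStatsData();
        longStats.setLowValue(0);
        longStats.setHighValue(100);
        longStats.setNumNulls(0);
        longStats.setNumDVs(42);

        ColumnStatisticsData data = new ColumnStatisticsData();
        data.setLongStats(longStats);

        ColumnStatisticsObj obj = new ColumnStatisticsObj();
        obj.setColName("id");               // hypothetical column
        obj.setColType("bigint");
        obj.setStatsData(data);

        ColumnStatistics colStats = new ColumnStatistics();
        colStats.setStatsDesc(desc);
        colStats.setStatsObj(Collections.singletonList(obj));
        return colStats;
    }
}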

Example 50 with ColumnStatistics

Use of org.apache.hadoop.hive.metastore.api.ColumnStatistics in project hive by apache.

From the class ColumnStatsUpdateTask, the persistColumnStats method:

private int persistColumnStats(Hive db) throws HiveException, MetaException, IOException {
    List<ColumnStatistics> colStats = new ArrayList<>();
    colStats.add(constructColumnStatsFromInput());
    SetPartitionsStatsRequest request = new SetPartitionsStatsRequest(colStats);
    db.setPartitionColumnStatistics(request);
    return 0;
}
Also used : ColumnStatistics(org.apache.hadoop.hive.metastore.api.ColumnStatistics) ArrayList(java.util.ArrayList) SetPartitionsStatsRequest(org.apache.hadoop.hive.metastore.api.SetPartitionsStatsRequest)
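
For reference, the same request shape can also be issued directly against a metastore client rather than through the Hive db wrapper used above; a hedged sketch, assuming an IMetaStoreClient handle exposing setPartitionColumnStatistics and the SetPartitionsStatsRequest constructor shown in the listing:

import java.util.Collections;
import org.apache.hadoop.hive.metastore.IMetaStoreClient;
import org.apache.hadoop.hive.metastore.api.ColumnStatistics;
import org.apache.hadoop.hive.metastore.api.SetPartitionsStatsRequest;

class PersistStatsSketch {
    // Wrap a single ColumnStatistics in a SetPartitionsStatsRequest and persist it.
    static void persist(IMetaStoreClient client, ColumnStatistics colStats) throws Exception {
        SetPartitionsStatsRequest request =
            new SetPartitionsStatsRequest(Collections.singletonList(colStats));
        client.setPartitionColumnStatistics(request);  // assumed client API; the task above goes through Hive db
    }
}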

Aggregations

ColumnStatistics (org.apache.hadoop.hive.metastore.api.ColumnStatistics): 90 usages
ColumnStatisticsObj (org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj): 75 usages
ColumnStatisticsDesc (org.apache.hadoop.hive.metastore.api.ColumnStatisticsDesc): 67 usages
ColumnStatisticsData (org.apache.hadoop.hive.metastore.api.ColumnStatisticsData): 62 usages
ArrayList (java.util.ArrayList): 61 usages
Test (org.junit.Test): 53 usages
FieldSchema (org.apache.hadoop.hive.metastore.api.FieldSchema): 40 usages
Table (org.apache.hadoop.hive.metastore.api.Table): 38 usages
Partition (org.apache.hadoop.hive.metastore.api.Partition): 33 usages
StorageDescriptor (org.apache.hadoop.hive.metastore.api.StorageDescriptor): 32 usages
SerDeInfo (org.apache.hadoop.hive.metastore.api.SerDeInfo): 31 usages
AggrStats (org.apache.hadoop.hive.metastore.api.AggrStats): 29 usages
List (java.util.List): 26 usages
LongColumnStatsData (org.apache.hadoop.hive.metastore.api.LongColumnStatsData): 19 usages
MetaException (org.apache.hadoop.hive.metastore.api.MetaException): 14 usages
BooleanColumnStatsData (org.apache.hadoop.hive.metastore.api.BooleanColumnStatsData): 13 usages
DoubleColumnStatsData (org.apache.hadoop.hive.metastore.api.DoubleColumnStatsData): 12 usages
StringColumnStatsData (org.apache.hadoop.hive.metastore.api.StringColumnStatsData): 12 usages
HashMap (java.util.HashMap): 11 usages
Database (org.apache.hadoop.hive.metastore.api.Database): 9 usages