Search in sources :

Example 81 with ColumnStatisticsObj

use of org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj in project hive by apache.

the class StatObjectConverter method getPartitionColumnStatisticsObj.

/**
 * Builds a {@link ColumnStatisticsObj} from a partition-level column statistics model object.
 *
 * <p>The column name and type are copied as-is. The lower-cased column type then selects which
 * kind of stats data is populated (boolean, string, binary, long, double, decimal, date or
 * timestamp). If the type matches none of the handled cases, the returned object carries a
 * {@link ColumnStatisticsData} with no stats set.
 *
 * @param mStatsObj source statistics; its column type must be non-null
 * @param enableBitVector when {@code false}, the bit vectors of the produced stats are set to
 *     {@code null} regardless of what {@code mStatsObj} holds
 * @return the populated statistics object
 */
public static ColumnStatisticsObj getPartitionColumnStatisticsObj(MPartitionColumnStatistics mStatsObj, boolean enableBitVector) {
    ColumnStatisticsObj statsObj = new ColumnStatisticsObj();
    statsObj.setColType(mStatsObj.getColType());
    statsObj.setColName(mStatsObj.getColName());
    // Lower-case with Locale.ROOT: with the default locale, a Turkish/Azeri JVM maps 'I' to a
    // dotless 'ı', so an upper-case type name such as "INT" would match none of the branches below.
    String colType = mStatsObj.getColType().toLowerCase(java.util.Locale.ROOT);
    ColumnStatisticsData colStatsData = new ColumnStatisticsData();
    if (colType.equals("boolean")) {
        BooleanColumnStatsData boolStats = new BooleanColumnStatsData();
        boolStats.setNumFalses(mStatsObj.getNumFalses());
        boolStats.setNumTrues(mStatsObj.getNumTrues());
        boolStats.setNumNulls(mStatsObj.getNumNulls());
        colStatsData.setBooleanStats(boolStats);
    } else if (colType.equals("string") || colType.startsWith("varchar") || colType.startsWith("char")) {
        StringColumnStatsDataInspector stringStats = new StringColumnStatsDataInspector();
        stringStats.setNumNulls(mStatsObj.getNumNulls());
        stringStats.setAvgColLen(mStatsObj.getAvgColLen());
        stringStats.setMaxColLen(mStatsObj.getMaxColLen());
        stringStats.setNumDVs(mStatsObj.getNumDVs());
        // A null stored bit vector stays null; a disabled flag forces null.
        stringStats.setBitVectors(enableBitVector ? mStatsObj.getBitVector() : null);
        colStatsData.setStringStats(stringStats);
    } else if (colType.equals("binary")) {
        BinaryColumnStatsData binaryStats = new BinaryColumnStatsData();
        binaryStats.setNumNulls(mStatsObj.getNumNulls());
        binaryStats.setAvgColLen(mStatsObj.getAvgColLen());
        binaryStats.setMaxColLen(mStatsObj.getMaxColLen());
        colStatsData.setBinaryStats(binaryStats);
    } else if (colType.equals("tinyint") || colType.equals("smallint") || colType.equals("int") || colType.equals("bigint")) {
        LongColumnStatsDataInspector longStats = new LongColumnStatsDataInspector();
        longStats.setNumNulls(mStatsObj.getNumNulls());
        // High/low values are optional in the model; only set them when present.
        if (mStatsObj.getLongHighValue() != null) {
            longStats.setHighValue(mStatsObj.getLongHighValue());
        }
        if (mStatsObj.getLongLowValue() != null) {
            longStats.setLowValue(mStatsObj.getLongLowValue());
        }
        longStats.setNumDVs(mStatsObj.getNumDVs());
        longStats.setBitVectors(enableBitVector ? mStatsObj.getBitVector() : null);
        colStatsData.setLongStats(longStats);
    } else if (colType.equals("double") || colType.equals("float")) {
        DoubleColumnStatsDataInspector doubleStats = new DoubleColumnStatsDataInspector();
        doubleStats.setNumNulls(mStatsObj.getNumNulls());
        if (mStatsObj.getDoubleHighValue() != null) {
            doubleStats.setHighValue(mStatsObj.getDoubleHighValue());
        }
        if (mStatsObj.getDoubleLowValue() != null) {
            doubleStats.setLowValue(mStatsObj.getDoubleLowValue());
        }
        doubleStats.setNumDVs(mStatsObj.getNumDVs());
        doubleStats.setBitVectors(enableBitVector ? mStatsObj.getBitVector() : null);
        colStatsData.setDoubleStats(doubleStats);
    } else if (colType.startsWith("decimal")) {
        DecimalColumnStatsDataInspector decimalStats = new DecimalColumnStatsDataInspector();
        decimalStats.setNumNulls(mStatsObj.getNumNulls());
        if (mStatsObj.getDecimalHighValue() != null) {
            decimalStats.setHighValue(DecimalUtils.createThriftDecimal(mStatsObj.getDecimalHighValue()));
        }
        if (mStatsObj.getDecimalLowValue() != null) {
            decimalStats.setLowValue(DecimalUtils.createThriftDecimal(mStatsObj.getDecimalLowValue()));
        }
        decimalStats.setNumDVs(mStatsObj.getNumDVs());
        decimalStats.setBitVectors(enableBitVector ? mStatsObj.getBitVector() : null);
        colStatsData.setDecimalStats(decimalStats);
    } else if (colType.equals("date")) {
        DateColumnStatsDataInspector dateStats = new DateColumnStatsDataInspector();
        dateStats.setNumNulls(mStatsObj.getNumNulls());
        // Date bounds are read from the model's long high/low values.
        Long highValue = mStatsObj.getLongHighValue();
        if (highValue != null) {
            dateStats.setHighValue(new Date(highValue));
        }
        Long lowValue = mStatsObj.getLongLowValue();
        if (lowValue != null) {
            dateStats.setLowValue(new Date(lowValue));
        }
        dateStats.setNumDVs(mStatsObj.getNumDVs());
        dateStats.setBitVectors(enableBitVector ? mStatsObj.getBitVector() : null);
        colStatsData.setDateStats(dateStats);
    } else if (colType.equals("timestamp")) {
        TimestampColumnStatsDataInspector timestampStats = new TimestampColumnStatsDataInspector();
        timestampStats.setNumNulls(mStatsObj.getNumNulls());
        // Timestamp bounds are read from the model's long high/low values.
        Long highValue = mStatsObj.getLongHighValue();
        if (highValue != null) {
            timestampStats.setHighValue(new Timestamp(highValue));
        }
        Long lowValue = mStatsObj.getLongLowValue();
        if (lowValue != null) {
            timestampStats.setLowValue(new Timestamp(lowValue));
        }
        timestampStats.setNumDVs(mStatsObj.getNumDVs());
        timestampStats.setBitVectors(enableBitVector ? mStatsObj.getBitVector() : null);
        colStatsData.setTimestampStats(timestampStats);
    }
    statsObj.setStatsData(colStatsData);
    return statsObj;
}
Also used : BooleanColumnStatsData(org.apache.hadoop.hive.metastore.api.BooleanColumnStatsData) DateColumnStatsDataInspector(org.apache.hadoop.hive.metastore.columnstats.cache.DateColumnStatsDataInspector) Timestamp(org.apache.hadoop.hive.metastore.api.Timestamp) BinaryColumnStatsData(org.apache.hadoop.hive.metastore.api.BinaryColumnStatsData) Date(org.apache.hadoop.hive.metastore.api.Date) ColumnStatisticsObj(org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj) DecimalColumnStatsDataInspector(org.apache.hadoop.hive.metastore.columnstats.cache.DecimalColumnStatsDataInspector) DoubleColumnStatsDataInspector(org.apache.hadoop.hive.metastore.columnstats.cache.DoubleColumnStatsDataInspector) LongColumnStatsDataInspector(org.apache.hadoop.hive.metastore.columnstats.cache.LongColumnStatsDataInspector) TimestampColumnStatsDataInspector(org.apache.hadoop.hive.metastore.columnstats.cache.TimestampColumnStatsDataInspector) StringColumnStatsDataInspector(org.apache.hadoop.hive.metastore.columnstats.cache.StringColumnStatsDataInspector) ColumnStatisticsData(org.apache.hadoop.hive.metastore.api.ColumnStatisticsData)

Example 82 with ColumnStatisticsObj

use of org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj in project hive by apache.

the class SharedCache method populateTableInCache.

/**
 * Builds a cache wrapper for {@code table}, fills it from {@code cacheObjects} and stores it in
 * the table cache.
 *
 * @param table the table to cache; its catalog, database and table names are normalized to
 *     build the cache key
 * @param cacheObjects partitions, column statistics, aggregate statistics and constraints to
 *     attach to the cached table
 * @return {@code true} if the table was put into the cache; {@code false} if the table was
 *     deleted during prewarm, or if caching partitions, partition column stats or constraints
 *     reported failure
 */
public boolean populateTableInCache(Table table, TableCacheObjects cacheObjects) {
    String catName = StringUtils.normalizeIdentifier(table.getCatName());
    String dbName = StringUtils.normalizeIdentifier(table.getDbName());
    String tableName = StringUtils.normalizeIdentifier(table.getTableName());
    SQLAllTableConstraints constraints = cacheObjects.getTableConstraints();
    // 1. Don't add tables that were deleted while we were preparing list for prewarm
    if (tablesDeletedDuringPrewarm.contains(CacheUtils.buildTableKey(catName, dbName, tableName))) {
        return false;
    }
    TableWrapper tblWrapper = createTableWrapper(catName, dbName, tableName, table);
    if (!table.isSetPartitionKeys() && (cacheObjects.getTableColStats() != null)) {
        // NOTE(review): this branch never stores the table column stats and dereferences
        // getPartitionKeys() right after isSetPartitionKeys() returned false. Kept as-is because
        // the intended behaviour cannot be determined from here - confirm against upstream.
        if (table.getPartitionKeys().isEmpty() && (cacheObjects.getTableColStats() != null)) {
            return false;
        }
    } else {
        if (cacheObjects.getPartitions() != null) {
            // If the partitions were not added due to memory limit, return false
            if (!tblWrapper.cachePartitions(cacheObjects.getPartitions(), this, true)) {
                return false;
            }
        }
        if (cacheObjects.getPartitionColStats() != null) {
            for (ColumnStatistics cs : cacheObjects.getPartitionColStats()) {
                List<String> partVal;
                try {
                    partVal = Warehouse.makeValsFromName(cs.getStatsDesc().getPartName(), null);
                    List<ColumnStatisticsObj> colStats = cs.getStatsObj();
                    if (!tblWrapper.updatePartitionColStats(partVal, colStats)) {
                        return false;
                    }
                } catch (MetaException e) {
                    // Best effort: a partition whose stats cannot be cached is skipped, not fatal.
                    LOG.debug("Unable to cache partition column stats for table: " + tableName, e);
                }
            }
        }
        tblWrapper.cacheAggrPartitionColStats(cacheObjects.getAggrStatsAllPartitions(), cacheObjects.getAggrStatsAllButDefaultPartition());
    }
    tblWrapper.setMemberCacheUpdated(MemberName.PARTITION_CACHE, false);
    tblWrapper.setMemberCacheUpdated(MemberName.TABLE_COL_STATS_CACHE, false);
    tblWrapper.setMemberCacheUpdated(MemberName.PARTITION_COL_STATS_CACHE, false);
    tblWrapper.setMemberCacheUpdated(MemberName.AGGR_COL_STATS_CACHE, false);
    if (tblWrapper.cacheConstraints(constraints, true)) {
        tblWrapper.setMemberCacheUpdated(MemberName.PRIMARY_KEY_CACHE, false);
        tblWrapper.setMemberCacheUpdated(MemberName.FOREIGN_KEY_CACHE, false);
        tblWrapper.setMemberCacheUpdated(MemberName.NOTNULL_CONSTRAINT_CACHE, false);
        tblWrapper.setMemberCacheUpdated(MemberName.UNIQUE_CONSTRAINT_CACHE, false);
        tblWrapper.setMemberCacheUpdated(MemberName.DEFAULT_CONSTRAINT_CACHE, false);
        tblWrapper.setMemberCacheUpdated(MemberName.CHECK_CONSTRAINT_CACHE, false);
    } else {
        return false;
    }
    // Acquire the lock before entering the try block: if lock() itself fails, the finally
    // clause must not attempt to unlock a lock this thread does not hold.
    cacheLock.writeLock().lock();
    try {
        // 2. Store the table wrapper under its key.
        // NOTE(review): the original comment said "skip overwriting existing table object",
        // but put(...) is used here - confirm whether overwriting is intended.
        tableCache.put(CacheUtils.buildTableKey(catName, dbName, tableName), tblWrapper);
        return true;
    } finally {
        cacheLock.writeLock().unlock();
    }
}
Also used : ColumnStatistics(org.apache.hadoop.hive.metastore.api.ColumnStatistics) ColumnStatisticsObj(org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj) SQLAllTableConstraints(org.apache.hadoop.hive.metastore.api.SQLAllTableConstraints) MetaException(org.apache.hadoop.hive.metastore.api.MetaException)

Example 83 with ColumnStatisticsObj

use of org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj in project hive by apache.

the class MetaStoreServerUtils method mergeColStats.

/**
 * Merges the column statistics of {@code csOld} into {@code csNew}.
 *
 * <p>For every column in {@code csNew} that also appears (by column name) in {@code csOld}, a
 * {@link ColumnStatsMerger} obtained from {@link ColumnStatsMergerFactory} folds the old stats
 * into the new ones. Columns found only in {@code csNew} are kept unchanged, and columns found
 * only in {@code csOld} are not carried over. The resulting list replaces the stats object list
 * of {@code csNew}.
 *
 * @param csNew freshly computed statistics; updated in place
 * @param csOld previously stored statistics to merge from
 * @throws InvalidObjectException declared by the signature
 */
public static void mergeColStats(ColumnStatistics csNew, ColumnStatistics csOld) throws InvalidObjectException {
    if (csNew.getStatsObj().size() != csOld.getStatsObjSize()) {
        // A size mismatch means some column stats are missing on one side, which implies the
        // partition schema has changed. Columns present on both sides are merged, columns absent
        // from the old stats are simply taken from the new ones, and columns missing from the new
        // stats are left alone (which may leave them stale).
        LOG.debug("New ColumnStats size is {}, but old ColumnStats size is {}", csNew.getStatsObj().size(), csOld.getStatsObjSize());
    }
    // Index the old stats by column name so each new column can look up its counterpart.
    Map<String, ColumnStatisticsObj> oldStatsByColumn = new HashMap<>();
    for (ColumnStatisticsObj oldStats : csOld.getStatsObj()) {
        oldStatsByColumn.put(oldStats.getColName(), oldStats);
    }
    List<ColumnStatisticsObj> mergedStats = new ArrayList<>();
    for (ColumnStatisticsObj newStats : csNew.getStatsObj()) {
        ColumnStatisticsObj oldStats = oldStatsByColumn.get(newStats.getColName());
        if (oldStats != null) {
            // Both sides are expected to carry the same kind of stats data.
            assert (newStats.getStatsData().getSetField() == oldStats.getStatsData().getSetField());
            ColumnStatsMergerFactory.getColumnStatsMerger(newStats, oldStats).merge(newStats, oldStats);
        }
        // With or without a merge, the (possibly updated) new stats object is what gets kept.
        mergedStats.add(newStats);
    }
    csNew.setStatsObj(mergedStats);
}
Also used : ColumnStatisticsObj(org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) ColumnStatsMerger(org.apache.hadoop.hive.metastore.columnstats.merge.ColumnStatsMerger)

Example 84 with ColumnStatisticsObj

use of org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj in project hive by apache.

the class MetaStoreServerUtils method aggrPartitionStats.

/**
 * Aggregates per-partition column statistics into one {@link ColumnStatisticsObj} per entry of
 * {@code colStatsMap}, running the aggregators on a fixed-size pool of daemon threads.
 *
 * <p>The pool size is the smaller of the number of map entries and the number of available
 * processors. Aggregators that return {@code null} contribute nothing to the result.
 *
 * @param colStatsMap aggregator to the per-partition stats it should aggregate; when empty, an
 *     empty list is returned without creating a thread pool
 * @param partNames partition names, passed through to each aggregator
 * @param areAllPartsFound passed through to each aggregator
 * @param useDensityFunctionForNDVEstimation not read by this method
 * @param ndvTuner not read by this method
 * @return the non-null aggregated statistics, in the iteration order of {@code colStatsMap}
 * @throws MetaException if an aggregation task fails or the calling thread is interrupted while
 *     waiting for a result
 */
public static List<ColumnStatisticsObj> aggrPartitionStats(Map<ColumnStatsAggregator, List<ColStatsObjWithSourceInfo>> colStatsMap, final List<String> partNames, final boolean areAllPartsFound, final boolean useDensityFunctionForNDVEstimation, final double ndvTuner) throws MetaException {
    List<ColumnStatisticsObj> aggrColStatObjs = new ArrayList<ColumnStatisticsObj>();
    if (colStatsMap.isEmpty()) {
        // Executors.newFixedThreadPool rejects a pool size of 0 with IllegalArgumentException,
        // and there is nothing to aggregate anyway.
        return aggrColStatObjs;
    }
    final int numThreads = Math.min(colStatsMap.size(), Runtime.getRuntime().availableProcessors());
    final ExecutorService pool = Executors.newFixedThreadPool(numThreads, new ThreadFactoryBuilder().setDaemon(true).setNameFormat("aggr-col-stats-%d").build());
    final List<Future<ColumnStatisticsObj>> futures = Lists.newLinkedList();
    LOG.debug("Aggregating column stats. Threads used: {}", numThreads);
    long start = System.currentTimeMillis();
    for (final Map.Entry<ColumnStatsAggregator, List<ColStatsObjWithSourceInfo>> entry : colStatsMap.entrySet()) {
        futures.add(pool.submit(new Callable<ColumnStatisticsObj>() {

            @Override
            public ColumnStatisticsObj call() throws MetaException {
                List<ColStatsObjWithSourceInfo> colStatWithSourceInfo = entry.getValue();
                ColumnStatsAggregator aggregator = entry.getKey();
                try {
                    return aggregator.aggregate(colStatWithSourceInfo, partNames, areAllPartsFound);
                } catch (MetaException e) {
                    LOG.debug(e.getMessage());
                    throw e;
                }
            }
        }));
    }
    // No further tasks will be submitted; already-submitted tasks keep running.
    pool.shutdown();
    for (Future<ColumnStatisticsObj> future : futures) {
        try {
            ColumnStatisticsObj statsObj = future.get();
            if (statsObj != null) {
                aggrColStatObjs.add(statsObj);
            }
        } catch (InterruptedException | ExecutionException e) {
            if (e instanceof InterruptedException) {
                // Restore the interrupt flag so callers further up can observe the interruption.
                Thread.currentThread().interrupt();
            }
            LOG.debug(e.getMessage());
            // Cancel the remaining aggregation tasks; their results would be discarded anyway.
            pool.shutdownNow();
            MetaException metaException = new MetaException(e.toString());
            // Keep the original failure attached for diagnosis.
            metaException.initCause(e);
            throw metaException;
        }
    }
    LOG.debug("Time for aggr col stats in seconds: {} Threads used: {}", ((System.currentTimeMillis() - (double) start)) / 1000, numThreads);
    return aggrColStatObjs;
}
Also used : ArrayList(java.util.ArrayList) Callable(java.util.concurrent.Callable) ColumnStatisticsObj(org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj) ColumnStatsAggregator(org.apache.hadoop.hive.metastore.columnstats.aggr.ColumnStatsAggregator) ExecutorService(java.util.concurrent.ExecutorService) ThreadFactoryBuilder(com.google.common.util.concurrent.ThreadFactoryBuilder) Future(java.util.concurrent.Future) MachineList(org.apache.hadoop.util.MachineList) List(java.util.List) ArrayList(java.util.ArrayList) ExecutionException(java.util.concurrent.ExecutionException) Map(java.util.Map) SortedMap(java.util.SortedMap) HashMap(java.util.HashMap) TreeMap(java.util.TreeMap) MetaException(org.apache.hadoop.hive.metastore.api.MetaException)

Example 85 with ColumnStatisticsObj

use of org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj in project hive by apache.

the class DateColumnStatsMergerTest method testMergeNullMinMaxValues.

/**
 * Merging a stats object whose low and high date values are both null with itself must leave
 * both values null.
 */
@Test
public void testMergeNullMinMaxValues() {
    ColumnStatisticsObj statsObj = new ColumnStatisticsObj();
    // Populate the object with null low/high values.
    createData(statsObj, null, null);

    // The same object is used as both merge arguments.
    merger.merge(statsObj, statsObj);

    Assert.assertNull(statsObj.getStatsData().getDateStats().getLowValue());
    Assert.assertNull(statsObj.getStatsData().getDateStats().getHighValue());
}
Also used : ColumnStatisticsObj(org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj) Test(org.junit.Test) MetastoreUnitTest(org.apache.hadoop.hive.metastore.annotation.MetastoreUnitTest)

Aggregations

ColumnStatisticsObj (org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj)219 ColumnStatisticsData (org.apache.hadoop.hive.metastore.api.ColumnStatisticsData)104 ArrayList (java.util.ArrayList)98 ColumnStatistics (org.apache.hadoop.hive.metastore.api.ColumnStatistics)82 Test (org.junit.Test)79 ColumnStatisticsDesc (org.apache.hadoop.hive.metastore.api.ColumnStatisticsDesc)68 FieldSchema (org.apache.hadoop.hive.metastore.api.FieldSchema)43 Table (org.apache.hadoop.hive.metastore.api.Table)43 LongColumnStatsData (org.apache.hadoop.hive.metastore.api.LongColumnStatsData)35 Partition (org.apache.hadoop.hive.metastore.api.Partition)35 List (java.util.List)34 BooleanColumnStatsData (org.apache.hadoop.hive.metastore.api.BooleanColumnStatsData)30 AggrStats (org.apache.hadoop.hive.metastore.api.AggrStats)29 StorageDescriptor (org.apache.hadoop.hive.metastore.api.StorageDescriptor)29 HashMap (java.util.HashMap)28 SerDeInfo (org.apache.hadoop.hive.metastore.api.SerDeInfo)28 DoubleColumnStatsData (org.apache.hadoop.hive.metastore.api.DoubleColumnStatsData)27 StringColumnStatsData (org.apache.hadoop.hive.metastore.api.StringColumnStatsData)25 MetaException (org.apache.hadoop.hive.metastore.api.MetaException)23 BinaryColumnStatsData (org.apache.hadoop.hive.metastore.api.BinaryColumnStatsData)22