Search in sources :

Example 36 with AggrStats

use of org.apache.hadoop.hive.metastore.api.AggrStats in project hive by apache.

the class CachedStore method prewarm.

/**
 * Initializes the caches in SharedCache by reading the objects from the Metastore DB via the
 * given raw store and populating the respective caches.
 *
 * <p>Returns immediately when the cache is already marked as prewarmed. Otherwise it loads all
 * databases, then for every table accepted by {@code shouldCacheTable} it loads the table and
 * either its partitions, partition column statistics and aggregate statistics (when partition
 * keys are set) or its table column statistics. Failures on a single database or table are
 * logged and that object is skipped; a failure to list the databases restarts the whole pass.
 *
 * @param rawStore the store the databases, tables, partitions and statistics are read from
 */
@VisibleForTesting
static void prewarm(RawStore rawStore) {
    if (isCachePrewarmed.get()) {
        return;
    }
    long startTime = System.nanoTime();
    LOG.info("Prewarming CachedStore");
    while (!isCachePrewarmed.get()) {
        // Prevents throwing exceptions in our raw store calls since we're not using RawStoreProxy
        Deadline.registerIfNot(1000000);
        List<String> dbNames;
        try {
            dbNames = rawStore.getAllDatabases();
        } catch (MetaException e) {
            // Try again, but leave a trace: this retry is unbounded and was previously silent.
            LOG.warn("Failed to list databases while prewarming CachedStore, trying again", e);
            continue;
        }
        LOG.info("Number of databases to prewarm: {}", dbNames.size());
        List<Database> databases = new ArrayList<>(dbNames.size());
        for (String dbName : dbNames) {
            try {
                databases.add(rawStore.getDatabase(dbName));
            } catch (NoSuchObjectException e) {
                // Continue with next database
                LOG.warn("Database {} not found while prewarming CachedStore, skipping it", dbName, e);
            }
        }
        sharedCache.populateDatabasesInCache(databases);
        LOG.debug("Databases cache is now prewarmed. Now adding tables, partitions and statistics to the cache");
        int numberOfDatabasesCachedSoFar = 0;
        for (String dbName : dbNames) {
            dbName = StringUtils.normalizeIdentifier(dbName);
            List<String> tblNames;
            try {
                tblNames = rawStore.getAllTables(dbName);
            } catch (MetaException e) {
                // Continue with next database
                LOG.warn("Failed to list tables of database {} while prewarming CachedStore, skipping it", dbName, e);
                continue;
            }
            int numberOfTablesCachedSoFar = 0;
            for (String tblName : tblNames) {
                tblName = StringUtils.normalizeIdentifier(tblName);
                if (!shouldCacheTable(dbName, tblName)) {
                    continue;
                }
                Table table;
                try {
                    table = rawStore.getTable(dbName, tblName);
                } catch (MetaException e) {
                    // in that case, continue with the next table
                    LOG.warn("Failed to load table {}.{} while prewarming CachedStore, skipping it", dbName, tblName, e);
                    continue;
                }
                List<String> colNames = MetaStoreUtils.getColumnNamesForTable(table);
                try {
                    // NOTE(review): when one of the raw store calls below throws, the matching
                    // Deadline.stopTimer() is skipped - confirm that this is harmless.
                    ColumnStatistics tableColStats = null;
                    List<Partition> partitions = null;
                    List<ColumnStatistics> partitionColStats = null;
                    AggrStats aggrStatsAllPartitions = null;
                    AggrStats aggrStatsAllButDefaultPartition = null;
                    if (table.isSetPartitionKeys()) {
                        Deadline.startTimer("getPartitions");
                        partitions = rawStore.getPartitions(dbName, tblName, Integer.MAX_VALUE);
                        Deadline.stopTimer();
                        List<String> partNames = new ArrayList<>(partitions.size());
                        for (Partition p : partitions) {
                            partNames.add(Warehouse.makePartName(table.getPartitionKeys(), p.getValues()));
                        }
                        if (!partNames.isEmpty()) {
                            // Get partition column stats for this table
                            Deadline.startTimer("getPartitionColumnStatistics");
                            partitionColStats = rawStore.getPartitionColumnStatistics(dbName, tblName, partNames, colNames);
                            Deadline.stopTimer();
                            // Get aggregate stats for all partitions of a table and for all but default
                            // partition
                            Deadline.startTimer("getAggrPartitionColumnStatistics");
                            aggrStatsAllPartitions = rawStore.get_aggr_stats_for(dbName, tblName, partNames, colNames);
                            Deadline.stopTimer();
                            // Remove default partition from partition names and get aggregate
                            // stats again. The default partition name is built by giving every
                            // partition key the configured default partition value.
                            List<FieldSchema> partKeys = table.getPartitionKeys();
                            String defaultPartitionValue = MetastoreConf.getVar(rawStore.getConf(), ConfVars.DEFAULTPARTITIONNAME);
                            List<String> partCols = new ArrayList<>();
                            List<String> partVals = new ArrayList<>();
                            for (FieldSchema fs : partKeys) {
                                partCols.add(fs.getName());
                                partVals.add(defaultPartitionValue);
                            }
                            String defaultPartitionName = FileUtils.makePartName(partCols, partVals);
                            partNames.remove(defaultPartitionName);
                            Deadline.startTimer("getAggrPartitionColumnStatistics");
                            aggrStatsAllButDefaultPartition = rawStore.get_aggr_stats_for(dbName, tblName, partNames, colNames);
                            Deadline.stopTimer();
                        }
                    } else {
                        Deadline.startTimer("getTableColumnStatistics");
                        tableColStats = rawStore.getTableColumnStatistics(dbName, tblName, colNames);
                        Deadline.stopTimer();
                    }
                    sharedCache.populateTableInCache(table, tableColStats, partitions, partitionColStats, aggrStatsAllPartitions, aggrStatsAllButDefaultPartition);
                } catch (MetaException | NoSuchObjectException e) {
                    // Continue with next table
                    LOG.warn("Failed to cache table {}.{} while prewarming CachedStore, skipping it", dbName, tblName, e);
                    continue;
                }
                LOG.debug("Processed database: {}'s table: {}. Cached {} / {}  tables so far.", dbName, tblName, ++numberOfTablesCachedSoFar, tblNames.size());
            }
            LOG.debug("Processed database: {}. Cached {} / {} databases so far.", dbName, ++numberOfDatabasesCachedSoFar, dbNames.size());
        }
        isCachePrewarmed.set(true);
    }
    LOG.info("CachedStore initialized");
    long endTime = System.nanoTime();
    // System.nanoTime() is in nanoseconds; dividing by 1000000 yields milliseconds.
    LOG.info("Time taken in prewarming = {}ms", (endTime - startTime) / 1000000);
    sharedCache.completeTableCachePrewarm();
}
Also used : ColumnStatistics(org.apache.hadoop.hive.metastore.api.ColumnStatistics) Partition(org.apache.hadoop.hive.metastore.api.Partition) Table(org.apache.hadoop.hive.metastore.api.Table) AggrStats(org.apache.hadoop.hive.metastore.api.AggrStats) FieldSchema(org.apache.hadoop.hive.metastore.api.FieldSchema) ArrayList(java.util.ArrayList) SQLCheckConstraint(org.apache.hadoop.hive.metastore.api.SQLCheckConstraint) SQLNotNullConstraint(org.apache.hadoop.hive.metastore.api.SQLNotNullConstraint) SQLUniqueConstraint(org.apache.hadoop.hive.metastore.api.SQLUniqueConstraint) SQLDefaultConstraint(org.apache.hadoop.hive.metastore.api.SQLDefaultConstraint) Database(org.apache.hadoop.hive.metastore.api.Database) NoSuchObjectException(org.apache.hadoop.hive.metastore.api.NoSuchObjectException) MetaException(org.apache.hadoop.hive.metastore.api.MetaException) VisibleForTesting(com.google.common.annotations.VisibleForTesting)

Example 37 with AggrStats

use of org.apache.hadoop.hive.metastore.api.AggrStats in project hive by apache.

the class StatsUtils method collectStatistics.

/**
 * Builds the {@link Statistics} of a table from its basic stats and, when requested, its column
 * stats.
 *
 * <p>For a non-partitioned table the basic stats are built for the table itself. For a
 * partitioned table with a non-null {@code partList}, the basic stats of all not-denied
 * partitions are combined, and column stats that are not already available from
 * {@code colStatsCache} are fetched as aggregate stats when fetching is enabled.
 *
 * @param conf configuration; read for HIVE_STATS_FETCH_COLUMN_STATS and HIVE_STATS_ESTIMATE_STATS
 * @param partList pruned partitions of the table; only consulted when the table is partitioned
 * @param table the table to collect statistics for
 * @param schema column infos used for row size estimation and column stats lookup
 * @param neededColumns names of the columns whose stats are needed
 * @param colStatsCache cache consulted for already known column stats
 * @param referencedColumns column names used to derive the column stats state of a
 *        partitioned table
 * @param needColStats whether column stats should be collected at all
 * @param failIfCacheMiss whether to fail when stats are complete but some columns still had to
 *        be looked up outside {@code colStatsCache}
 * @return the collected statistics, or {@code null} when the table is partitioned and
 *         {@code partList} is {@code null}
 * @throws HiveException when {@code failIfCacheMiss} is set and columns were missing from the
 *         non-null cache although the resulting column stats state is COMPLETE
 */
private static Statistics collectStatistics(HiveConf conf, PrunedPartitionList partList, Table table, List<ColumnInfo> schema, List<String> neededColumns, ColumnStatsList colStatsCache, List<String> referencedColumns, boolean needColStats, boolean failIfCacheMiss) throws HiveException {
    Statistics stats = null;
    boolean fetchColStats = HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_STATS_FETCH_COLUMN_STATS);
    boolean estimateStats = HiveConf.getBoolVar(conf, ConfVars.HIVE_STATS_ESTIMATE_STATS);
    if (!table.isPartitioned()) {
        Factory basicStatsFactory = new BasicStats.Factory();
        if (estimateStats) {
            basicStatsFactory.addEnhancer(new BasicStats.DataSizeEstimator(conf));
        }
        // long ds = shouldEstimateStats? getDataSize(conf, table): getRawDataSize(table);
        basicStatsFactory.addEnhancer(new BasicStats.RowNumEstimator(estimateRowSizeFromSchema(conf, schema)));
        basicStatsFactory.addEnhancer(new BasicStats.SetMinRowNumber01());
        BasicStats basicStats = basicStatsFactory.build(Partish.buildFor(table));
        // long nr = getNumRows(conf, schema, neededColumns, table, ds);
        long ds = basicStats.getDataSize();
        long nr = basicStats.getNumRows();
        long fs = basicStats.getTotalFileSize();
        List<ColStatistics> colStats = Collections.emptyList();
        long numErasureCodedFiles = getErasureCodedFiles(table);
        if (needColStats) {
            colStats = getTableColumnStats(table, schema, neededColumns, colStatsCache, fetchColStats);
            if (estimateStats) {
                estimateStatsForMissingCols(neededColumns, colStats, table, conf, nr, schema);
            }
            // we should have stats for all columns (estimated or actual)
            if (neededColumns.size() == colStats.size()) {
                long betterDS = getDataSizeFromColumnStats(nr, colStats);
                ds = (betterDS < 1 || colStats.isEmpty()) ? ds : betterDS;
            }
        }
        stats = new Statistics(nr, ds, fs, numErasureCodedFiles);
        // infer if any column can be primary key based on column statistics
        inferAndSetPrimaryKey(stats.getNumRows(), colStats);
        stats.setColumnStatsState(deriveStatType(colStats, neededColumns));
        stats.addToColumnStats(colStats);
    } else if (partList != null) {
        // For partitioned tables, get the size of all the partitions after pruning
        // the partitions that are not required
        Factory basicStatsFactory = new Factory();
        if (estimateStats) {
            // FIXME: misses parallel
            basicStatsFactory.addEnhancer(new BasicStats.DataSizeEstimator(conf));
        }
        basicStatsFactory.addEnhancer(new BasicStats.RowNumEstimator(estimateRowSizeFromSchema(conf, schema)));
        List<BasicStats> partStats = new ArrayList<>();
        for (Partition p : partList.getNotDeniedPartns()) {
            BasicStats basicStats = basicStatsFactory.build(Partish.buildFor(table, p));
            partStats.add(basicStats);
        }
        BasicStats bbs = BasicStats.buildFrom(partStats);
        long nr = bbs.getNumRows();
        long ds = bbs.getDataSize();
        long fs = bbs.getTotalFileSize();
        List<Long> erasureCodedFiles = getBasicStatForPartitions(table, partList.getNotDeniedPartns(), StatsSetupConst.NUM_ERASURE_CODED_FILES);
        long numErasureCodedFiles = getSumIgnoreNegatives(erasureCodedFiles);
        // Never report zero rows.
        if (nr == 0) {
            nr = 1;
        }
        stats = new Statistics(nr, ds, fs, numErasureCodedFiles);
        stats.setBasicStatsState(bbs.getState());
        // NOTE(review): after the clamp above this only excludes a negative row count - confirm
        // whether getNumRows() can be negative here.
        if (nr > 0) {
            // FIXME: this promotion process should be removed later
            if (State.PARTIAL.morePreciseThan(bbs.getState())) {
                stats.setBasicStatsState(State.PARTIAL);
            }
        }
        if (needColStats) {
            List<String> partitionCols = getPartitionColumns(schema, neededColumns, referencedColumns);
            // We will retrieve stats from the metastore only for columns that are not cached
            List<ColStatistics> columnStats = new ArrayList<>();
            List<String> neededColsToRetrieve = extractColumnStates(table, neededColumns, colStatsCache, columnStats);
            List<String> partitionColsToRetrieve = extractColumnStates(table, partitionCols, colStatsCache, columnStats);
            // List of partitions
            List<String> partNames = new ArrayList<>(partList.getNotDeniedPartns().size());
            for (Partition part : partList.getNotDeniedPartns()) {
                partNames.add(part.getName());
            }
            AggrStats aggrStats = null;
            // Only ask for aggregate stats when fetching is enabled and there is at least one
            // column and one partition to ask for; otherwise skip the step to connect to the
            // metastore.
            if (fetchColStats && !neededColsToRetrieve.isEmpty() && !partNames.isEmpty()) {
                aggrStats = Hive.get().getAggrColStatsFor(table.getDbName(), table.getTableName(), neededColsToRetrieve, partNames, false);
            }
            boolean statsRetrieved = aggrStats != null && aggrStats.getColStats() != null && aggrStats.getColStatsSize() != 0;
            if (neededColumns.isEmpty() || (!neededColsToRetrieve.isEmpty() && !statsRetrieved)) {
                estimateStatsForMissingCols(neededColsToRetrieve, columnStats, table, conf, nr, schema);
                // There are some partitions with no state (or we didn't fetch any state).
                // Update the stats with empty list to reflect that in the
                // state/initialize structures.
                // add partition column stats
                addPartitionColumnStats(conf, partitionColsToRetrieve, schema, table, partList, columnStats);
                // FIXME: this add seems suspicious...10 lines below the value returned by this method used as betterDS
                stats.addToDataSize(getDataSizeFromColumnStats(nr, columnStats));
                stats.updateColumnStatsState(deriveStatType(columnStats, referencedColumns));
                stats.addToColumnStats(columnStats);
            } else {
                if (statsRetrieved) {
                    columnStats.addAll(convertColStats(aggrStats.getColStats(), table.getTableName()));
                }
                // Number of column stats expected at this point: one per needed column plus one
                // per partition column that did not have to be retrieved.
                int colStatsAvailable = neededColumns.size() + partitionCols.size() - partitionColsToRetrieve.size();
                if (columnStats.size() != colStatsAvailable) {
                    // Expected count first, obtained count second, matching the message text.
                    LOG.debug("Column stats requested for : {} columns. Able to retrieve for {} columns", colStatsAvailable, columnStats.size());
                }
                addPartitionColumnStats(conf, partitionColsToRetrieve, schema, table, partList, columnStats);
                long betterDS = getDataSizeFromColumnStats(nr, columnStats);
                stats.setDataSize((betterDS < 1 || columnStats.isEmpty()) ? ds : betterDS);
                // infer if any column can be primary key based on column statistics
                inferAndSetPrimaryKey(stats.getNumRows(), columnStats);
                stats.addToColumnStats(columnStats);
                // Infer column stats state
                stats.setColumnStatsState(deriveStatType(columnStats, referencedColumns));
                if (neededColumns.size() != neededColsToRetrieve.size() || partitionCols.size() != partitionColsToRetrieve.size()) {
                    // Include state for cached columns
                    stats.updateColumnStatsState(colStatsCache.getState());
                }
                // Change if we could not retrieve for all partitions
                if (aggrStats != null && aggrStats.getPartsFound() != partNames.size() && stats.getColumnStatsState() != State.NONE) {
                    stats.updateColumnStatsState(State.PARTIAL);
                    LOG.debug("Column stats requested for : {} partitions. Able to retrieve for {} partitions", partNames.size(), aggrStats.getPartsFound());
                }
            }
            if (partStats.isEmpty()) {
                // all partitions are filtered by partition pruning
                stats.setBasicStatsState(State.COMPLETE);
            }
            // Consistency check for the column stats cache: when the caller asked to fail on a
            // cache miss and the stats are complete, no column should have needed a lookup
            // outside the cache.
            if (colStatsCache != null && failIfCacheMiss && stats.getColumnStatsState().equals(State.COMPLETE) && (!neededColsToRetrieve.isEmpty() || !partitionColsToRetrieve.isEmpty())) {
                throw new HiveException("Cache has been loaded in logical planning phase for all columns; " + "however, stats for column some columns could not be retrieved from it " + "(see messages above)");
            }
        }
    }
    return stats;
}
Also used : Partition(org.apache.hadoop.hive.ql.metadata.Partition) HiveException(org.apache.hadoop.hive.ql.metadata.HiveException) AggrStats(org.apache.hadoop.hive.metastore.api.AggrStats) LoggerFactory(org.slf4j.LoggerFactory) Factory(org.apache.hadoop.hive.ql.stats.BasicStats.Factory) Statistics(org.apache.hadoop.hive.ql.plan.Statistics) ColStatistics(org.apache.hadoop.hive.ql.plan.ColStatistics) PrunedPartitionList(org.apache.hadoop.hive.ql.parse.PrunedPartitionList) ColumnStatsList(org.apache.hadoop.hive.ql.parse.ColumnStatsList) ArrayList(java.util.ArrayList) List(java.util.List)

Example 38 with AggrStats

use of org.apache.hadoop.hive.metastore.api.AggrStats in project hive by apache.

the class HiveMetaStoreClientWithLocalCache method getAggrStatsForInternal.

/**
 * Returns the aggregate column stats for the request, going through the local cache when it is
 * enabled and initialized and the table watermark built for the request is valid; otherwise
 * the request is delegated to the parent implementation without touching the cache.
 *
 * @param req the partition stats request
 * @return the cached stats on a hit, otherwise the stats returned by the parent implementation
 * @throws TException when the table lookup or the delegated call fails
 */
@Override
protected AggrStats getAggrStatsForInternal(PartitionsStatsRequest req) throws TException {
    if (!isCacheEnabledAndInitialized()) {
        return super.getAggrStatsForInternal(req);
    }
    TableWatermark tableWatermark = new TableWatermark(req.getValidWriteIdList(), getTable(req.getDbName(), req.getTblName()).getId());
    if (!tableWatermark.isValid()) {
        return super.getAggrStatsForInternal(req);
    }
    CacheKey key = new CacheKey(KeyType.AGGR_COL_STATS, tableWatermark, req);
    AggrStats aggrStats = (AggrStats) mscLocalCache.getIfPresent(key);
    if (aggrStats != null) {
        // Cache hit: nothing to load, only trace it.
        LOG.debug("HS2 level HMS cache: method=getAggrStatsForInternal, dbName={}, tblName={}, partNames={}", req.getDbName(), req.getTblName(), req.getPartNames());
    } else {
        // Cache miss: load through the parent implementation and remember the result.
        aggrStats = super.getAggrStatsForInternal(req);
        mscLocalCache.put(key, aggrStats);
    }
    if (LOG.isDebugEnabled() && recordStats) {
        LOG.debug(cacheObjName + ": " + mscLocalCache.stats().toString());
    }
    return aggrStats;
}
Also used : AggrStats(org.apache.hadoop.hive.metastore.api.AggrStats)

Example 39 with AggrStats

use of org.apache.hadoop.hive.metastore.api.AggrStats in project hive by apache.

the class SessionHiveMetaStoreClient method getAggrStatsForInternal.

/**
 * Returns the aggregate column stats for the request, using the query cache when one is
 * available; without a query cache the request is delegated straight to the parent
 * implementation.
 *
 * @param req the partition stats request
 * @return the cached stats on a hit, otherwise the stats returned by the parent implementation
 * @throws TException when the delegated call fails
 */
@Override
protected AggrStats getAggrStatsForInternal(PartitionsStatsRequest req) throws TException {
    Map<Object, Object> cache = getQueryCache();
    if (cache == null) {
        return super.getAggrStatsForInternal(req);
    }
    CacheKey key = new CacheKey(KeyType.AGGR_COL_STATS, req);
    AggrStats aggrStats = (AggrStats) cache.get(key);
    if (aggrStats != null) {
        // Already answered for this key: only trace the hit.
        LOG.debug("Query level HMS cache: method=getAggrStatsForInternal, dbName={}, tblName={}, partNames={}", req.getDbName(), req.getTblName(), req.getPartNames());
        return aggrStats;
    }
    // Not cached yet: load through the parent implementation and store the result.
    aggrStats = super.getAggrStatsForInternal(req);
    cache.put(key, aggrStats);
    return aggrStats;
}
Also used : AggrStats(org.apache.hadoop.hive.metastore.api.AggrStats)

Example 40 with AggrStats

use of org.apache.hadoop.hive.metastore.api.AggrStats in project hive by apache.

the class TestStats method compareStatsForPartitions.

/**
 * Verifies the column statistics of the given partitions through three client paths: the
 * per-partition column statistics call, the aggregate column statistics call, and the
 * partitions-by-names call, with and without statistics requested.
 *
 * @param catName catalog name; when it equals {@code NO_CAT} the client overloads without a
 *        catalog argument are used
 * @param dbName database name
 * @param tableName table name
 * @param partNames names of the partitions to check
 * @param colMap expected columns keyed by column name
 * @throws TException when a client call fails
 */
private void compareStatsForPartitions(String catName, String dbName, String tableName, List<String> partNames, final Map<String, Column> colMap) throws TException {
    boolean noCatalog = catName.equals(NO_CAT);
    Map<String, List<ColumnStatisticsObj>> partObjs = noCatalog ? client.getPartitionColumnStatistics(dbName, tableName, partNames, new ArrayList<>(colMap.keySet()), ENGINE) : client.getPartitionColumnStatistics(catName, dbName, tableName, partNames, new ArrayList<>(colMap.keySet()), ENGINE);
    for (int i = 0; i < partNames.size(); i++) {
        compareStatsForOneTableOrPartition(partObjs.get(partNames.get(i)), i, colMap);
    }
    AggrStats aggr = noCatalog ? client.getAggrColStatsFor(dbName, tableName, new ArrayList<>(colMap.keySet()), partNames, ENGINE) : client.getAggrColStatsFor(catName, dbName, tableName, new ArrayList<>(colMap.keySet()), partNames, ENGINE);
    Assert.assertEquals(partNames.size(), aggr.getPartsFound());
    Assert.assertEquals(colMap.size(), aggr.getColStatsSize());
    aggr.getColStats().forEach(cso -> colMap.get(cso.getColName()).compareAggr(cso));
    // Test column stats obtained through getPartitions call
    for (int i = 0; i < partNames.size(); i++) {
        String partName = partNames.get(i);
        List<Partition> partitions = noCatalog ? client.getPartitionsByNames(dbName, tableName, Collections.singletonList(partName), true, ENGINE) : client.getPartitionsByNames(catName, dbName, tableName, Collections.singletonList(partName), true, ENGINE);
        Partition partition = partitions.get(0);
        compareStatsForOneTableOrPartition(partition.getColStats().getStatsObj(), i, colMap);
        // Also test that we do not get statistics when not requested. The boolean argument is
        // presumably the "fetch column stats" flag (the call above passes true and then reads
        // getColStats()), so it must be false here for this check to cover the
        // not-requested path.
        partitions = noCatalog ? client.getPartitionsByNames(dbName, tableName, Collections.singletonList(partName), false, ENGINE) : client.getPartitionsByNames(catName, dbName, tableName, Collections.singletonList(partName), false, ENGINE);
        partition = partitions.get(0);
        Assert.assertFalse(partition.isSetColStats());
    }
}
Also used : Partition(org.apache.hadoop.hive.metastore.api.Partition) AggrStats(org.apache.hadoop.hive.metastore.api.AggrStats) ArrayList(java.util.ArrayList) List(java.util.List)

Aggregations

AggrStats (org.apache.hadoop.hive.metastore.api.AggrStats)42 ArrayList (java.util.ArrayList)37 ColumnStatisticsObj (org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj)29 FieldSchema (org.apache.hadoop.hive.metastore.api.FieldSchema)29 Partition (org.apache.hadoop.hive.metastore.api.Partition)28 Table (org.apache.hadoop.hive.metastore.api.Table)28 Test (org.junit.Test)28 SerDeInfo (org.apache.hadoop.hive.metastore.api.SerDeInfo)27 StorageDescriptor (org.apache.hadoop.hive.metastore.api.StorageDescriptor)27 ColumnStatistics (org.apache.hadoop.hive.metastore.api.ColumnStatistics)26 ColumnStatisticsData (org.apache.hadoop.hive.metastore.api.ColumnStatisticsData)26 ColumnStatisticsDesc (org.apache.hadoop.hive.metastore.api.ColumnStatisticsDesc)25 List (java.util.List)21 LongColumnStatsData (org.apache.hadoop.hive.metastore.api.LongColumnStatsData)12 BooleanColumnStatsData (org.apache.hadoop.hive.metastore.api.BooleanColumnStatsData)5 DoubleColumnStatsData (org.apache.hadoop.hive.metastore.api.DoubleColumnStatsData)5 MetaException (org.apache.hadoop.hive.metastore.api.MetaException)4 IOException (java.io.IOException)3 SQLCheckConstraint (org.apache.hadoop.hive.metastore.api.SQLCheckConstraint)3 SQLDefaultConstraint (org.apache.hadoop.hive.metastore.api.SQLDefaultConstraint)3