Search in sources :

Example 1 with Factory

use of org.apache.hadoop.hive.ql.stats.BasicStats.Factory in project hive by apache.

In the class StatsUtils, the method collectStatistics.

/**
 * Builds the {@link Statistics} object (row count, data size, total file size, number of
 * erasure coded files and, optionally, column statistics) for a table.
 *
 * <p>For a non-partitioned table the basic stats are built for the table itself. For a
 * partitioned table they are built for every partition returned by
 * {@code partList.getNotDeniedPartns()} and then aggregated. In both cases, when column
 * statistics are available for the requested columns, the data size derived from them replaces
 * the basic-stats data size.
 *
 * @param conf configuration; {@code HIVE_STATS_FETCH_COLUMN_STATS} and
 *        {@code HIVE_STATS_ESTIMATE_STATS} are read from it
 * @param partList pruned partition list; only consulted when the table is partitioned
 * @param table the table to collect statistics for
 * @param schema column schema, used for row-size estimation and column-stat lookups
 * @param neededColumns names of the columns for which column statistics are wanted
 * @param colStatsCache column statistics cache handed to the column-stat lookups
 * @param referencedColumns columns used to derive the column-stats state and the partition
 *        columns of a partitioned table
 * @param needColStats whether column statistics should be collected at all
 * @param failIfCacheMiss whether to fail when the cache is present, the column-stats state is
 *        {@code COMPLETE}, and some columns still had to be looked up outside the cache
 * @return the collected statistics, or {@code null} when the table is partitioned and
 *         {@code partList} is {@code null}
 * @throws HiveException on the cache-miss condition described for {@code failIfCacheMiss};
 *         the helpers called here may presumably raise it as well
 */
private static Statistics collectStatistics(HiveConf conf, PrunedPartitionList partList, Table table, List<ColumnInfo> schema, List<String> neededColumns, ColumnStatsList colStatsCache, List<String> referencedColumns, boolean needColStats, boolean failIfCacheMiss) throws HiveException {
    Statistics stats = null;
    boolean fetchColStats = HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_STATS_FETCH_COLUMN_STATS);
    boolean estimateStats = HiveConf.getBoolVar(conf, ConfVars.HIVE_STATS_ESTIMATE_STATS);
    if (!table.isPartitioned()) {
        // Enhancers are registered in this order: data size, row number, minimum row number.
        Factory basicStatsFactory = new BasicStats.Factory();
        if (estimateStats) {
            basicStatsFactory.addEnhancer(new BasicStats.DataSizeEstimator(conf));
        }
        basicStatsFactory.addEnhancer(new BasicStats.RowNumEstimator(estimateRowSizeFromSchema(conf, schema)));
        basicStatsFactory.addEnhancer(new BasicStats.SetMinRowNumber01());
        BasicStats basicStats = basicStatsFactory.build(Partish.buildFor(table));
        long ds = basicStats.getDataSize();
        long nr = basicStats.getNumRows();
        long fs = basicStats.getTotalFileSize();
        List<ColStatistics> colStats = Collections.emptyList();
        long numErasureCodedFiles = getErasureCodedFiles(table);
        if (needColStats) {
            colStats = getTableColumnStats(table, schema, neededColumns, colStatsCache, fetchColStats);
            if (estimateStats) {
                estimateStatsForMissingCols(neededColumns, colStats, table, conf, nr, schema);
            }
            // we should have stats for all columns (estimated or actual)
            if (neededColumns.size() == colStats.size()) {
                // Prefer the data size derived from column stats, unless it is unusable.
                long betterDS = getDataSizeFromColumnStats(nr, colStats);
                ds = (betterDS < 1 || colStats.isEmpty()) ? ds : betterDS;
            }
        }
        stats = new Statistics(nr, ds, fs, numErasureCodedFiles);
        // infer if any column can be primary key based on column statistics
        inferAndSetPrimaryKey(stats.getNumRows(), colStats);
        stats.setColumnStatsState(deriveStatType(colStats, neededColumns));
        stats.addToColumnStats(colStats);
    } else if (partList != null) {
        // For partitioned tables, get the size of all the partitions after pruning
        // the partitions that are not required
        Factory basicStatsFactory = new Factory();
        if (estimateStats) {
            // FIXME: misses parallel
            basicStatsFactory.addEnhancer(new BasicStats.DataSizeEstimator(conf));
        }
        basicStatsFactory.addEnhancer(new BasicStats.RowNumEstimator(estimateRowSizeFromSchema(conf, schema)));
        List<BasicStats> partStats = new ArrayList<>();
        for (Partition p : partList.getNotDeniedPartns()) {
            BasicStats basicStats = basicStatsFactory.build(Partish.buildFor(table, p));
            partStats.add(basicStats);
        }
        // Aggregate the per-partition basic stats into a single value set.
        BasicStats bbs = BasicStats.buildFrom(partStats);
        long nr = bbs.getNumRows();
        long ds = bbs.getDataSize();
        long fs = bbs.getTotalFileSize();
        List<Long> erasureCodedFiles = getBasicStatForPartitions(table, partList.getNotDeniedPartns(), StatsSetupConst.NUM_ERASURE_CODED_FILES);
        long numErasureCodedFiles = getSumIgnoreNegatives(erasureCodedFiles);
        // A row count of exactly zero is bumped to one.
        if (nr == 0) {
            nr = 1;
        }
        stats = new Statistics(nr, ds, fs, numErasureCodedFiles);
        stats.setBasicStatsState(bbs.getState());
        if (nr > 0) {
            // FIXME: this promotion process should be removed later
            if (State.PARTIAL.morePreciseThan(bbs.getState())) {
                stats.setBasicStatsState(State.PARTIAL);
            }
        }
        if (needColStats) {
            List<String> partitionCols = getPartitionColumns(schema, neededColumns, referencedColumns);
            // We will retrieve stats from the metastore only for columns that are not cached
            List<ColStatistics> columnStats = new ArrayList<>();
            List<String> neededColsToRetrieve = extractColumnStates(table, neededColumns, colStatsCache, columnStats);
            List<String> partitionColsToRetrieve = extractColumnStates(table, partitionCols, colStatsCache, columnStats);
            // List of partitions
            List<String> partNames = new ArrayList<>(partList.getNotDeniedPartns().size());
            for (Partition part : partList.getNotDeniedPartns()) {
                partNames.add(part.getName());
            }
            AggrStats aggrStats = null;
            // Only connect to the metastore when fetching is enabled and there is something to
            // ask for: at least one column still to retrieve and at least one partition.
            if (fetchColStats && !neededColsToRetrieve.isEmpty() && !partNames.isEmpty()) {
                aggrStats = Hive.get().getAggrColStatsFor(table.getDbName(), table.getTableName(), neededColsToRetrieve, partNames, false);
            }
            boolean statsRetrieved = aggrStats != null && aggrStats.getColStats() != null && aggrStats.getColStatsSize() != 0;
            if (neededColumns.isEmpty() || (!neededColsToRetrieve.isEmpty() && !statsRetrieved)) {
                estimateStatsForMissingCols(neededColsToRetrieve, columnStats, table, conf, nr, schema);
                // There are some partitions with no state (or we didn't fetch any state).
                // Update the stats with empty list to reflect that in the
                // state/initialize structures.
                // add partition column stats
                addPartitionColumnStats(conf, partitionColsToRetrieve, schema, table, partList, columnStats);
                // FIXME: this add seems suspicious...10 lines below the value returned by this method used as betterDS
                stats.addToDataSize(getDataSizeFromColumnStats(nr, columnStats));
                stats.updateColumnStatsState(deriveStatType(columnStats, referencedColumns));
                stats.addToColumnStats(columnStats);
            } else {
                if (statsRetrieved) {
                    columnStats.addAll(convertColStats(aggrStats.getColStats(), table.getTableName()));
                }
                // Expected number of entries at this point: every needed column plus the
                // partition columns that did not have to be retrieved.
                int colStatsAvailable = neededColumns.size() + partitionCols.size() - partitionColsToRetrieve.size();
                if (columnStats.size() != colStatsAvailable) {
                    // Expected count first, actually obtained count second, matching the message.
                    LOG.debug("Column stats requested for : {} columns. Able to retrieve for {} columns", colStatsAvailable, columnStats.size());
                }
                addPartitionColumnStats(conf, partitionColsToRetrieve, schema, table, partList, columnStats);
                // Prefer the data size derived from column stats, unless it is unusable.
                long betterDS = getDataSizeFromColumnStats(nr, columnStats);
                stats.setDataSize((betterDS < 1 || columnStats.isEmpty()) ? ds : betterDS);
                // infer if any column can be primary key based on column statistics
                inferAndSetPrimaryKey(stats.getNumRows(), columnStats);
                stats.addToColumnStats(columnStats);
                // Infer column stats state
                stats.setColumnStatsState(deriveStatType(columnStats, referencedColumns));
                if (neededColumns.size() != neededColsToRetrieve.size() || partitionCols.size() != partitionColsToRetrieve.size()) {
                    // Include state for cached columns
                    stats.updateColumnStatsState(colStatsCache.getState());
                }
                // Change if we could not retrieve for all partitions
                if (aggrStats != null && aggrStats.getPartsFound() != partNames.size() && stats.getColumnStatsState() != State.NONE) {
                    stats.updateColumnStatsState(State.PARTIAL);
                    LOG.debug("Column stats requested for : {} partitions. Able to retrieve for {} partitions", partNames.size(), aggrStats.getPartsFound());
                }
            }
            if (partStats.isEmpty()) {
                // all partitions are filtered by partition pruning
                stats.setBasicStatsState(State.COMPLETE);
            }
            // Sanity check on the cache: with a cache present and failIfCacheMiss set, complete
            // column stats must not have required any lookup outside the cache.
            if (colStatsCache != null && failIfCacheMiss && stats.getColumnStatsState().equals(State.COMPLETE) && (!neededColsToRetrieve.isEmpty() || !partitionColsToRetrieve.isEmpty())) {
                throw new HiveException("Cache has been loaded in logical planning phase for all columns; " + "however, stats for column some columns could not be retrieved from it " + "(see messages above)");
            }
        }
    }
    return stats;
}
Also used : Partition(org.apache.hadoop.hive.ql.metadata.Partition) HiveException(org.apache.hadoop.hive.ql.metadata.HiveException) AggrStats(org.apache.hadoop.hive.metastore.api.AggrStats) LoggerFactory(org.slf4j.LoggerFactory) Factory(org.apache.hadoop.hive.ql.stats.BasicStats.Factory) Statistics(org.apache.hadoop.hive.ql.plan.Statistics) ColStatistics(org.apache.hadoop.hive.ql.plan.ColStatistics) PrunedPartitionList(org.apache.hadoop.hive.ql.parse.PrunedPartitionList) ColumnStatsList(org.apache.hadoop.hive.ql.parse.ColumnStatsList) ArrayList(java.util.ArrayList) List(java.util.List)

Example 2 with Factory

use of org.apache.hadoop.hive.ql.stats.BasicStats.Factory in project hive by apache.

In the class StatsUtils, the method getNumRows.

/**
 * Returns the number of rows of a table, whether it is partitioned or not.
 *
 * <p>The stored row count is used when it exists; otherwise, when
 * {@code HIVE_STATS_ESTIMATE_STATS} is enabled, the count is estimated from the estimated data
 * size. RelOptHiveTable's getRowCount uses this.
 *
 * @param conf configuration, consulted for {@code HIVE_STATS_ESTIMATE_STATS}
 * @param schema column schema used to estimate the size of a row
 * @param table the table whose rows are counted
 * @param partitionList pruned partitions; only read when the table is partitioned
 * @param noColsMissingStats counter incremented once for every table/partition whose raw
 *        (un-enhanced) row count is not positive
 * @return the aggregated row count after {@code SetMinRowNumber01} has been applied
 */
public static long getNumRows(HiveConf conf, List<ColumnInfo> schema, Table table, PrunedPartitionList partitionList, AtomicInteger noColsMissingStats) {
    // One Partish per unit to measure: the table itself, or each non-denied partition.
    List<Partish> partishes = new ArrayList<>();
    if (!table.isPartitioned()) {
        partishes.add(Partish.buildFor(table));
    } else {
        for (Partition partition : partitionList.getNotDeniedPartns()) {
            partishes.add(Partish.buildFor(table, partition));
        }
    }

    // The estimating enhancers are only registered when estimation is switched on.
    Factory statsFactory = new BasicStats.Factory();
    boolean estimationEnabled = HiveConf.getBoolVar(conf, ConfVars.HIVE_STATS_ESTIMATE_STATS);
    if (estimationEnabled) {
        statsFactory.addEnhancer(new BasicStats.DataSizeEstimator(conf));
        statsFactory.addEnhancer(new BasicStats.RowNumEstimator(estimateRowSizeFromSchema(conf, schema)));
    }

    // Count the inputs that lack a row count, using stats built directly (no enhancers).
    for (Partish partish : partishes) {
        long rawRowCount = new BasicStats(partish).getNumRows();
        // FIXME: this point will be lost after the factory; check that it's really a warning....cleanup/etc
        if (rawRowCount <= 0) {
            // log warning if row count is missing
            noColsMissingStats.getAndIncrement();
        }
    }

    // Build the (possibly enhanced) stats for every input, then fold them into one.
    List<BasicStats> perInputStats = statsFactory.buildAll(conf, partishes);
    BasicStats combined = BasicStats.buildFrom(perInputStats);
    combined.apply(new BasicStats.SetMinRowNumber01());
    return combined.getNumRows();
}
Also used : Partition(org.apache.hadoop.hive.ql.metadata.Partition) ArrayList(java.util.ArrayList) LoggerFactory(org.slf4j.LoggerFactory) Factory(org.apache.hadoop.hive.ql.stats.BasicStats.Factory)

Aggregations

ArrayList (java.util.ArrayList)2 Partition (org.apache.hadoop.hive.ql.metadata.Partition)2 Factory (org.apache.hadoop.hive.ql.stats.BasicStats.Factory)2 LoggerFactory (org.slf4j.LoggerFactory)2 List (java.util.List)1 AggrStats (org.apache.hadoop.hive.metastore.api.AggrStats)1 HiveException (org.apache.hadoop.hive.ql.metadata.HiveException)1 ColumnStatsList (org.apache.hadoop.hive.ql.parse.ColumnStatsList)1 PrunedPartitionList (org.apache.hadoop.hive.ql.parse.PrunedPartitionList)1 ColStatistics (org.apache.hadoop.hive.ql.plan.ColStatistics)1 Statistics (org.apache.hadoop.hive.ql.plan.Statistics)1