Search in sources :

Example 1 with ColumnStatsList

Use of org.apache.hadoop.hive.ql.parse.ColumnStatsList in the Apache Hive project.

In the class StatsUtils, the method collectStatistics.

/**
 * Collects basic (row count, data size, file size) and, optionally, column statistics
 * for a table or for the pruned partitions of a partitioned table.
 *
 * <p>For a non-partitioned table the basic stats are built from the table itself. For a
 * partitioned table they are built per partition in {@code partList} and then combined.
 * When {@code needColStats} is set, column statistics are taken from {@code colStatsCache}
 * where possible, fetched from the metastore for the remaining columns (if
 * {@code HIVE_STATS_FETCH_COLUMN_STATS} is enabled), and otherwise estimated.
 *
 * @param conf configuration; read for {@code HIVE_STATS_FETCH_COLUMN_STATS} and
 *        {@code HIVE_STATS_ESTIMATE_STATS}
 * @param partList pruned partition list; only consulted when the table is partitioned
 * @param table table whose statistics are collected
 * @param schema column schema, used for row-size estimation and column lookups
 * @param neededColumns names of the columns for which column statistics are wanted
 * @param colStatsCache cache of already known column statistics
 * @param referencedColumns columns used to derive the column-stats state for partitioned tables
 * @param needColStats whether column statistics should be collected at all
 * @param failIfCacheMiss if {@code true}, a cache miss on a supposedly complete cache raises
 *        an exception (partitioned tables only)
 * @return the collected statistics, or {@code null} when the table is partitioned and
 *         {@code partList} is {@code null}
 * @throws HiveException if {@code failIfCacheMiss} is set, a cache was supplied, the resulting
 *         column-stats state is {@code COMPLETE}, and some column still had to be retrieved
 *         outside the cache; may also be propagated from the helpers called here
 */
private static Statistics collectStatistics(HiveConf conf, PrunedPartitionList partList, Table table, List<ColumnInfo> schema, List<String> neededColumns, ColumnStatsList colStatsCache, List<String> referencedColumns, boolean needColStats, boolean failIfCacheMiss) throws HiveException {
    Statistics stats = null;
    boolean fetchColStats = HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_STATS_FETCH_COLUMN_STATS);
    boolean estimateStats = HiveConf.getBoolVar(conf, ConfVars.HIVE_STATS_ESTIMATE_STATS);
    if (!table.isPartitioned()) {
        // Basic stats for an unpartitioned table: configure the enhancers, then build once.
        Factory basicStatsFactory = new BasicStats.Factory();
        if (estimateStats) {
            basicStatsFactory.addEnhancer(new BasicStats.DataSizeEstimator(conf));
        }
        basicStatsFactory.addEnhancer(new BasicStats.RowNumEstimator(estimateRowSizeFromSchema(conf, schema)));
        basicStatsFactory.addEnhancer(new BasicStats.SetMinRowNumber01());
        BasicStats basicStats = basicStatsFactory.build(Partish.buildFor(table));
        long ds = basicStats.getDataSize();
        long nr = basicStats.getNumRows();
        long fs = basicStats.getTotalFileSize();
        List<ColStatistics> colStats = Collections.emptyList();
        long numErasureCodedFiles = getErasureCodedFiles(table);
        if (needColStats) {
            colStats = getTableColumnStats(table, schema, neededColumns, colStatsCache, fetchColStats);
            if (estimateStats) {
                estimateStatsForMissingCols(neededColumns, colStats, table, conf, nr, schema);
            }
            // we should have stats for all columns (estimated or actual)
            if (neededColumns.size() == colStats.size()) {
                // Prefer the data size derived from column stats when it is usable.
                long betterDS = getDataSizeFromColumnStats(nr, colStats);
                ds = (betterDS < 1 || colStats.isEmpty()) ? ds : betterDS;
            }
        }
        stats = new Statistics(nr, ds, fs, numErasureCodedFiles);
        // infer if any column can be primary key based on column statistics
        inferAndSetPrimaryKey(stats.getNumRows(), colStats);
        stats.setColumnStatsState(deriveStatType(colStats, neededColumns));
        stats.addToColumnStats(colStats);
    } else if (partList != null) {
        // For partitioned tables, get the size of all the partitions after pruning
        // the partitions that are not required
        Factory basicStatsFactory = new Factory();
        if (estimateStats) {
            // FIXME: misses parallel
            basicStatsFactory.addEnhancer(new BasicStats.DataSizeEstimator(conf));
        }
        basicStatsFactory.addEnhancer(new BasicStats.RowNumEstimator(estimateRowSizeFromSchema(conf, schema)));
        List<BasicStats> partStats = new ArrayList<>();
        for (Partition p : partList.getNotDeniedPartns()) {
            BasicStats basicStats = basicStatsFactory.build(Partish.buildFor(table, p));
            partStats.add(basicStats);
        }
        // Combine the per-partition basic stats into a single aggregate.
        BasicStats bbs = BasicStats.buildFrom(partStats);
        long nr = bbs.getNumRows();
        long ds = bbs.getDataSize();
        long fs = bbs.getTotalFileSize();
        List<Long> erasureCodedFiles = getBasicStatForPartitions(table, partList.getNotDeniedPartns(), StatsSetupConst.NUM_ERASURE_CODED_FILES);
        long numErasureCodedFiles = getSumIgnoreNegatives(erasureCodedFiles);
        // A row count of exactly zero is replaced by one before the Statistics object is built.
        if (nr == 0) {
            nr = 1;
        }
        stats = new Statistics(nr, ds, fs, numErasureCodedFiles);
        stats.setBasicStatsState(bbs.getState());
        if (nr > 0) {
            // FIXME: this promotion process should be removed later
            if (State.PARTIAL.morePreciseThan(bbs.getState())) {
                stats.setBasicStatsState(State.PARTIAL);
            }
        }
        if (needColStats) {
            List<String> partitionCols = getPartitionColumns(schema, neededColumns, referencedColumns);
            // We will retrieve stats from the metastore only for columns that are not cached;
            // cached entries are appended to columnStats by extractColumnStates.
            List<ColStatistics> columnStats = new ArrayList<>();
            List<String> neededColsToRetrieve = extractColumnStates(table, neededColumns, colStatsCache, columnStats);
            List<String> partitionColsToRetrieve = extractColumnStates(table, partitionCols, colStatsCache, columnStats);
            // List of partitions
            List<String> partNames = new ArrayList<>(partList.getNotDeniedPartns().size());
            for (Partition part : partList.getNotDeniedPartns()) {
                partNames.add(part.getName());
            }
            AggrStats aggrStats = null;
            // Only contact the metastore when fetching is enabled and there is at least one
            // column and one partition to ask about; otherwise skip the call entirely.
            if (fetchColStats && !neededColsToRetrieve.isEmpty() && !partNames.isEmpty()) {
                aggrStats = Hive.get().getAggrColStatsFor(table.getDbName(), table.getTableName(), neededColsToRetrieve, partNames, false);
            }
            boolean statsRetrieved = aggrStats != null && aggrStats.getColStats() != null && aggrStats.getColStatsSize() != 0;
            if (neededColumns.isEmpty() || (!neededColsToRetrieve.isEmpty() && !statsRetrieved)) {
                estimateStatsForMissingCols(neededColsToRetrieve, columnStats, table, conf, nr, schema);
                // There are some partitions with no state (or we didn't fetch any state).
                // Update the stats with empty list to reflect that in the
                // state/initialize structures.
                // add partition column stats
                addPartitionColumnStats(conf, partitionColsToRetrieve, schema, table, partList, columnStats);
                // FIXME: this add seems suspicious...10 lines below the value returned by this method used as betterDS
                stats.addToDataSize(getDataSizeFromColumnStats(nr, columnStats));
                stats.updateColumnStatsState(deriveStatType(columnStats, referencedColumns));
                stats.addToColumnStats(columnStats);
            } else {
                if (statsRetrieved) {
                    columnStats.addAll(convertColStats(aggrStats.getColStats(), table.getTableName()));
                }
                // Number of column stats expected at this point: every needed column plus the
                // partition columns that were served from the cache.
                int colStatsAvailable = neededColumns.size() + partitionCols.size() - partitionColsToRetrieve.size();
                if (columnStats.size() != colStatsAvailable) {
                    // Expected count first, actually obtained count second, matching the message
                    // (and the ordering used by the partition-level message below).
                    LOG.debug("Column stats requested for : {} columns. Able to retrieve for {} columns", colStatsAvailable, columnStats.size());
                }
                addPartitionColumnStats(conf, partitionColsToRetrieve, schema, table, partList, columnStats);
                long betterDS = getDataSizeFromColumnStats(nr, columnStats);
                stats.setDataSize((betterDS < 1 || columnStats.isEmpty()) ? ds : betterDS);
                // infer if any column can be primary key based on column statistics
                inferAndSetPrimaryKey(stats.getNumRows(), columnStats);
                stats.addToColumnStats(columnStats);
                // Infer column stats state
                stats.setColumnStatsState(deriveStatType(columnStats, referencedColumns));
                if (neededColumns.size() != neededColsToRetrieve.size() || partitionCols.size() != partitionColsToRetrieve.size()) {
                    // Include state for cached columns
                    stats.updateColumnStatsState(colStatsCache.getState());
                }
                // Change if we could not retrieve for all partitions
                if (aggrStats != null && aggrStats.getPartsFound() != partNames.size() && stats.getColumnStatsState() != State.NONE) {
                    stats.updateColumnStatsState(State.PARTIAL);
                    LOG.debug("Column stats requested for : {} partitions. Able to retrieve for {} partitions", partNames.size(), aggrStats.getPartsFound());
                }
            }
            if (partStats.isEmpty()) {
                // all partitions are filtered by partition pruning
                stats.setBasicStatsState(State.COMPLETE);
            }
            // With a cache supplied and failIfCacheMiss set, a COMPLETE state together with
            // columns that still had to be looked up outside the cache is reported as an error.
            if (colStatsCache != null && failIfCacheMiss && stats.getColumnStatsState().equals(State.COMPLETE) && (!neededColsToRetrieve.isEmpty() || !partitionColsToRetrieve.isEmpty())) {
                throw new HiveException("Cache has been loaded in logical planning phase for all columns; " + "however, stats for some columns could not be retrieved from it " + "(see messages above)");
            }
        }
    }
    return stats;
}
Also used : Partition(org.apache.hadoop.hive.ql.metadata.Partition) HiveException(org.apache.hadoop.hive.ql.metadata.HiveException) AggrStats(org.apache.hadoop.hive.metastore.api.AggrStats) LoggerFactory(org.slf4j.LoggerFactory) Factory(org.apache.hadoop.hive.ql.stats.BasicStats.Factory) Statistics(org.apache.hadoop.hive.ql.plan.Statistics) ColStatistics(org.apache.hadoop.hive.ql.plan.ColStatistics) ColStatistics(org.apache.hadoop.hive.ql.plan.ColStatistics) PrunedPartitionList(org.apache.hadoop.hive.ql.parse.PrunedPartitionList) ColumnStatsList(org.apache.hadoop.hive.ql.parse.ColumnStatsList) ArrayList(java.util.ArrayList) List(java.util.List)

Example 2 with ColumnStatsList

Use of org.apache.hadoop.hive.ql.parse.ColumnStatsList in the Apache Hive project.

In the class RelOptHiveTable, the method updateColStats.

/**
 * Populates {@code hiveColStatsMap} (and the per-partition-list stats cache) with column
 * statistics for every projected column index that does not have an entry yet.
 *
 * <p>Steps: split the missing indexes into non-partition and partition columns, make sure a
 * partition list exists, collect stats for the non-partition columns through
 * {@code StatsUtils.collectStatistics}, compute stats for partition columns (only if nothing
 * failed before), and finally report columns whose stats are missing or only estimated.
 *
 * @param projIndxLst indexes of the projected columns that need statistics
 * @param allowMissingStats when {@code true}, missing stats only produce a warning; when
 *        {@code false}, they cause a {@link RuntimeException}
 * @throws RuntimeException if a column index is unknown, if stats collection throws a
 *         {@code HiveException}, or if stats are missing and {@code allowMissingStats} is false
 */
private void updateColStats(Set<Integer> projIndxLst, boolean allowMissingStats) {
    List<String> nonPartNames = new ArrayList<>();
    List<Integer> nonPartIdxs = new ArrayList<>();
    List<String> partNames = new ArrayList<>();
    List<Integer> partIdxs = new ArrayList<>();
    Set<String> failedCols = new HashSet<>();

    // 1. Separate required columns to Non Partition and Partition Cols
    for (Integer pi : projIndxLst) {
        if (hiveColStatsMap.get(pi) != null) {
            // stats for this index are already known
            continue;
        }
        ColumnInfo nonPartCol = hiveNonPartitionColsMap.get(pi);
        if (nonPartCol != null) {
            nonPartNames.add(nonPartCol.getInternalName());
            nonPartIdxs.add(pi);
            continue;
        }
        ColumnInfo partCol = hivePartitionColsMap.get(pi);
        if (partCol != null) {
            partNames.add(partCol.getInternalName());
            partIdxs.add(pi);
            continue;
        }
        // The index belongs to neither map: count it and fail.
        noColsMissingStats.getAndIncrement();
        String logMsg = "Unable to find Column Index: " + pi + ", in " + hiveTblMetadata.getCompleteName();
        LOG.error(logMsg);
        throw new RuntimeException(logMsg);
    }

    if (null == partitionList) {
        // Either the table is unpartitioned or a partitioned table has no pruning
        // predicates; in both cases the partition list still has to be computed.
        computePartitionList(hiveConf, null, new HashSet<Integer>());
    }

    // Look up (or create and register) the stats cache entry for this partition list.
    String partitionListKey = partitionList.getKey().orElse(null);
    ColumnStatsList colStatsCached = colStatsCache.get(partitionListKey);
    if (colStatsCached == null) {
        colStatsCached = new ColumnStatsList();
        colStatsCache.put(partitionListKey, colStatsCached);
    }

    // 2. Obtain Col Stats for Non Partition Cols
    if (!nonPartNames.isEmpty()) {
        List<ColStatistics> hiveColStats = new ArrayList<>();
        if (hiveTblMetadata.isPartitioned()) {
            // 2.2 Obtain col stats for partitioned table.
            try {
                if (partitionList.getNotDeniedPartns().isEmpty()) {
                    // no need to make a metastore call
                    rowCount = 0;
                    hiveColStats = new ArrayList<>();
                    int needed = 0;
                    while (needed < nonPartNames.size()) {
                        // add empty stats object for each column
                        String typeName = hiveNonPartitionColsMap.get(nonPartIdxs.get(needed)).getTypeName();
                        hiveColStats.add(new ColStatistics(nonPartNames.get(needed), typeName));
                        needed++;
                    }
                    failedCols.clear();
                    colStatsCached.updateState(State.COMPLETE);
                } else {
                    Statistics stats = StatsUtils.collectStatistics(hiveConf, partitionList, hiveTblMetadata, hiveNonPartitionCols, nonPartNames, colStatsCached, nonPartNames, true);
                    rowCount = stats.getNumRows();
                    hiveColStats = new ArrayList<>();
                    for (String colName : nonPartNames) {
                        ColStatistics found = stats.getColumnStatisticsFromColName(colName);
                        if (found == null) {
                            failedCols.add(colName);
                            continue;
                        }
                        hiveColStats.add(found);
                        // estimated stats still count as "no real stats available"
                        if (found.isEstimated()) {
                            failedCols.add(colName);
                        }
                    }
                    colStatsCached.updateState(stats.getColumnStatsState());
                }
            } catch (HiveException e) {
                String logMsg = "Collecting stats failed.";
                LOG.error(logMsg, e);
                throw new RuntimeException(logMsg, e);
            }
        } else {
            // 2.1 Handle the case for unpartitioned table.
            try {
                Statistics stats = StatsUtils.collectStatistics(hiveConf, null, hiveTblMetadata, hiveNonPartitionCols, nonPartNames, colStatsCached, nonPartNames, true);
                rowCount = stats.getNumRows();
                for (String colName : nonPartNames) {
                    ColStatistics found = stats.getColumnStatisticsFromColName(colName);
                    if (found != null) {
                        hiveColStats.add(found);
                    }
                }
                colStatsCached.updateState(stats.getColumnStatsState());
                // 2.1.1 Record Column Names that we needed stats for but couldn't
                if (hiveColStats.isEmpty()) {
                    failedCols.addAll(nonPartNames);
                } else if (hiveColStats.size() != nonPartNames.size()) {
                    // Requested names minus the names we actually got stats for.
                    Set<String> obtained = new HashSet<>();
                    for (ColStatistics cs : hiveColStats) {
                        obtained.add(cs.getColumnName());
                    }
                    Set<String> missing = new HashSet<>(nonPartNames);
                    missing.removeAll(obtained);
                    failedCols.addAll(missing);
                } else {
                    // The returned stats may be ordered differently from nonPartNames.
                    // Re-order them by name so that position i lines up with nonPartIdxs.get(i)
                    // when hiveColStatsMap is filled below.
                    Map<String, ColStatistics> byName = new HashMap<>(hiveColStats.size());
                    for (ColStatistics cs : hiveColStats) {
                        byName.put(cs.getColumnName(), cs);
                        // estimated means real stats are not available
                        if (cs.isEstimated()) {
                            failedCols.add(cs.getColumnName());
                        }
                    }
                    hiveColStats.clear();
                    for (String colName : nonPartNames) {
                        hiveColStats.add(byName.get(colName));
                    }
                }
            } catch (HiveException e) {
                String logMsg = "Collecting stats for table: " + hiveTblMetadata.getTableName() + " failed.";
                LOG.error(logMsg, e);
                throw new RuntimeException(logMsg, e);
            }
        }

        if (hiveColStats != null && hiveColStats.size() == nonPartNames.size()) {
            // nonPartIdxs / nonPartNames / hiveColStats are position-aligned here
            int pos = 0;
            for (ColStatistics cs : hiveColStats) {
                hiveColStatsMap.put(nonPartIdxs.get(pos), cs);
                colStatsCached.put(cs.getColumnName(), cs);
                if (LOG.isDebugEnabled()) {
                    LOG.debug("Stats for column " + cs.getColumnName() + " in table " + hiveTblMetadata.getTableName() + " stored in cache");
                    LOG.debug(cs.toString());
                }
                pos++;
            }
        }
    }

    // 3. Obtain Stats for Partition Cols
    if (failedCols.isEmpty() && !partNames.isEmpty()) {
        // partIdxs and partNames are filled together, so iterating the indexes visits
        // exactly one entry per required partition column.
        for (Integer partIdx : partIdxs) {
            ColStatistics partColStats = StatsUtils.getColStatsForPartCol(hivePartitionColsMap.get(partIdx), new PartitionIterable(partitionList.getNotDeniedPartns()), hiveConf);
            hiveColStatsMap.put(partIdx, partColStats);
            colStatsCached.put(partColStats.getColumnName(), partColStats);
            if (LOG.isDebugEnabled()) {
                LOG.debug("Stats for column " + partColStats.getColumnName() + " in table " + hiveTblMetadata.getTableName() + " stored in cache");
                LOG.debug(partColStats.toString());
            }
        }
    }

    // 4. Warn user if we could not get stats for required columns
    if (failedCols.isEmpty()) {
        return;
    }
    String logMsg = "No Stats for " + hiveTblMetadata.getCompleteName() + ", Columns: " + getColNamesForLogging(failedCols);
    noColsMissingStats.getAndAdd(failedCols.size());
    if (!allowMissingStats) {
        LOG.error(logMsg);
        throw new RuntimeException(logMsg);
    }
    LOG.warn(logMsg);
    HiveConf conf = SessionState.getSessionConf();
    if (HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_CBO_SHOW_WARNINGS)) {
        LogHelper console = SessionState.getConsole();
        console.printInfo(logMsg);
    }
}
Also used : ImmutableBitSet(org.apache.calcite.util.ImmutableBitSet) Set(java.util.Set) HashSet(java.util.HashSet) HiveException(org.apache.hadoop.hive.ql.metadata.HiveException) LogHelper(org.apache.hadoop.hive.ql.session.SessionState.LogHelper) ArrayList(java.util.ArrayList) ColumnInfo(org.apache.hadoop.hive.ql.exec.ColumnInfo) Statistics(org.apache.hadoop.hive.ql.plan.Statistics) ColStatistics(org.apache.hadoop.hive.ql.plan.ColStatistics) UniqueConstraint(org.apache.hadoop.hive.ql.metadata.UniqueConstraint) RelReferentialConstraint(org.apache.calcite.rel.RelReferentialConstraint) AtomicInteger(java.util.concurrent.atomic.AtomicInteger) PartitionIterable(org.apache.hadoop.hive.ql.metadata.PartitionIterable) ColStatistics(org.apache.hadoop.hive.ql.plan.ColStatistics) HiveConf(org.apache.hadoop.hive.conf.HiveConf) ColumnStatsList(org.apache.hadoop.hive.ql.parse.ColumnStatsList) Map(java.util.Map) ImmutableMap(com.google.common.collect.ImmutableMap) HashMap(java.util.HashMap) HashSet(java.util.HashSet)

Aggregations

ArrayList (java.util.ArrayList)2 HiveException (org.apache.hadoop.hive.ql.metadata.HiveException)2 ColumnStatsList (org.apache.hadoop.hive.ql.parse.ColumnStatsList)2 ColStatistics (org.apache.hadoop.hive.ql.plan.ColStatistics)2 Statistics (org.apache.hadoop.hive.ql.plan.Statistics)2 ImmutableMap (com.google.common.collect.ImmutableMap)1 HashMap (java.util.HashMap)1 HashSet (java.util.HashSet)1 List (java.util.List)1 Map (java.util.Map)1 Set (java.util.Set)1 AtomicInteger (java.util.concurrent.atomic.AtomicInteger)1 RelReferentialConstraint (org.apache.calcite.rel.RelReferentialConstraint)1 ImmutableBitSet (org.apache.calcite.util.ImmutableBitSet)1 HiveConf (org.apache.hadoop.hive.conf.HiveConf)1 AggrStats (org.apache.hadoop.hive.metastore.api.AggrStats)1 ColumnInfo (org.apache.hadoop.hive.ql.exec.ColumnInfo)1 Partition (org.apache.hadoop.hive.ql.metadata.Partition)1 PartitionIterable (org.apache.hadoop.hive.ql.metadata.PartitionIterable)1 UniqueConstraint (org.apache.hadoop.hive.ql.metadata.UniqueConstraint)1