Example 11 with PartitionIterable

Use of org.apache.hadoop.hive.ql.metadata.PartitionIterable in project hive by apache.

The example is taken from the class RelOptHiveTable, method updateColStats; a short standalone sketch of iterating a PartitionIterable follows the method body.

private void updateColStats(Set<Integer> projIndxLst, boolean allowMissingStats) {
    List<String> nonPartColNamesThatRqrStats = new ArrayList<String>();
    List<Integer> nonPartColIndxsThatRqrStats = new ArrayList<Integer>();
    List<String> partColNamesThatRqrStats = new ArrayList<String>();
    List<Integer> partColIndxsThatRqrStats = new ArrayList<Integer>();
    Set<String> colNamesFailedStats = new HashSet<String>();
    // 1. Separate required columns to Non Partition and Partition Cols
    ColumnInfo tmp;
    for (Integer pi : projIndxLst) {
        if (hiveColStatsMap.get(pi) == null) {
            if ((tmp = hiveNonPartitionColsMap.get(pi)) != null) {
                nonPartColNamesThatRqrStats.add(tmp.getInternalName());
                nonPartColIndxsThatRqrStats.add(pi);
            } else if ((tmp = hivePartitionColsMap.get(pi)) != null) {
                partColNamesThatRqrStats.add(tmp.getInternalName());
                partColIndxsThatRqrStats.add(pi);
            } else {
                noColsMissingStats.getAndIncrement();
                String logMsg = "Unable to find Column Index: " + pi + ", in " + hiveTblMetadata.getCompleteName();
                LOG.error(logMsg);
                throw new RuntimeException(logMsg);
            }
        }
    }
    if (null == partitionList) {
        // We could be here either because it's an unpartitioned table or because
        // there are no pruning predicates on a partitioned table.
        computePartitionList(hiveConf, null, new HashSet<Integer>());
    }
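    // Look up (or create) the cached column-stats entry keyed by the pruned partition list.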
    String partitionListKey = partitionList.getKey().orElse(null);
    ColumnStatsList colStatsCached = colStatsCache.get(partitionListKey);
    if (colStatsCached == null) {
        colStatsCached = new ColumnStatsList();
        colStatsCache.put(partitionListKey, colStatsCached);
    }
    // 2. Obtain Col Stats for Non Partition Cols
    if (nonPartColNamesThatRqrStats.size() > 0) {
        List<ColStatistics> hiveColStats = new ArrayList<ColStatistics>();
        if (!hiveTblMetadata.isPartitioned()) {
            // 2.1 Handle the case for unpartitioned table.
            try {
                Statistics stats = StatsUtils.collectStatistics(hiveConf, null, hiveTblMetadata, hiveNonPartitionCols, nonPartColNamesThatRqrStats, colStatsCached, nonPartColNamesThatRqrStats, true);
                rowCount = stats.getNumRows();
                for (String c : nonPartColNamesThatRqrStats) {
                    ColStatistics cs = stats.getColumnStatisticsFromColName(c);
                    if (cs != null) {
                        hiveColStats.add(cs);
                    }
                }
                colStatsCached.updateState(stats.getColumnStatsState());
                // 2.1.1 Record column names that we needed stats for but could not obtain
                if (hiveColStats.isEmpty()) {
                    colNamesFailedStats.addAll(nonPartColNamesThatRqrStats);
                } else if (hiveColStats.size() != nonPartColNamesThatRqrStats.size()) {
                    Set<String> setOfFailedCols = new HashSet<String>(nonPartColNamesThatRqrStats);
                    Set<String> setOfObtainedColStats = new HashSet<String>();
                    for (ColStatistics cs : hiveColStats) {
                        setOfObtainedColStats.add(cs.getColumnName());
                    }
                    setOfFailedCols.removeAll(setOfObtainedColStats);
                    colNamesFailedStats.addAll(setOfFiledCols);
                } else {
                    // Column stats in hiveColStats might not be in the same order as the columns in
                    // nonPartColNamesThatRqrStats. reorder hiveColStats so we can build hiveColStatsMap
                    // using nonPartColIndxsThatRqrStats as below
                    Map<String, ColStatistics> columnStatsMap = new HashMap<String, ColStatistics>(hiveColStats.size());
                    for (ColStatistics cs : hiveColStats) {
                        columnStatsMap.put(cs.getColumnName(), cs);
                        // estimated stats mean real stats were not available for this column
                        if (cs.isEstimated()) {
                            colNamesFailedStats.add(cs.getColumnName());
                        }
                    }
                    hiveColStats.clear();
                    for (String colName : nonPartColNamesThatRqrStats) {
                        hiveColStats.add(columnStatsMap.get(colName));
                    }
                }
            } catch (HiveException e) {
                String logMsg = "Collecting stats for table: " + hiveTblMetadata.getTableName() + " failed.";
                LOG.error(logMsg, e);
                throw new RuntimeException(logMsg, e);
            }
        } else {
            // 2.2 Obtain col stats for partitioned table.
            try {
                if (partitionList.getNotDeniedPartns().isEmpty()) {
                    // no need to make a metastore call
                    rowCount = 0;
                    hiveColStats = new ArrayList<ColStatistics>();
                    for (int i = 0; i < nonPartColNamesThatRqrStats.size(); i++) {
                        // add empty stats object for each column
                        hiveColStats.add(new ColStatistics(nonPartColNamesThatRqrStats.get(i), hiveNonPartitionColsMap.get(nonPartColIndxsThatRqrStats.get(i)).getTypeName()));
                    }
                    colNamesFailedStats.clear();
                    colStatsCached.updateState(State.COMPLETE);
                } else {
                    Statistics stats = StatsUtils.collectStatistics(hiveConf, partitionList, hiveTblMetadata, hiveNonPartitionCols, nonPartColNamesThatRqrStats, colStatsCached, nonPartColNamesThatRqrStats, true);
                    rowCount = stats.getNumRows();
                    hiveColStats = new ArrayList<ColStatistics>();
                    for (String c : nonPartColNamesThatRqrStats) {
                        ColStatistics cs = stats.getColumnStatisticsFromColName(c);
                        if (cs != null) {
                            hiveColStats.add(cs);
                            if (cs.isEstimated()) {
                                colNamesFailedStats.add(c);
                            }
                        } else {
                            colNamesFailedStats.add(c);
                        }
                    }
                    colStatsCached.updateState(stats.getColumnStatsState());
                }
            } catch (HiveException e) {
                String logMsg = "Collecting stats failed.";
                LOG.error(logMsg, e);
                throw new RuntimeException(logMsg, e);
            }
        }
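        // Record stats in the column-stats map and the cache only if stats were obtained for every requested non-partition column.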
        if (hiveColStats != null && hiveColStats.size() == nonPartColNamesThatRqrStats.size()) {
            for (int i = 0; i < hiveColStats.size(); i++) {
                // the columns in nonPartColIndxsThatRqrStats/nonPartColNamesThatRqrStats/hiveColStats
                // are in the same order
                hiveColStatsMap.put(nonPartColIndxsThatRqrStats.get(i), hiveColStats.get(i));
                colStatsCached.put(hiveColStats.get(i).getColumnName(), hiveColStats.get(i));
                if (LOG.isDebugEnabled()) {
                    LOG.debug("Stats for column " + hiveColStats.get(i).getColumnName() + " in table " + hiveTblMetadata.getTableName() + " stored in cache");
                    LOG.debug(hiveColStats.get(i).toString());
                }
            }
        }
    }
    // 3. Obtain Stats for Partition Cols
    if (colNamesFailedStats.isEmpty() && !partColNamesThatRqrStats.isEmpty()) {
        ColStatistics cStats = null;
        for (int i = 0; i < partColNamesThatRqrStats.size(); i++) {
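            // Partition-column stats are derived from the partition key values, which are walked lazily through a PartitionIterable.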
            cStats = StatsUtils.getColStatsForPartCol(hivePartitionColsMap.get(partColIndxsThatRqrStats.get(i)), new PartitionIterable(partitionList.getNotDeniedPartns()), hiveConf);
            hiveColStatsMap.put(partColIndxsThatRqrStats.get(i), cStats);
            colStatsCached.put(cStats.getColumnName(), cStats);
            if (LOG.isDebugEnabled()) {
                LOG.debug("Stats for column " + cStats.getColumnName() + " in table " + hiveTblMetadata.getTableName() + " stored in cache");
                LOG.debug(cStats.toString());
            }
        }
    }
    // 4. Warn the user if we could not get stats for the required columns
    if (!colNamesFailedStats.isEmpty()) {
        String logMsg = "No Stats for " + hiveTblMetadata.getCompleteName() + ", Columns: " + getColNamesForLogging(colNamesFailedStats);
        noColsMissingStats.getAndAdd(colNamesFailedStats.size());
        if (allowMissingStats) {
            LOG.warn(logMsg);
            HiveConf conf = SessionState.getSessionConf();
            if (HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_CBO_SHOW_WARNINGS)) {
                LogHelper console = SessionState.getConsole();
                console.printInfo(logMsg);
            }
        } else {
            LOG.error(logMsg);
            throw new RuntimeException(logMsg);
        }
    }
}
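
For context, PartitionIterable implements Iterable<Partition>, which is why StatsUtils.getColStatsForPartCol above can simply walk the pruned partitions handed to it. The short sketch below is not part of the Hive source; the class and method names are invented for illustration, and it assumes only the collection-backed constructor already used in updateColStats.

import java.util.List;

import org.apache.hadoop.hive.ql.metadata.Partition;
import org.apache.hadoop.hive.ql.metadata.PartitionIterable;

// Hypothetical helper class, for illustration only.
public class PartitionIterableSketch {

    // Wraps an already-materialized partition list, just like
    // new PartitionIterable(partitionList.getNotDeniedPartns()) in updateColStats above.
    static void printPartitionNames(List<Partition> prunedPartitions) {
        PartitionIterable partitions = new PartitionIterable(prunedPartitions);
        for (Partition p : partitions) {
            // PartitionIterable is Iterable<Partition>, so each element is a
            // fully materialized Hive Partition object.
            System.out.println(p.getName());
        }
    }
}

The Hive codebase also lets PartitionIterable fetch partitions from the metastore in batches instead of wrapping a pre-fetched collection; see the class itself for that constructor's exact signature.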
Also used:
ImmutableBitSet (org.apache.calcite.util.ImmutableBitSet)
Set (java.util.Set)
HashSet (java.util.HashSet)
HiveException (org.apache.hadoop.hive.ql.metadata.HiveException)
LogHelper (org.apache.hadoop.hive.ql.session.SessionState.LogHelper)
ArrayList (java.util.ArrayList)
ColumnInfo (org.apache.hadoop.hive.ql.exec.ColumnInfo)
Statistics (org.apache.hadoop.hive.ql.plan.Statistics)
ColStatistics (org.apache.hadoop.hive.ql.plan.ColStatistics)
UniqueConstraint (org.apache.hadoop.hive.ql.metadata.UniqueConstraint)
RelReferentialConstraint (org.apache.calcite.rel.RelReferentialConstraint)
AtomicInteger (java.util.concurrent.atomic.AtomicInteger)
PartitionIterable (org.apache.hadoop.hive.ql.metadata.PartitionIterable)
HiveConf (org.apache.hadoop.hive.conf.HiveConf)
ColumnStatsList (org.apache.hadoop.hive.ql.parse.ColumnStatsList)
Map (java.util.Map)
ImmutableMap (com.google.common.collect.ImmutableMap)
HashMap (java.util.HashMap)

Aggregations

PartitionIterable (org.apache.hadoop.hive.ql.metadata.PartitionIterable): 11
Partition (org.apache.hadoop.hive.ql.metadata.Partition): 9
HiveException (org.apache.hadoop.hive.ql.metadata.HiveException): 8
ArrayList (java.util.ArrayList): 4
FileNotFoundException (java.io.FileNotFoundException): 3
IOException (java.io.IOException): 3
HashMap (java.util.HashMap): 3
Path (org.apache.hadoop.fs.Path): 3
UniqueConstraint (org.apache.hadoop.hive.ql.metadata.UniqueConstraint): 3
AlterTableExchangePartition (org.apache.hadoop.hive.ql.plan.AlterTableExchangePartition): 3
SQLException (java.sql.SQLException): 2
ColumnStatisticsData (org.apache.hadoop.hive.metastore.api.ColumnStatisticsData): 2
ColumnStatisticsObj (org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj): 2
FieldSchema (org.apache.hadoop.hive.metastore.api.FieldSchema): 2
NoSuchObjectException (org.apache.hadoop.hive.metastore.api.NoSuchObjectException): 2
SQLCheckConstraint (org.apache.hadoop.hive.metastore.api.SQLCheckConstraint): 2
SQLDefaultConstraint (org.apache.hadoop.hive.metastore.api.SQLDefaultConstraint): 2
SQLNotNullConstraint (org.apache.hadoop.hive.metastore.api.SQLNotNullConstraint): 2
SQLUniqueConstraint (org.apache.hadoop.hive.metastore.api.SQLUniqueConstraint): 2
ColumnInfo (org.apache.hadoop.hive.ql.exec.ColumnInfo): 2