
Example 21 with Statistics

use of org.apache.hadoop.hive.ql.plan.Statistics in project hive by apache.

the class RelOptHiveTable method updateColStats.

private void updateColStats(Set<Integer> projIndxLst, boolean allowMissingStats) {
    List<String> nonPartColNamesThatRqrStats = new ArrayList<String>();
    List<Integer> nonPartColIndxsThatRqrStats = new ArrayList<Integer>();
    List<String> partColNamesThatRqrStats = new ArrayList<String>();
    List<Integer> partColIndxsThatRqrStats = new ArrayList<Integer>();
    Set<String> colNamesFailedStats = new HashSet<String>();
    // 1. Separate required columns into non-partition and partition columns
    ColumnInfo tmp;
    for (Integer pi : projIndxLst) {
        if (hiveColStatsMap.get(pi) == null) {
            if ((tmp = hiveNonPartitionColsMap.get(pi)) != null) {
                nonPartColNamesThatRqrStats.add(tmp.getInternalName());
                nonPartColIndxsThatRqrStats.add(pi);
            } else if ((tmp = hivePartitionColsMap.get(pi)) != null) {
                partColNamesThatRqrStats.add(tmp.getInternalName());
                partColIndxsThatRqrStats.add(pi);
            } else {
                noColsMissingStats.getAndIncrement();
                String logMsg = "Unable to find Column Index: " + pi + ", in " + hiveTblMetadata.getCompleteName();
                LOG.error(logMsg);
                throw new RuntimeException(logMsg);
            }
        }
    }
    if (null == partitionList) {
        // We could be here either because it's an unpartitioned table or because
        // there are no pruning predicates on a partitioned table.
        computePartitionList(hiveConf, null, new HashSet<Integer>());
    }
    String partitionListKey = partitionList.getKey().orElse(null);
    ColumnStatsList colStatsCached = colStatsCache.get(partitionListKey);
    if (colStatsCached == null) {
        colStatsCached = new ColumnStatsList();
        colStatsCache.put(partitionListKey, colStatsCached);
    }
    // 2. Obtain Col Stats for Non Partition Cols
    if (nonPartColNamesThatRqrStats.size() > 0) {
        List<ColStatistics> hiveColStats = new ArrayList<ColStatistics>();
        if (!hiveTblMetadata.isPartitioned()) {
            // 2.1 Handle the case for unpartitioned table.
            try {
                Statistics stats = StatsUtils.collectStatistics(hiveConf, null, hiveTblMetadata, hiveNonPartitionCols, nonPartColNamesThatRqrStats, colStatsCached, nonPartColNamesThatRqrStats, true);
                rowCount = stats.getNumRows();
                for (String c : nonPartColNamesThatRqrStats) {
                    ColStatistics cs = stats.getColumnStatisticsFromColName(c);
                    if (cs != null) {
                        hiveColStats.add(cs);
                    }
                }
                colStatsCached.updateState(stats.getColumnStatsState());
                // 2.1.1 Record column names that we needed stats for but couldn't obtain
                if (hiveColStats.isEmpty()) {
                    colNamesFailedStats.addAll(nonPartColNamesThatRqrStats);
                } else if (hiveColStats.size() != nonPartColNamesThatRqrStats.size()) {
                    Set<String> setOfFailedCols = new HashSet<String>(nonPartColNamesThatRqrStats);
                    Set<String> setOfObtainedColStats = new HashSet<String>();
                    for (ColStatistics cs : hiveColStats) {
                        setOfObtainedColStats.add(cs.getColumnName());
                    }
                    setOfFailedCols.removeAll(setOfObtainedColStats);
                    colNamesFailedStats.addAll(setOfFailedCols);
                } else {
                    // Column stats in hiveColStats might not be in the same order as the columns in
                    // nonPartColNamesThatRqrStats. reorder hiveColStats so we can build hiveColStatsMap
                    // using nonPartColIndxsThatRqrStats as below
                    Map<String, ColStatistics> columnStatsMap = new HashMap<String, ColStatistics>(hiveColStats.size());
                    for (ColStatistics cs : hiveColStats) {
                        columnStatsMap.put(cs.getColumnName(), cs);
                        // estimated stats are treated as not available
                        if (cs.isEstimated()) {
                            colNamesFailedStats.add(cs.getColumnName());
                        }
                    }
                    hiveColStats.clear();
                    for (String colName : nonPartColNamesThatRqrStats) {
                        hiveColStats.add(columnStatsMap.get(colName));
                    }
                }
            } catch (HiveException e) {
                String logMsg = "Collecting stats for table: " + hiveTblMetadata.getTableName() + " failed.";
                LOG.error(logMsg, e);
                throw new RuntimeException(logMsg, e);
            }
        } else {
            // 2.2 Obtain col stats for partitioned table.
            try {
                if (partitionList.getNotDeniedPartns().isEmpty()) {
                    // no need to make a metastore call
                    rowCount = 0;
                    hiveColStats = new ArrayList<ColStatistics>();
                    for (int i = 0; i < nonPartColNamesThatRqrStats.size(); i++) {
                        // add empty stats object for each column
                        hiveColStats.add(new ColStatistics(nonPartColNamesThatRqrStats.get(i), hiveNonPartitionColsMap.get(nonPartColIndxsThatRqrStats.get(i)).getTypeName()));
                    }
                    colNamesFailedStats.clear();
                    colStatsCached.updateState(State.COMPLETE);
                } else {
                    Statistics stats = StatsUtils.collectStatistics(hiveConf, partitionList, hiveTblMetadata, hiveNonPartitionCols, nonPartColNamesThatRqrStats, colStatsCached, nonPartColNamesThatRqrStats, true);
                    rowCount = stats.getNumRows();
                    hiveColStats = new ArrayList<ColStatistics>();
                    for (String c : nonPartColNamesThatRqrStats) {
                        ColStatistics cs = stats.getColumnStatisticsFromColName(c);
                        if (cs != null) {
                            hiveColStats.add(cs);
                            if (cs.isEstimated()) {
                                colNamesFailedStats.add(c);
                            }
                        } else {
                            colNamesFailedStats.add(c);
                        }
                    }
                    colStatsCached.updateState(stats.getColumnStatsState());
                }
            } catch (HiveException e) {
                String logMsg = "Collecting stats failed.";
                LOG.error(logMsg, e);
                throw new RuntimeException(logMsg, e);
            }
        }
        if (hiveColStats != null && hiveColStats.size() == nonPartColNamesThatRqrStats.size()) {
            for (int i = 0; i < hiveColStats.size(); i++) {
                // the columns in nonPartColIndxsThatRqrStats/nonPartColNamesThatRqrStats/hiveColStats
                // are in same order
                hiveColStatsMap.put(nonPartColIndxsThatRqrStats.get(i), hiveColStats.get(i));
                colStatsCached.put(hiveColStats.get(i).getColumnName(), hiveColStats.get(i));
                if (LOG.isDebugEnabled()) {
                    LOG.debug("Stats for column " + hiveColStats.get(i).getColumnName() + " in table " + hiveTblMetadata.getTableName() + " stored in cache");
                    LOG.debug(hiveColStats.get(i).toString());
                }
            }
        }
    }
    // 3. Obtain Stats for Partition Cols
    if (colNamesFailedStats.isEmpty() && !partColNamesThatRqrStats.isEmpty()) {
        ColStatistics cStats = null;
        for (int i = 0; i < partColNamesThatRqrStats.size(); i++) {
            cStats = StatsUtils.getColStatsForPartCol(hivePartitionColsMap.get(partColIndxsThatRqrStats.get(i)), new PartitionIterable(partitionList.getNotDeniedPartns()), hiveConf);
            hiveColStatsMap.put(partColIndxsThatRqrStats.get(i), cStats);
            colStatsCached.put(cStats.getColumnName(), cStats);
            if (LOG.isDebugEnabled()) {
                LOG.debug("Stats for column " + cStats.getColumnName() + " in table " + hiveTblMetadata.getTableName() + " stored in cache");
                LOG.debug(cStats.toString());
            }
        }
    }
    // 4. Warn user if we couldn't get stats for required columns
    if (!colNamesFailedStats.isEmpty()) {
        String logMsg = "No Stats for " + hiveTblMetadata.getCompleteName() + ", Columns: " + getColNamesForLogging(colNamesFailedStats);
        noColsMissingStats.getAndAdd(colNamesFailedStats.size());
        if (allowMissingStats) {
            LOG.warn(logMsg);
            HiveConf conf = SessionState.getSessionConf();
            if (HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_CBO_SHOW_WARNINGS)) {
                LogHelper console = SessionState.getConsole();
                console.printInfo(logMsg);
            }
        } else {
            LOG.error(logMsg);
            throw new RuntimeException(logMsg);
        }
    }
}
Also used : ImmutableBitSet(org.apache.calcite.util.ImmutableBitSet) Set(java.util.Set) HashSet(java.util.HashSet) HiveException(org.apache.hadoop.hive.ql.metadata.HiveException) LogHelper(org.apache.hadoop.hive.ql.session.SessionState.LogHelper) ArrayList(java.util.ArrayList) ColumnInfo(org.apache.hadoop.hive.ql.exec.ColumnInfo) Statistics(org.apache.hadoop.hive.ql.plan.Statistics) ColStatistics(org.apache.hadoop.hive.ql.plan.ColStatistics) UniqueConstraint(org.apache.hadoop.hive.ql.metadata.UniqueConstraint) RelReferentialConstraint(org.apache.calcite.rel.RelReferentialConstraint) AtomicInteger(java.util.concurrent.atomic.AtomicInteger) PartitionIterable(org.apache.hadoop.hive.ql.metadata.PartitionIterable) ColStatistics(org.apache.hadoop.hive.ql.plan.ColStatistics) HiveConf(org.apache.hadoop.hive.conf.HiveConf) ColumnStatsList(org.apache.hadoop.hive.ql.parse.ColumnStatsList) Map(java.util.Map) ImmutableMap(com.google.common.collect.ImmutableMap) HashMap(java.util.HashMap) HashSet(java.util.HashSet)
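
The reorder-and-track-failures logic in step 2.1 above is reusable on its own. Below is a minimal standalone sketch of that pattern; it assumes hive-exec on the classpath and uses only the ColStatistics methods visible in the example (getColumnName, isEstimated). The class ColStatsReorderSketch, the method reorder, and its parameter names are illustrative, not part of Hive.

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;

import org.apache.hadoop.hive.ql.plan.ColStatistics;

public final class ColStatsReorderSketch {

    /**
     * Reorders {@code fetched} to follow {@code requestedNames}, mirroring the
     * columnStatsMap step in updateColStats; missing names map to null entries.
     */
    static List<ColStatistics> reorder(List<String> requestedNames,
                                       List<ColStatistics> fetched,
                                       Set<String> failedCols) {
        Map<String, ColStatistics> byName = new HashMap<>(fetched.size());
        for (ColStatistics cs : fetched) {
            byName.put(cs.getColumnName(), cs);
            // As in updateColStats: estimated stats count as unavailable.
            if (cs.isEstimated()) {
                failedCols.add(cs.getColumnName());
            }
        }
        List<ColStatistics> ordered = new ArrayList<>(requestedNames.size());
        for (String name : requestedNames) {
            ColStatistics cs = byName.get(name);
            if (cs == null) {
                failedCols.add(name);
            }
            ordered.add(cs);
        }
        return ordered;
    }
}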

Example 22 with Statistics

use of org.apache.hadoop.hive.ql.plan.Statistics in project hive by apache.

the class SparkMapJoinOptimizer method getMapJoinConversionInfo.

/**
 *   Returns information about converting the given join to a map-join. If the
 *   join cannot be converted (this could happen for several reasons - one of
 *   them being the presence of 2 or more big tables that cannot fit in memory),
 *   the first element of the returned array is -1.
 *
 *   Otherwise, the first element is the index of the big table in the set
 *   MapJoinProcessor.bigTableCandidateSet.
 *
 * @param joinOp the join operator considered for map-join conversion
 * @param context the Spark optimization context
 * @return an array of 3 long values: the first value is the big table position,
 *   the second value is the connected map join size, and the third is the total
 *   data size of the small-table inputs.
 */
private long[] getMapJoinConversionInfo(JoinOperator joinOp, OptimizeSparkProcContext context) {
    Set<Integer> bigTableCandidateSet = MapJoinProcessor.getBigTableCandidates(joinOp.getConf().getConds());
    long maxSize = context.getConf().getLongVar(HiveConf.ConfVars.HIVECONVERTJOINNOCONDITIONALTASKTHRESHOLD);
    int bigTablePosition = -1;
    Statistics bigInputStat = null;
    long totalSize = 0;
    int pos = 0;
    // bigTableFound means we've encountered a table that's bigger than the
    // max. This table is either the big table or we cannot convert.
    boolean bigTableFound = false;
    boolean useTsStats = context.getConf().getBoolean(HiveConf.ConfVars.SPARK_USE_TS_STATS_FOR_MAPJOIN.varname, false);
    // Check whether any parent branch must serve as the big table branch.
    // If so, mark that branch as the big table branch.
    if (useTsStats) {
        LOG.debug("Checking map join optimization for operator {} using TS stats", joinOp);
        for (Operator<? extends OperatorDesc> parentOp : joinOp.getParentOperators()) {
            if (isBigTableBranch(parentOp)) {
                if (bigTablePosition < 0 && bigTableCandidateSet.contains(pos) && !containUnionWithoutRS(parentOp.getParentOperators().get(0))) {
                    LOG.debug("Found a big table branch with parent operator {} and position {}", parentOp, pos);
                    bigTablePosition = pos;
                    bigTableFound = true;
                    bigInputStat = new Statistics(0, Long.MAX_VALUE, Long.MAX_VALUE, 0);
                } else {
                    // Either we've found multiple big table branches, or the current branch cannot
                    // be a big table branch. Disable mapjoin for these cases.
                    LOG.debug("Cannot enable map join optimization for operator {}", joinOp);
                    return new long[] { -1, 0, 0 };
                }
            }
            pos++;
        }
    }
    pos = 0;
    for (Operator<? extends OperatorDesc> parentOp : joinOp.getParentOperators()) {
        // Skip the potential big table identified above
        if (pos == bigTablePosition) {
            pos++;
            continue;
        }
        Statistics currInputStat = null;
        if (useTsStats) {
            // Not adding other stats (e.g., # of rows, col stats) since only data size is used here
            for (TableScanOperator root : OperatorUtils.findOperatorsUpstream(parentOp, TableScanOperator.class)) {
                if (currInputStat == null) {
                    currInputStat = root.getStatistics().clone();
                } else {
                    currInputStat.addBasicStats(root.getStatistics());
                }
            }
        } else {
            currInputStat = parentOp.getStatistics();
        }
        if (currInputStat == null) {
            LOG.warn("Couldn't get statistics from: " + parentOp);
            return new long[] { -1, 0, 0 };
        }
        // A union without a ReduceSink between it and the join would require
        // breaking the operator tree at the union to convert this join.
        // But this is tricky to implement, and we'll leave it as future work for now.
        if (containUnionWithoutRS(parentOp.getParentOperators().get(0))) {
            return new long[] { -1, 0, 0 };
        }
        long inputSize = currInputStat.getDataSize();
        if (bigInputStat == null || inputSize > bigInputStat.getDataSize()) {
            if (bigTableFound) {
                // A branch was already forced to be the big table based
                // on size, and this one is even bigger; cannot convert.
                return new long[] { -1, 0, 0 };
            }
            if (inputSize > maxSize) {
                if (!bigTableCandidateSet.contains(pos)) {
                    // This branch cannot be the big table, yet it is too
                    // big for the map side; cannot convert.
                    return new long[] { -1, 0, 0 };
                }
                bigTableFound = true;
            }
            if (bigInputStat != null) {
                // we're replacing the current big table with a new one. Need
                // to count the current one as a map table then.
                totalSize += bigInputStat.getDataSize();
            }
            if (totalSize > maxSize) {
                // The small-table side would exceed the threshold, hence cannot convert.
                return new long[] { -1, 0, 0 };
            }
            if (bigTableCandidateSet.contains(pos)) {
                bigTablePosition = pos;
                bigInputStat = currInputStat;
            }
        } else {
            totalSize += currInputStat.getDataSize();
            if (totalSize > maxSize) {
                // cannot hold all map tables in memory. Cannot convert.
                return new long[] { -1, 0, 0 };
            }
        }
        pos++;
    }
    if (bigTablePosition == -1) {
        // No big table candidates.
        return new long[] { -1, 0, 0 };
    }
    // Final check, find size of already-calculated Mapjoin Operators in same work (spark-stage).
    // We need to factor this in to prevent overwhelming Spark executor-memory.
    long connectedMapJoinSize = getConnectedMapJoinSize(joinOp.getParentOperators().get(bigTablePosition), joinOp, context);
    if ((connectedMapJoinSize + totalSize) > maxSize) {
        return new long[] { -1, 0, 0 };
    }
    return new long[] { bigTablePosition, connectedMapJoinSize, totalSize };
}
Also used : TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) Statistics(org.apache.hadoop.hive.ql.plan.Statistics)
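
The core of getMapJoinConversionInfo is the size accounting: treat the largest input as the big table and keep the running sum of all other inputs under the in-memory threshold. The sketch below isolates just that loop under simplifying assumptions - it ignores big-table candidate sets, the TS-stats mode, and the connected map-join check - and the names MapJoinSizingSketch and chooseBigTable are illustrative, not part of Hive.

import java.util.List;

import org.apache.hadoop.hive.ql.plan.Statistics;

public final class MapJoinSizingSketch {

    /** Returns the big-table position, or -1 if the small sides exceed maxSize. */
    static int chooseBigTable(List<Statistics> inputs, long maxSize) {
        int bigPos = -1;
        long bigSize = -1;
        long totalSmallSize = 0;
        for (int pos = 0; pos < inputs.size(); pos++) {
            long size = inputs.get(pos).getDataSize();
            if (size > bigSize) {
                // The previous big table becomes a small (hashed) side.
                if (bigPos >= 0) {
                    totalSmallSize += bigSize;
                }
                bigPos = pos;
                bigSize = size;
            } else {
                totalSmallSize += size;
            }
            if (totalSmallSize > maxSize) {
                // Small sides no longer fit in memory; cannot convert.
                return -1;
            }
        }
        return bigPos;
    }
}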

Example 23 with Statistics

use of org.apache.hadoop.hive.ql.plan.Statistics in project hive by apache.

the class TestSharedWorkOptimizer method testTSCmpOrdersByDataSizeDesc.

@Test
public void testTSCmpOrdersByDataSizeDesc() {
    TableScanOperator ts1 = getTsOp();
    TableScanOperator ts2 = getTsOp();
    TableScanOperator ts3 = getTsOp();
    ts1.setStatistics(new Statistics(100, 100, 1, 1));
    ts2.setStatistics(new Statistics(1000, 1000, 1, 1));
    ts3.setStatistics(new Statistics(10, 10, 1, 1));
    ArrayList<TableScanOperator> li1 = Lists.newArrayList(ts1, ts3, ts2);
    li1.sort(new TSComparator());
    assertTrue(li1.get(0).getStatistics().getDataSize() == 1000);
    assertTrue(li1.get(1).getStatistics().getDataSize() == 100);
    assertTrue(li1.get(2).getStatistics().getDataSize() == 10);
}
Also used : TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) TSComparator(org.apache.hadoop.hive.ql.optimizer.SharedWorkOptimizer.TSComparator) Statistics(org.apache.hadoop.hive.ql.plan.Statistics) Test(org.junit.Test)
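
The test only pins down TSComparator's observable behavior: table scans are ordered by data size, largest first. Assuming that ordering is all you need, an equivalent comparator can be built with the JDK alone (TSComparator itself may break ties differently; this is only a sketch, and TsOrderingSketch is an illustrative name):

import java.util.Comparator;

import org.apache.hadoop.hive.ql.exec.TableScanOperator;

final class TsOrderingSketch {

    // Orders table scans by Statistics.getDataSize(), largest first,
    // matching the 1000/100/10 order asserted in the test above.
    static final Comparator<TableScanOperator> BY_DATA_SIZE_DESC =
            Comparator.comparingLong(
                    (TableScanOperator ts) -> ts.getStatistics().getDataSize())
                    .reversed();
}

Sorting with li1.sort(TsOrderingSketch.BY_DATA_SIZE_DESC) would satisfy the same assertions as li1.sort(new TSComparator()) in this test.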

Aggregations

Statistics (org.apache.hadoop.hive.ql.plan.Statistics)23 ColStatistics (org.apache.hadoop.hive.ql.plan.ColStatistics)18 ArrayList (java.util.ArrayList)7 AnnotateWithStatistics (org.apache.hadoop.hive.ql.optimizer.stats.annotation.AnnotateWithStatistics)7 ReduceSinkOperator (org.apache.hadoop.hive.ql.exec.ReduceSinkOperator)5 TableScanOperator (org.apache.hadoop.hive.ql.exec.TableScanOperator)5 List (java.util.List)3 FilterOperator (org.apache.hadoop.hive.ql.exec.FilterOperator)3 ExprNodeColumnDesc (org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc)3 MapJoinDesc (org.apache.hadoop.hive.ql.plan.MapJoinDesc)3 PlanMapper (org.apache.hadoop.hive.ql.plan.mapper.PlanMapper)3 HashMap (java.util.HashMap)2 HiveConf (org.apache.hadoop.hive.conf.HiveConf)2 AggrStats (org.apache.hadoop.hive.metastore.api.AggrStats)2 ColumnInfo (org.apache.hadoop.hive.ql.exec.ColumnInfo)2 CommonJoinOperator (org.apache.hadoop.hive.ql.exec.CommonJoinOperator)2 GroupByOperator (org.apache.hadoop.hive.ql.exec.GroupByOperator)2 MapJoinOperator (org.apache.hadoop.hive.ql.exec.MapJoinOperator)2 Operator (org.apache.hadoop.hive.ql.exec.Operator)2 SelectOperator (org.apache.hadoop.hive.ql.exec.SelectOperator)2