
Example 11 with Statistics

use of org.apache.hadoop.hive.ql.plan.Statistics in project hive by apache.

the class StatsUtils method collectStatistics.

private static Statistics collectStatistics(HiveConf conf, PrunedPartitionList partList, Table table, List<ColumnInfo> schema, List<String> neededColumns, ColumnStatsList colStatsCache, List<String> referencedColumns, boolean fetchColStats, boolean failIfCacheMiss) throws HiveException {
    Statistics stats = null;
    float deserFactor = HiveConf.getFloatVar(conf, HiveConf.ConfVars.HIVE_STATS_DESERIALIZATION_FACTOR);
    boolean shouldEstimateStats = HiveConf.getBoolVar(conf, ConfVars.HIVE_STATS_ESTIMATE_STATS);
    if (!table.isPartitioned()) {
        // getDataSize tries to estimate stats from the file size if they don't exist;
        // we would like to avoid file system calls if that is too expensive
        long ds = shouldEstimateStats ? getDataSize(conf, table) : getRawDataSize(table);
        long nr = getNumRows(conf, schema, table, ds);
        List<ColStatistics> colStats = Lists.newArrayList();
        if (fetchColStats) {
            colStats = getTableColumnStats(table, schema, neededColumns, colStatsCache);
            if (colStats == null) {
                colStats = Lists.newArrayList();
            }
            estimateStatsForMissingCols(neededColumns, colStats, table, conf, nr, schema);
            // we should have stats for all columns (estimated or actual)
            assert (neededColumns.size() == colStats.size());
            long betterDS = getDataSizeFromColumnStats(nr, colStats);
            ds = (betterDS < 1 || colStats.isEmpty()) ? ds : betterDS;
        }
        stats = new Statistics(nr, ds);
        // infer if any column can be primary key based on column statistics
        inferAndSetPrimaryKey(stats.getNumRows(), colStats);
        stats.setColumnStatsState(deriveStatType(colStats, neededColumns));
        stats.addToColumnStats(colStats);
    } else if (partList != null) {
        // For partitioned tables, get the size of all the partitions after pruning
        // the partitions that are not required
        long nr = 0;
        long ds = 0;
        List<Long> rowCounts = Lists.newArrayList();
        List<Long> dataSizes = Lists.newArrayList();
        rowCounts = getBasicStatForPartitions(table, partList.getNotDeniedPartns(), StatsSetupConst.ROW_COUNT);
        dataSizes = getBasicStatForPartitions(table, partList.getNotDeniedPartns(), StatsSetupConst.RAW_DATA_SIZE);
        nr = getSumIgnoreNegatives(rowCounts);
        ds = getSumIgnoreNegatives(dataSizes);
        if (ds <= 0) {
            dataSizes = getBasicStatForPartitions(table, partList.getNotDeniedPartns(), StatsSetupConst.TOTAL_SIZE);
            dataSizes = safeMult(dataSizes, deserFactor);
            ds = getSumIgnoreNegatives(dataSizes);
        }
        // if the data size is still unknown, fall back to the file system to get the file sizes
        if (ds <= 0 && shouldEstimateStats) {
            dataSizes = getFileSizeForPartitions(conf, partList.getNotDeniedPartns());
            dataSizes = safeMult(dataSizes, deserFactor);
            ds = getSumIgnoreNegatives(dataSizes);
        }
        int avgRowSize = estimateRowSizeFromSchema(conf, schema);
        if (avgRowSize > 0) {
            setUnknownRcDsToAverage(rowCounts, dataSizes, avgRowSize);
            nr = getSumIgnoreNegatives(rowCounts);
            ds = getSumIgnoreNegatives(dataSizes);
            // a row count of -1 means that the statistics from the metastore are not reliable
            if (nr <= 0) {
                nr = ds / avgRowSize;
            }
        }
        // Minimum values
        if (nr == 0) {
            nr = 1;
        }
        stats = new Statistics(nr, ds);
        // if at least one partition does not contain a row count, mark the basic stats state as PARTIAL
        if (containsNonPositives(rowCounts) && stats.getBasicStatsState().equals(State.COMPLETE)) {
            stats.setBasicStatsState(State.PARTIAL);
        }
        if (fetchColStats) {
            List<String> partitionCols = getPartitionColumns(schema, neededColumns, referencedColumns);
            // We will retrieve stats from the metastore only for columns that are not cached
            List<String> neededColsToRetrieve;
            List<String> partitionColsToRetrieve;
            List<ColStatistics> columnStats = new ArrayList<>();
            if (colStatsCache != null) {
                neededColsToRetrieve = new ArrayList<String>(neededColumns.size());
                for (String colName : neededColumns) {
                    ColStatistics colStats = colStatsCache.getColStats().get(colName);
                    if (colStats == null) {
                        neededColsToRetrieve.add(colName);
                        if (LOG.isDebugEnabled()) {
                            LOG.debug("Stats for column " + colName + " in table " + table.getCompleteName() + " could not be retrieved from cache");
                        }
                    } else {
                        columnStats.add(colStats);
                        if (LOG.isDebugEnabled()) {
                            LOG.debug("Stats for column " + colName + " in table " + table.getCompleteName() + " retrieved from cache");
                        }
                    }
                }
                partitionColsToRetrieve = new ArrayList<>(partitionCols.size());
                for (String colName : partitionCols) {
                    ColStatistics colStats = colStatsCache.getColStats().get(colName);
                    if (colStats == null) {
                        partitionColsToRetrieve.add(colName);
                        if (LOG.isDebugEnabled()) {
                            LOG.debug("Stats for column " + colName + " in table " + table.getCompleteName() + " could not be retrieved from cache");
                        }
                    } else {
                        columnStats.add(colStats);
                        if (LOG.isDebugEnabled()) {
                            LOG.debug("Stats for column " + colName + " in table " + table.getCompleteName() + " retrieved from cache");
                        }
                    }
                }
            } else {
                neededColsToRetrieve = neededColumns;
                partitionColsToRetrieve = partitionCols;
            }
            // List of partitions
            List<String> partNames = new ArrayList<String>(partList.getNotDeniedPartns().size());
            for (Partition part : partList.getNotDeniedPartns()) {
                partNames.add(part.getName());
            }
            AggrStats aggrStats = null;
            // If there are no columns to retrieve or no partitions left after pruning, skip the call to the metastore.
            if (neededColsToRetrieve.size() > 0 && partNames.size() > 0) {
                aggrStats = Hive.get().getAggrColStatsFor(table.getDbName(), table.getTableName(), neededColsToRetrieve, partNames);
            }
            boolean statsRetrieved = aggrStats != null && aggrStats.getColStats() != null && aggrStats.getColStatsSize() != 0;
            if (neededColumns.size() == 0 || (neededColsToRetrieve.size() > 0 && !statsRetrieved)) {
                estimateStatsForMissingCols(neededColsToRetrieve, columnStats, table, conf, nr, schema);
                // There are some partitions with no state (or we didn't fetch any state).
                // Update the stats with an empty list to reflect that in the
                // state/initialize structures.
                // Add partition column stats.
                addPartitionColumnStats(conf, partitionColsToRetrieve, schema, table, partList, columnStats);
                // FIXME: this add seems suspicious; about 10 lines below, the value returned by this method is used as betterDS
                stats.addToDataSize(getDataSizeFromColumnStats(nr, columnStats));
                stats.updateColumnStatsState(deriveStatType(columnStats, referencedColumns));
                stats.addToColumnStats(columnStats);
            } else {
                if (statsRetrieved) {
                    columnStats.addAll(convertColStats(aggrStats.getColStats(), table.getTableName()));
                }
                int colStatsAvailable = neededColumns.size() + partitionCols.size() - partitionColsToRetrieve.size();
                if (columnStats.size() != colStatsAvailable) {
                    LOG.debug("Column stats requested for : {} columns. Able to retrieve for {} columns", columnStats.size(), colStatsAvailable);
                }
                addPartitionColumnStats(conf, partitionColsToRetrieve, schema, table, partList, columnStats);
                long betterDS = getDataSizeFromColumnStats(nr, columnStats);
                stats.setDataSize((betterDS < 1 || columnStats.isEmpty()) ? ds : betterDS);
                // infer if any column can be primary key based on column statistics
                inferAndSetPrimaryKey(stats.getNumRows(), columnStats);
                stats.addToColumnStats(columnStats);
                // Infer column stats state
                stats.setColumnStatsState(deriveStatType(columnStats, referencedColumns));
                if (neededColumns.size() != neededColsToRetrieve.size() || partitionCols.size() != partitionColsToRetrieve.size()) {
                    // Include state for cached columns
                    stats.updateColumnStatsState(colStatsCache.getState());
                }
                // Change if we could not retrieve for all partitions
                if (aggrStats != null && aggrStats.getPartsFound() != partNames.size() && stats.getColumnStatsState() != State.NONE) {
                    stats.updateColumnStatsState(State.PARTIAL);
                    LOG.debug("Column stats requested for : {} partitions. Able to retrieve for {} partitions", partNames.size(), aggrStats.getPartsFound());
                }
            }
            if (rowCounts.size() == 0) {
                // all partitions are filtered by partition pruning
                stats.setBasicStatsState(State.COMPLETE);
            }
            // Sanity check that the column stats cache is working and stats are fetched from the metastore only once.
            if (colStatsCache != null && failIfCacheMiss && stats.getColumnStatsState().equals(State.COMPLETE) && (!neededColsToRetrieve.isEmpty() || !partitionColsToRetrieve.isEmpty())) {
                throw new HiveException("Cache has been loaded in logical planning phase for all columns; " + "however, stats for some columns could not be retrieved from it " + "(see messages above)");
            }
        }
    }
    return stats;
}
Also used : Partition(org.apache.hadoop.hive.ql.metadata.Partition) HiveException(org.apache.hadoop.hive.ql.metadata.HiveException) AggrStats(org.apache.hadoop.hive.metastore.api.AggrStats) ArrayList(java.util.ArrayList) Statistics(org.apache.hadoop.hive.ql.plan.Statistics) ColStatistics(org.apache.hadoop.hive.ql.plan.ColStatistics) PrunedPartitionList(org.apache.hadoop.hive.ql.parse.PrunedPartitionList) ColumnStatsList(org.apache.hadoop.hive.ql.parse.ColumnStatsList) List(java.util.List)
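
For the partitioned branch above, the data size is derived from three sources tried in order: the metastore RAW_DATA_SIZE of the pruned partitions, then TOTAL_SIZE scaled by the deserialization factor, and finally the file sizes on disk (only when HIVE_STATS_ESTIMATE_STATS is enabled). A minimal sketch of that fallback, assuming java.util.List is imported and using illustrative helper names rather than the real StatsUtils API:

private static long estimateDataSizeSketch(List<Long> rawDataSizes, List<Long> totalSizes, float deserFactor) {
    // 1) prefer the per-partition RAW_DATA_SIZE recorded in the metastore
    long ds = sumIgnoreNegatives(rawDataSizes);
    if (ds <= 0) {
        // 2) fall back to TOTAL_SIZE (serialized bytes on disk) scaled by the deserialization factor
        ds = (long) (sumIgnoreNegatives(totalSizes) * deserFactor);
    }
    // 3) if ds is still <= 0 the caller may list files, as getFileSizeForPartitions does above
    return ds;
}

private static long sumIgnoreNegatives(List<Long> values) {
    long sum = 0;
    for (long v : values) {
        // negative entries mean "unknown" and are ignored, mirroring getSumIgnoreNegatives
        if (v >= 0) {
            sum += v;
        }
    }
    return sum;
}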

Example 12 with Statistics

use of org.apache.hadoop.hive.ql.plan.Statistics in project hive by apache.

the class SparkMapJoinOptimizer method getMapJoinConversionInfo.

/**
 *   This method returns the big table position in a map-join. If the given join
 *   cannot be converted to a map-join (this could happen for several reasons, one
 *   of them being the presence of two or more big tables that cannot fit in memory),
 *   it returns -1.
 *
 *   Otherwise, it returns an int value that is the index of the big table in the set
 *   MapJoinProcessor.bigTableCandidateSet.
 *
 * @param joinOp
 * @param context
 * @return an array of 3 long values: the first is the big table position, the
 *   second is the size of the connected map joins, and the third is the combined
 *   data size of the small-table inputs.
 */
private long[] getMapJoinConversionInfo(JoinOperator joinOp, OptimizeSparkProcContext context) {
    Set<Integer> bigTableCandidateSet = MapJoinProcessor.getBigTableCandidates(joinOp.getConf().getConds());
    long maxSize = context.getConf().getLongVar(HiveConf.ConfVars.HIVECONVERTJOINNOCONDITIONALTASKTHRESHOLD);
    int bigTablePosition = -1;
    Statistics bigInputStat = null;
    long totalSize = 0;
    int pos = 0;
    // bigTableFound means we've encountered a table that's bigger than the
    // max. This table is either the big table or we cannot convert.
    boolean bigTableFound = false;
    boolean useTsStats = context.getConf().getBoolean(HiveConf.ConfVars.SPARK_USE_TS_STATS_FOR_MAPJOIN.varname, false);
    // When using TS stats, check whether any parent branch qualifies as the big table branch;
    // if so, mark that branch as the big table branch.
    if (useTsStats) {
        LOG.debug("Checking map join optimization for operator {} using TS stats", joinOp);
        for (Operator<? extends OperatorDesc> parentOp : joinOp.getParentOperators()) {
            if (isBigTableBranch(parentOp)) {
                if (bigTablePosition < 0 && bigTableCandidateSet.contains(pos) && !containUnionWithoutRS(parentOp.getParentOperators().get(0))) {
                    LOG.debug("Found a big table branch with parent operator {} and position {}", parentOp, pos);
                    bigTablePosition = pos;
                    bigTableFound = true;
                    bigInputStat = new Statistics(0, Long.MAX_VALUE);
                } else {
                    // Either we've found multiple big table branches, or the current branch cannot
                    // be a big table branch. Disable mapjoin for these cases.
                    LOG.debug("Cannot enable map join optimization for operator {}", joinOp);
                    return new long[] { -1, 0, 0 };
                }
            }
            pos++;
        }
    }
    pos = 0;
    for (Operator<? extends OperatorDesc> parentOp : joinOp.getParentOperators()) {
        // Skip the potential big table identified above
        if (pos == bigTablePosition) {
            pos++;
            continue;
        }
        Statistics currInputStat = null;
        if (useTsStats) {
            // Not adding other stats (e.g., # of rows, col stats) since only data size is used here
            for (TableScanOperator root : OperatorUtils.findOperatorsUpstream(parentOp, TableScanOperator.class)) {
                if (currInputStat == null) {
                    currInputStat = root.getStatistics().clone();
                } else {
                    currInputStat.addBasicStats(root.getStatistics());
                }
            }
        } else {
            currInputStat = parentOp.getStatistics();
        }
        if (currInputStat == null) {
            LOG.warn("Couldn't get statistics from: " + parentOp);
            return new long[] { -1, 0, 0 };
        }
        // We cannot convert if a parent chain contains a union without an intervening ReduceSink;
        // handling that case is tricky to implement, and we'll leave it as future work for now.
        if (containUnionWithoutRS(parentOp.getParentOperators().get(0))) {
            return new long[] { -1, 0, 0 };
        }
        long inputSize = currInputStat.getDataSize();
        if (bigInputStat == null || inputSize > bigInputStat.getDataSize()) {
            if (bigTableFound) {
                // We already chose a big table based on size and there's another one that's bigger; cannot convert.
                return new long[] { -1, 0, 0 };
            }
            if (inputSize > maxSize) {
                if (!bigTableCandidateSet.contains(pos)) {
                    // This table cannot be used as the big table, but it is too big for the map side.
                    return new long[] { -1, 0, 0 };
                }
                bigTableFound = true;
            }
            if (bigInputStat != null) {
                // we're replacing the current big table with a new one. Need
                // to count the current one as a map table then.
                totalSize += bigInputStat.getDataSize();
            }
            if (totalSize > maxSize) {
                // the sum of the small tables' sizes in this join exceeds the limit, hence cannot convert.
                return new long[] { -1, 0, 0 };
            }
            if (bigTableCandidateSet.contains(pos)) {
                bigTablePosition = pos;
                bigInputStat = currInputStat;
            }
        } else {
            totalSize += currInputStat.getDataSize();
            if (totalSize > maxSize) {
                // cannot hold all map tables in memory. Cannot convert.
                return new long[] { -1, 0, 0 };
            }
        }
        pos++;
    }
    if (bigTablePosition == -1) {
        // No big table candidates.
        return new long[] { -1, 0, 0 };
    }
    // Final check, find size of already-calculated Mapjoin Operators in same work (spark-stage).
    // We need to factor this in to prevent overwhelming Spark executor-memory.
    long connectedMapJoinSize = getConnectedMapJoinSize(joinOp.getParentOperators().get(bigTablePosition), joinOp, context);
    if ((connectedMapJoinSize + totalSize) > maxSize) {
        return new long[] { -1, 0, 0 };
    }
    return new long[] { bigTablePosition, connectedMapJoinSize, totalSize };
}
Also used : TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) Statistics(org.apache.hadoop.hive.ql.plan.Statistics)
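
The three values packed into the returned long[] have to be unpacked by position at the call site. A minimal, self-contained sketch of that interpretation, assuming the layout { bigTablePosition, connectedMapJoinSize, smallTablesTotalSize } produced by the return statements above (class and method names here are illustrative, not the actual SparkMapJoinOptimizer caller):

class MapJoinConversionInfoSketch {

    static String describe(long[] info) {
        if (info[0] < 0) {
            // a position of -1 means the join cannot be converted to a map join
            return "not convertible to a map join";
        }
        return "big table at position " + info[0]
                + ", connected map-join size " + info[1]
                + ", small-table data size " + info[2];
    }

    public static void main(String[] args) {
        System.out.println(describe(new long[] { -1, 0, 0 }));
        System.out.println(describe(new long[] { 1, 0, 123456L }));
    }
}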

Example 13 with Statistics

use of org.apache.hadoop.hive.ql.plan.Statistics in project hive by apache.

the class TezCompiler method getBloomFilterCost.

private static double getBloomFilterCost(SelectOperator sel, FilterOperator fil) {
    double cost = -1;
    Statistics selStats = sel.getStatistics();
    if (selStats != null) {
        cost = selStats.getNumRows();
        // Some other things that could be added here to model cost:
        // Cost of computing/sending partial BloomFilter results? BloomFilterSize * # mappers
        // For reduce-side join, add the cost of the semijoin table scan/dependent tablescans?
    }
    return cost;
}
Also used : AnnotateWithStatistics(org.apache.hadoop.hive.ql.optimizer.stats.annotation.AnnotateWithStatistics) Statistics(org.apache.hadoop.hive.ql.plan.Statistics) ColStatistics(org.apache.hadoop.hive.ql.plan.ColStatistics)

Example 14 with Statistics

use of org.apache.hadoop.hive.ql.plan.Statistics in project hive by apache.

the class TezCompiler method computeBloomFilterNetBenefit.

private static double computeBloomFilterNetBenefit(SelectOperator sel, ExprNodeDesc selExpr, FilterOperator fil, ExprNodeDesc tsExpr) {
    double netBenefit = -1;
    double benefit = getBloomFilterBenefit(sel, selExpr, fil, tsExpr);
    Statistics filStats = fil.getStatistics();
    if (benefit > 0 && filStats != null) {
        double cost = getBloomFilterCost(sel, fil);
        if (cost > 0) {
            long filDataSize = filStats.getNumRows();
            netBenefit = (benefit - cost) / filDataSize;
            LOG.debug("BloomFilter benefit=" + benefit + ", cost=" + cost + ", tsDataSize=" + filDataSize + ", netBenefit=" + (benefit - cost));
        }
    }
    LOG.debug("netBenefit=" + netBenefit);
    return netBenefit;
}
Also used : AnnotateWithStatistics(org.apache.hadoop.hive.ql.optimizer.stats.annotation.AnnotateWithStatistics) Statistics(org.apache.hadoop.hive.ql.plan.Statistics) ColStatistics(org.apache.hadoop.hive.ql.plan.ColStatistics)
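
To make the normalization concrete with made-up numbers: suppose getBloomFilterBenefit estimates that 9,900,000 rows would be filtered out of the table scan, getBloomFilterCost returns 1,000 (the row count of the semijoin's select branch), and the filter operator's row count is 10,000,000. Then netBenefit = (9,900,000 - 1,000) / 10,000,000 ≈ 0.99, i.e. the estimated benefit minus cost amounts to about 99% of the filtered table's rows; higher values indicate a more worthwhile semijoin reduction.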

Example 15 with Statistics

use of org.apache.hadoop.hive.ql.plan.Statistics in project hive by apache.

the class TezCompiler method getBloomFilterBenefit.

private static double getBloomFilterBenefit(SelectOperator sel, ExprNodeDesc selExpr, FilterOperator fil, ExprNodeDesc tsExpr) {
    double benefit = -1;
    Statistics selStats = sel.getStatistics();
    Statistics filStats = fil.getStatistics();
    if (selStats == null || filStats == null) {
        LOG.debug("No stats available to compute BloomFilter benefit");
        return benefit;
    }
    // For cardinality values use numRows as default, try to use ColStats if available
    long selKeyCardinality = selStats.getNumRows();
    long tsKeyCardinality = filStats.getNumRows();
    long tsRows = filStats.getNumRows();
    long tsRowSize = filStats.getAvgRowSize();
    long keyDomainCardinality = selKeyCardinality + tsKeyCardinality;
    ExprNodeColumnDesc selCol = ExprNodeDescUtils.getColumnExpr(selExpr);
    ExprNodeColumnDesc tsCol = ExprNodeDescUtils.getColumnExpr(tsExpr);
    if (selCol != null && tsCol != null) {
        // Check if there are column stats available for these columns
        ColStatistics selColStat = selStats.getColumnStatisticsFromColName(selCol.getColumn());
        ColStatistics filColStat = filStats.getColumnStatisticsFromColName(tsCol.getColumn());
        if (canUseNDV(selColStat)) {
            selKeyCardinality = selColStat.getCountDistint();
        }
        if (canUseNDV(filColStat)) {
            tsKeyCardinality = filColStat.getCountDistint();
        }
        // Get colstats for the original table column for selCol if possible, this would have
        // more accurate information about the original NDV of the column before any filtering.
        ColStatistics selColSourceStat = null;
        if (selColStat != null) {
            ExprNodeDescUtils.ColumnOrigin selColSource = ExprNodeDescUtils.findColumnOrigin(selCol, sel);
            if (selColSource != null && selColSource.op.getStatistics() != null) {
                selColSourceStat = selColSource.op.getStatistics().getColumnStatisticsFromColName(selColSource.col.getColumn());
            }
        }
        long domainCardinalityFromColStats = getCombinedKeyDomainCardinality(selColStat, selColSourceStat, filColStat);
        if (domainCardinalityFromColStats >= 0) {
            keyDomainCardinality = domainCardinalityFromColStats;
        }
    }
    // Selectivity: key cardinality of semijoin / domain cardinality
    // Benefit (rows filtered from ts): (1 - selectivity) * # ts rows
    double selectivity = selKeyCardinality / (double) keyDomainCardinality;
    selectivity = Math.min(selectivity, 1);
    benefit = tsRows * (1 - selectivity);
    if (LOG.isDebugEnabled()) {
        LOG.debug("BloomFilter benefit for " + selCol + " to " + tsCol + ", selKeyCardinality=" + selKeyCardinality + ", tsKeyCardinality=" + tsKeyCardinality + ", tsRows=" + tsRows + ", keyDomainCardinality=" + keyDomainCardinality);
        LOG.debug("SemiJoin key selectivity=" + selectivity + ", benefit=" + benefit);
    }
    return benefit;
}
Also used : ExprNodeColumnDesc(org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc) ColStatistics(org.apache.hadoop.hive.ql.plan.ColStatistics) ExprNodeDescUtils(org.apache.hadoop.hive.ql.plan.ExprNodeDescUtils) AnnotateWithStatistics(org.apache.hadoop.hive.ql.optimizer.stats.annotation.AnnotateWithStatistics) Statistics(org.apache.hadoop.hive.ql.plan.Statistics)
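
The benefit figure used in the net-benefit illustration above can be derived here with the same made-up numbers: if the semijoin's select branch has selKeyCardinality = 1,000 distinct key values, the combined keyDomainCardinality is 100,000, and the filtered table scan has tsRows = 10,000,000, then selectivity = 1,000 / 100,000 = 0.01 and benefit = 10,000,000 * (1 - 0.01) = 9,900,000 rows expected to be removed before the join. When column statistics are unavailable, the row counts themselves serve as fallback cardinalities, as the initial assignments at the top of the method show.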

Aggregations

Statistics (org.apache.hadoop.hive.ql.plan.Statistics) 15
ColStatistics (org.apache.hadoop.hive.ql.plan.ColStatistics) 13
ArrayList (java.util.ArrayList) 5
ReduceSinkOperator (org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) 4
AnnotateWithStatistics (org.apache.hadoop.hive.ql.optimizer.stats.annotation.AnnotateWithStatistics) 4
List (java.util.List) 3
TableScanOperator (org.apache.hadoop.hive.ql.exec.TableScanOperator) 3
HashMap (java.util.HashMap) 2
AggrStats (org.apache.hadoop.hive.metastore.api.AggrStats) 2
FilterOperator (org.apache.hadoop.hive.ql.exec.FilterOperator) 2
MapJoinOperator (org.apache.hadoop.hive.ql.exec.MapJoinOperator) 2
Operator (org.apache.hadoop.hive.ql.exec.Operator) 2
HiveException (org.apache.hadoop.hive.ql.metadata.HiveException) 2
Partition (org.apache.hadoop.hive.ql.metadata.Partition) 2
ColumnStatsList (org.apache.hadoop.hive.ql.parse.ColumnStatsList) 2
PrunedPartitionList (org.apache.hadoop.hive.ql.parse.PrunedPartitionList) 2
ExprNodeColumnDesc (org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc) 2
ExprNodeDesc (org.apache.hadoop.hive.ql.plan.ExprNodeDesc) 2
OperatorStats (org.apache.hadoop.hive.ql.stats.OperatorStats) 2
ImmutableMap (com.google.common.collect.ImmutableMap) 1