Search in sources:

Example 26 with ColStatistics

use of org.apache.hadoop.hive.ql.plan.ColStatistics in project hive by apache.

From the class StatsUtils, method scaleColStatistics.

/**
 * Scales the per-column counters (null/true/false counts) of every statistic
 * in place by the given factor. The distinct-value count (NDV) is only ever
 * reduced: when the factor is below 1.0 the NDV is scaled down and rounded up
 * with {@code Math.ceil}; scaling rows up does not imply more distinct values,
 * so NDV is left untouched for factors of 1.0 or more.
 *
 * @param colStats column statistics to modify in place
 * @param factor   multiplier applied to the counters
 */
public static void scaleColStatistics(List<ColStatistics> colStats, double factor) {
    for (ColStatistics colStat : colStats) {
        colStat.setNumFalses(StatsUtils.safeMult(colStat.getNumFalses(), factor));
        colStat.setNumTrues(StatsUtils.safeMult(colStat.getNumTrues(), factor));
        colStat.setNumNulls(StatsUtils.safeMult(colStat.getNumNulls(), factor));
        if (factor < 1.0) {
            // Shrink the NDV proportionally; clamp at Long.MAX_VALUE before
            // narrowing the double back to long.
            final double scaledNdv = Math.ceil(colStat.getCountDistint() * factor);
            colStat.setCountDistint(scaledNdv > Long.MAX_VALUE ? Long.MAX_VALUE : (long) scaledNdv);
        }
    }
}
Also used : ColStatistics(org.apache.hadoop.hive.ql.plan.ColStatistics)

Example 27 with ColStatistics

use of org.apache.hadoop.hive.ql.plan.ColStatistics in project hive by apache.

From the class StatsUtils, method extractNDVGroupingColumns.

/**
 * Collects the NDV (number of distinct values) of each grouping column so the
 * caller can combine them (e.g. as a product).
 *
 * A missing column statistic is handled in two ways: when the parent's column
 * statistics are COMPLETE the column is simply skipped (it contributes nothing
 * to the product of NDVs); with only partial statistics a missing entry means
 * the worst case must be assumed, signalled by a {@code null} return.
 *
 * @param colStats    per-column statistics of the grouping columns (entries may be null)
 * @param parentStats statistics of the parent operator, used for the stats-state check
 * @return one NDV per available column, or {@code null} when stats are partial
 *         and a column is missing
 */
private static List<Long> extractNDVGroupingColumns(List<ColStatistics> colStats, Statistics parentStats) {
    List<Long> ndvValues = new ArrayList<>(colStats.size());
    for (ColStatistics colStat : colStats) {
        if (colStat == null) {
            if (parentStats.getColumnStatsState().equals(Statistics.State.COMPLETE)) {
                // Complete stats: this column just does not take part in the
                // product of NDVs.
                continue;
            }
            // Partial column statistics on grouping attributes: a missing
            // column statistic forces the worst-case assumption.
            return null;
        }
        long ndv = colStat.getCountDistint();
        if (colStat.getNumNulls() > 0) {
            // Nulls form one additional distinct group.
            ndv = StatsUtils.safeAdd(ndv, 1);
        }
        ndvValues.add(ndv);
    }
    return ndvValues;
}
Also used : ArrayList(java.util.ArrayList) ColStatistics(org.apache.hadoop.hive.ql.plan.ColStatistics)

Example 28 with ColStatistics

use of org.apache.hadoop.hive.ql.plan.ColStatistics in project hive by apache.

From the class ConvertJoinMapJoin, method hashTableDataSizeAdjustment.

/**
 * In data calculation logic, we include some overhead due to java object refs, etc.
 * However, this overhead may be different when storing values in hashtable for mapjoin.
 * Hence, we calculate a size adjustment to the original data size for a given input.
 *
 * The adjustment is the per-type JavaDataModel overhead multiplied by the
 * number of non-null values, summed over all columns with statistics.
 *
 * @param numRows  row count of the input; non-positive short-circuits to 0
 * @param colStats per-column statistics (entries may be null and are skipped)
 * @return the accumulated size adjustment in bytes, 0 when nothing applies
 */
private static long hashTableDataSizeAdjustment(long numRows, List<ColStatistics> colStats) {
    if (numRows <= 0 || colStats == null || colStats.isEmpty()) {
        return 0;
    }
    long adjustment = 0;
    for (ColStatistics colStat : colStats) {
        if (colStat == null) {
            continue;
        }
        final String type = colStat.getColumnType().toLowerCase();
        final long numNulls = colStat.getNumNulls();
        // When nulls are present, count them as a single shared entry.
        final long nonNullCount = numNulls > 0 ? numRows - numNulls + 1 : numRows;
        double perValueOverhead = 0;
        if (type.equals(serdeConstants.STRING_TYPE_NAME)
                || type.startsWith(serdeConstants.VARCHAR_TYPE_NAME)
                || type.startsWith(serdeConstants.CHAR_TYPE_NAME)) {
            // String-family types: fixed per-string object overhead.
            perValueOverhead = JavaDataModel.get().lengthForStringOfLength(0);
        } else if (type.equals(serdeConstants.BINARY_TYPE_NAME)) {
            // Binary: per-byte-array object overhead.
            perValueOverhead = JavaDataModel.get().lengthForByteArrayOfSize(0);
        } else if (type.equals(serdeConstants.TIMESTAMP_TYPE_NAME)
                || type.equals(serdeConstants.TIMESTAMPLOCALTZ_TYPE_NAME)
                || type.startsWith(serdeConstants.DECIMAL_TYPE_NAME)
                || type.equals(serdeConstants.DATE_TYPE_NAME)) {
            // Boxed temporal/decimal types: plain object header overhead.
            perValueOverhead = JavaDataModel.get().object();
        }
        adjustment = StatsUtils.safeAdd(StatsUtils.safeMult(nonNullCount, perValueOverhead), adjustment);
    }
    return adjustment;
}
Also used : ColStatistics(org.apache.hadoop.hive.ql.plan.ColStatistics)

Example 29 with ColStatistics

use of org.apache.hadoop.hive.ql.plan.ColStatistics in project hive by apache.

From the class ConvertJoinMapJoin, method checkNumberOfEntriesForHashTable.

/**
 * Checks whether the estimated number of distinct hash-table entries for the
 * given join input stays within the configured limit
 * ({@code HIVECONVERTJOINMAXENTRIESHASHTABLE}).
 *
 * @param joinOp   the join operator under consideration for MapJoin conversion
 * @param position index of the parent (input) to check
 * @param context  processing context providing the configuration
 * @return true if it passes the test (limit disabled, stats missing, or
 *         estimate within the limit), false otherwise
 */
private boolean checkNumberOfEntriesForHashTable(JoinOperator joinOp, int position, OptimizeTezProcContext context) {
    final long maxEntries = HiveConf.getLongVar(context.parseContext.getConf(), HiveConf.ConfVars.HIVECONVERTJOINMAXENTRIESHASHTABLE);
    if (maxEntries < 1) {
        // The limit is disabled; the check trivially passes.
        return true;
    }
    // Gather column statistics for every reducer key of this input.
    ReduceSinkOperator rsOp = (ReduceSinkOperator) joinOp.getParentOperators().get(position);
    Statistics inputStats = rsOp.getStatistics();
    List<String> keyNames = StatsUtils.getQualifedReducerKeyNames(rsOp.getConf().getOutputKeyColumnNames());
    List<ColStatistics> keyStats = new ArrayList<>();
    for (String keyName : keyNames) {
        ColStatistics colStat = inputStats.getColumnStatisticsFromColName(keyName);
        if (colStat == null) {
            // No statistics for this key column: cannot estimate, so pass.
            return true;
        }
        keyStats.add(colStat);
    }
    final long numRows = inputStats.getNumRows();
    final long estimation = estimateNDV(numRows, keyStats);
    LOG.debug("Estimated NDV for input {}: {}; Max NDV for MapJoin conversion: {}", position, estimation, maxEntries);
    if (estimation > maxEntries) {
        LOG.debug("Number of different entries for HashTable is greater than the max; " + "we do not convert to MapJoin");
        return false;
    }
    // Within the limit; conversion may proceed.
    return true;
}
Also used : ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) ColStatistics(org.apache.hadoop.hive.ql.plan.ColStatistics) ArrayList(java.util.ArrayList) Statistics(org.apache.hadoop.hive.ql.plan.Statistics) ColStatistics(org.apache.hadoop.hive.ql.plan.ColStatistics)

Example 30 with ColStatistics

use of org.apache.hadoop.hive.ql.plan.ColStatistics in project hive by apache.

From the class ConvertJoinMapJoin, method estimateNDV.

/**
 * Estimates the number of distinct values over the combination of the given
 * columns for a relation with {@code numRows} rows.
 *
 * For a single column the stored NDV is returned directly. For several
 * columns, uniformly distributed attributes with N1 ... Nm distinct values
 * behave like a single attribute with N1 * ... * Nm distinct values, and the
 * expected number of distinct values when drawing p values with replacement
 * from n integers is n * (1 - ((n - 1) / n) ^ p).
 *
 * @param numRows     number of rows in the input
 * @param columnStats statistics of the key columns; must be non-empty
 * @return the estimated combined NDV, capped at {@code numRows}
 */
private static long estimateNDV(long numRows, List<ColStatistics> columnStats) {
    if (columnStats.size() == 1) {
        // Single column: its distinct-value count is the answer.
        return columnStats.get(0).getCountDistint();
    }
    // Product of the per-column NDVs (ignoring degenerate columns with <= 1
    // distinct value), with overflow-safe multiplication.
    long combinedNdv = 1L;
    for (ColStatistics colStat : columnStats) {
        final long ndv = colStat.getCountDistint();
        if (ndv > 1) {
            combinedNdv = StatsUtils.safeMult(combinedNdv, ndv);
        }
    }
    final double n = combinedNdv;
    final double ratio = (n - 1d) / n;
    if (ratio == 1d) {
        // (n - 1) / n rounds to 1 in double precision when n is very large;
        // then every row is expected to be distinct.
        return numRows;
    }
    final double expected = n * (1d - Math.pow(ratio, numRows));
    // Cap at numRows: rounding in the formula can push the estimate a few
    // percent over the actual row count.
    return Math.min(Math.round(expected), numRows);
}
Also used : ColStatistics(org.apache.hadoop.hive.ql.plan.ColStatistics)

Aggregations

ColStatistics (org.apache.hadoop.hive.ql.plan.ColStatistics)42 ArrayList (java.util.ArrayList)14 Statistics (org.apache.hadoop.hive.ql.plan.Statistics)8 HashSet (java.util.HashSet)5 ColumnInfo (org.apache.hadoop.hive.ql.exec.ColumnInfo)5 HashMap (java.util.HashMap)4 ColumnStatisticsObj (org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj)4 HiveException (org.apache.hadoop.hive.ql.metadata.HiveException)4 ExprNodeColumnDesc (org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc)4 ExprNodeDesc (org.apache.hadoop.hive.ql.plan.ExprNodeDesc)4 List (java.util.List)3 ImmutableBitSet (org.apache.calcite.util.ImmutableBitSet)3 AggrStats (org.apache.hadoop.hive.metastore.api.AggrStats)3 ReduceSinkOperator (org.apache.hadoop.hive.ql.exec.ReduceSinkOperator)3 RelMetadataQuery (org.apache.calcite.rel.metadata.RelMetadataQuery)2 ColumnStatisticsData (org.apache.hadoop.hive.metastore.api.ColumnStatisticsData)2 Partition (org.apache.hadoop.hive.ql.metadata.Partition)2 PartitionIterable (org.apache.hadoop.hive.ql.metadata.PartitionIterable)2 AnnotateWithStatistics (org.apache.hadoop.hive.ql.optimizer.stats.annotation.AnnotateWithStatistics)2 ColumnStatsList (org.apache.hadoop.hive.ql.parse.ColumnStatsList)2