Search in sources :

Example 21 with ColStatistics

use of org.apache.hadoop.hive.ql.plan.ColStatistics in project hive by apache.

the class DescTableOperation method getColumnDataForPartitionKeyColumn.

/**
 * Collects the schema entry and column statistics for a partition key column.
 *
 * <p>The partition column named by the first entry of {@code colNames} is appended to
 * {@code cols}. Its statistics are obtained from {@code StatsUtils.getColStatsForPartCol}
 * over a {@link PartitionIterable} for the table, converted into a
 * {@link ColumnStatisticsObj}, and appended to {@code colStats}. Finally the column stats
 * state for {@code colNames} is recorded in {@code tableProps}.
 *
 * @param table      table that owns the partition column
 * @param cols       output list receiving the partition column's {@link FieldSchema}
 * @param colStats   output list receiving the generated statistics object
 * @param colNames   requested column names; only the first entry is used for the lookup
 * @param tableProps table properties updated through {@code StatsSetupConst.setColumnStatsState}
 */
private void getColumnDataForPartitionKeyColumn(Table table, List<FieldSchema> cols, List<ColumnStatisticsObj> colStats, List<String> colNames, Map<String, String> tableProps) throws HiveException, MetaException {
    FieldSchema partitionColumn = table.getPartColByName(colNames.get(0));
    cols.add(partitionColumn);

    // NOTE(review): the null third argument presumably means "no partition filter" — confirm.
    PartitionIterable partitions = new PartitionIterable(
        context.getDb(),
        table,
        null,
        MetastoreConf.getIntVar(context.getConf(), MetastoreConf.ConfVars.BATCH_RETRIEVE_MAX));
    ColumnInfo columnInfo = new ColumnInfo(
        partitionColumn.getName(),
        TypeInfoUtils.getTypeInfoFromTypeString(partitionColumn.getType()),
        null,
        false);
    ColStatistics partitionColStats = StatsUtils.getColStatsForPartCol(columnInfo, partitions, context.getConf());

    ColumnStatisticsData statsData = new ColumnStatisticsData();
    ColStatistics.Range range = partitionColStats.getRange();
    // Without a range every bound handed to the converter is null.
    final boolean hasRange = range != null;
    // The same min/max pair is passed for each low/high slot of the converter, the last
    // pair in string form; the average column length is passed twice as well.
    StatObjectConverter.fillColumnStatisticsData(
        partitionColumn.getType(),
        statsData,
        hasRange ? range.minValue : null,
        hasRange ? range.maxValue : null,
        hasRange ? range.minValue : null,
        hasRange ? range.maxValue : null,
        hasRange ? range.minValue.toString() : null,
        hasRange ? range.maxValue.toString() : null,
        partitionColStats.getNumNulls(),
        partitionColStats.getCountDistint(),
        null,
        partitionColStats.getAvgColLen(),
        partitionColStats.getAvgColLen(),
        partitionColStats.getNumTrues(),
        partitionColStats.getNumFalses());

    colStats.add(new ColumnStatisticsObj(partitionColumn.getName(), partitionColumn.getType(), statsData));
    StatsSetupConst.setColumnStatsState(tableProps, colNames);
}
Also used : ColumnStatisticsObj(org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj) PartitionIterable(org.apache.hadoop.hive.ql.metadata.PartitionIterable) FieldSchema(org.apache.hadoop.hive.metastore.api.FieldSchema) ColStatistics(org.apache.hadoop.hive.ql.plan.ColStatistics) ColumnInfo(org.apache.hadoop.hive.ql.exec.ColumnInfo) ColumnStatisticsData(org.apache.hadoop.hive.metastore.api.ColumnStatisticsData)

Example 22 with ColStatistics

use of org.apache.hadoop.hive.ql.plan.ColStatistics in project hive by apache.

the class ConvertJoinMapJoin method estimateNDV.

/**
 * Estimates the number of distinct values (NDV) of a set of columns taken together.
 *
 * <p>With exactly one column, that column's own distinct count is returned as is.
 * Otherwise the columns are treated as a single uniformly distributed attribute whose
 * domain size n is the product of the per-column distinct counts (counts of 1 or less
 * are skipped). The expected number of distinct values seen when drawing p values with
 * replacement from n possibilities is n * (1 - ((n - 1) / n) ^ p), with p = numRows.
 *
 * @param numRows     number of rows in the input
 * @param columnStats statistics of the columns being combined
 * @return the estimated distinct count; never above {@code numRows} when more than one
 *         column (or none) is supplied
 */
private static long estimateNDV(long numRows, List<ColStatistics> columnStats) {
    // A lone column already carries its own distinct-value count.
    if (columnStats.size() == 1) {
        return columnStats.get(0).getCountDistint();
    }

    // Domain size of the combined attribute: N1 * ... * Nm.
    long combinedDomain = 1L;
    for (ColStatistics columnStat : columnStats) {
        final long columnNdv = columnStat.getCountDistint();
        if (columnNdv <= 1) {
            continue;
        }
        combinedDomain = StatsUtils.safeMult(combinedDomain, columnNdv);
    }

    final double domain = combinedDomain;
    final double missRatio = (domain - 1d) / domain;
    if (missRatio == 1d) {
        // For a very large domain (n - 1) / n rounds to exactly 1.0, which would make the
        // formula below evaluate to 0; report every row as distinct instead.
        return numRows;
    }

    final double expectedDistinct = domain * (1d - Math.pow(missRatio, numRows));
    // Cap at the row count so that rounding cannot push the estimate above numRows.
    return Math.min(Math.round(expectedDistinct), numRows);
}
Also used : ColStatistics(org.apache.hadoop.hive.ql.plan.ColStatistics)

Example 23 with ColStatistics

use of org.apache.hadoop.hive.ql.plan.ColStatistics in project hive by apache.

the class ConvertJoinMapJoin method checkNumberOfEntriesForHashTable.

/**
 * Decides whether the given join input is small enough, in number of distinct key
 * entries, to be loaded into a map-join hash table.
 *
 * @param joinOp   join operator whose input is examined
 * @param position index of the parent (input) of {@code joinOp} to examine
 * @param context  optimizer context providing the configuration
 * @return {@code true} when the check is disabled (configured maximum below 1), when
 *         column statistics are missing for any key column, or when the estimated number
 *         of distinct entries does not exceed the maximum; {@code false} otherwise
 */
private boolean checkNumberOfEntriesForHashTable(JoinOperator joinOp, int position, OptimizeTezProcContext context) {
    final long max = HiveConf.getLongVar(context.parseContext.getConf(), HiveConf.ConfVars.HIVECONVERTJOINMAXENTRIESHASHTABLE);
    // A maximum below 1 switches the check off.
    if (max < 1) {
        return true;
    }

    // Gather the statistics of every key column of the selected input.
    final ReduceSinkOperator parentSink = (ReduceSinkOperator) joinOp.getParentOperators().get(position);
    final List<String> keyNames = StatsUtils.getQualifedReducerKeyNames(parentSink.getConf().getOutputKeyColumnNames());
    final Statistics parentStats = parentSink.getStatistics();
    final List<ColStatistics> keyStats = new ArrayList<>();
    for (String keyName : keyNames) {
        final ColStatistics keyStat = parentStats.getColumnStatisticsFromColName(keyName);
        if (keyStat == null) {
            // Without statistics for a key no estimate is possible; let the conversion pass.
            return true;
        }
        keyStats.add(keyStat);
    }

    final long estimation = estimateNDV(parentStats.getNumRows(), keyStats);
    LOG.debug("Estimated NDV for input {}: {}; Max NDV for MapJoin conversion: {}", position, estimation, max);

    final boolean withinLimit = estimation <= max;
    if (!withinLimit) {
        LOG.debug("Number of different entries for HashTable is greater than the max; " + "we do not convert to MapJoin");
    }
    return withinLimit;
}
Also used : ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) ColStatistics(org.apache.hadoop.hive.ql.plan.ColStatistics) ArrayList(java.util.ArrayList) Statistics(org.apache.hadoop.hive.ql.plan.Statistics)

Example 24 with ColStatistics

use of org.apache.hadoop.hive.ql.plan.ColStatistics in project hive by apache.

the class ConvertJoinMapJoin method hashTableDataSizeAdjustment.

/**
 * In data calculation logic, we include some overhead due to java object refs, etc.
 * However, this overhead may be different when storing values in hashtable for mapjoin.
 * Hence, we calculate a size adjustment to the original data size for a given input.
 *
 * <p>For every non-null column statistic, a per-value overhead is chosen from the column
 * type (string/varchar/char, binary, timestamp/timestamp with local time zone/decimal/date;
 * zero for any other type), multiplied by the column's non-null row count, and summed
 * using the overflow-safe helpers in {@code StatsUtils}.
 *
 * @param numRows  number of rows of the input
 * @param colStats column statistics of the input; may be {@code null} or empty
 * @return the accumulated adjustment, or 0 when {@code numRows <= 0} or no column
 *         statistics are supplied
 */
private static long hashTableDataSizeAdjustment(long numRows, List<ColStatistics> colStats) {
    long result = 0;
    if (numRows <= 0 || colStats == null || colStats.isEmpty()) {
        return result;
    }
    for (ColStatistics cs : colStats) {
        if (cs == null) {
            continue;
        }
        // Lower-case with Locale.ROOT: the default-locale overload would turn 'I' into a
        // dotless 'ı' under Turkish/Azeri locales, so type names such as "BINARY" or
        // "STRING" would no longer match the serde constants. Fully qualified so that no
        // additional import is required.
        String colTypeLowerCase = cs.getColumnType().toLowerCase(java.util.Locale.ROOT);
        long nonNullCount = cs.getNumNulls() > 0 ? numRows - cs.getNumNulls() + 1 : numRows;
        // Types not listed below contribute no adjustment.
        double overhead = 0;
        if (colTypeLowerCase.equals(serdeConstants.STRING_TYPE_NAME)
            || colTypeLowerCase.startsWith(serdeConstants.VARCHAR_TYPE_NAME)
            || colTypeLowerCase.startsWith(serdeConstants.CHAR_TYPE_NAME)) {
            overhead = JavaDataModel.get().lengthForStringOfLength(0);
        } else if (colTypeLowerCase.equals(serdeConstants.BINARY_TYPE_NAME)) {
            overhead = JavaDataModel.get().lengthForByteArrayOfSize(0);
        } else if (colTypeLowerCase.equals(serdeConstants.TIMESTAMP_TYPE_NAME)
            || colTypeLowerCase.equals(serdeConstants.TIMESTAMPLOCALTZ_TYPE_NAME)
            || colTypeLowerCase.startsWith(serdeConstants.DECIMAL_TYPE_NAME)
            || colTypeLowerCase.equals(serdeConstants.DATE_TYPE_NAME)) {
            overhead = JavaDataModel.get().object();
        }
        result = StatsUtils.safeAdd(StatsUtils.safeMult(nonNullCount, overhead), result);
    }
    return result;
}
Also used : ColStatistics(org.apache.hadoop.hive.ql.plan.ColStatistics)

Example 25 with ColStatistics

use of org.apache.hadoop.hive.ql.plan.ColStatistics in project hive by apache.

the class HiveRelMdSize method averageColumnSizes.

// ~ Methods ----------------------------------------------------------------
/**
 * Computes the average size of each field of a Hive table scan.
 *
 * <p>One entry is produced per field position. Columns that are not among the scan's
 * needed columns get 0. Needed non-virtual columns use the average column length from
 * their column statistics, falling back to {@code averageTypeValueSize} of the field type
 * when no statistic is available. Needed columns at or beyond the non-virtual column
 * count always use {@code averageTypeValueSize}.
 *
 * @param scan the table scan whose columns are sized
 * @param mq   metadata query; not used by this method
 * @return immutable list of average sizes
 */
public List<Double> averageColumnSizes(HiveTableScan scan, RelMetadataQuery mq) {
    final List<Integer> neededCols = scan.getNeededColIndxsFrmReloptHT();
    final RelOptHiveTable hiveTable = (RelOptHiveTable) scan.getTable();
    // The returned statistics are consumed in order, one per needed non-virtual column.
    final List<ColStatistics> neededColStats = hiveTable.getColStat(neededCols, true);
    final ImmutableList.Builder<Double> sizes = ImmutableList.builder();
    final int nonVirtualCount = hiveTable.getNoOfNonVirtualCols();
    final int fieldCount = scan.getRowType().getFieldCount();

    // Non-virtual columns: prefer column statistics, else the type-based default.
    int statsCursor = 0;
    for (int col = 0; col < nonVirtualCount; col++) {
        if (!neededCols.contains(col)) {
            sizes.add(Double.valueOf(0));
            continue;
        }
        final ColStatistics stats = neededColStats.get(statsCursor++);
        if (stats != null) {
            sizes.add(stats.getAvgColLen());
        } else {
            sizes.add(averageTypeValueSize(scan.getRowType().getFieldList().get(col).getType()));
        }
    }

    // Remaining columns: only the type-based default is used.
    for (int col = nonVirtualCount; col < fieldCount; col++) {
        if (!neededCols.contains(col)) {
            sizes.add(Double.valueOf(0));
            continue;
        }
        sizes.add(averageTypeValueSize(scan.getRowType().getFieldList().get(col).getType()));
    }
    return sizes.build();
}
Also used : RelDataTypeField(org.apache.calcite.rel.type.RelDataTypeField) RelOptHiveTable(org.apache.hadoop.hive.ql.optimizer.calcite.RelOptHiveTable) ImmutableList(com.google.common.collect.ImmutableList) ColStatistics(org.apache.hadoop.hive.ql.plan.ColStatistics)

Aggregations

ColStatistics (org.apache.hadoop.hive.ql.plan.ColStatistics): 42; ArrayList (java.util.ArrayList): 14; Statistics (org.apache.hadoop.hive.ql.plan.Statistics): 8; HashSet (java.util.HashSet): 5; ColumnInfo (org.apache.hadoop.hive.ql.exec.ColumnInfo): 5; HashMap (java.util.HashMap): 4; ColumnStatisticsObj (org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj): 4; HiveException (org.apache.hadoop.hive.ql.metadata.HiveException): 4; ExprNodeColumnDesc (org.apache.hadoop.hive.ql.plan.ExprNodeColumnDesc): 4; ExprNodeDesc (org.apache.hadoop.hive.ql.plan.ExprNodeDesc): 4; List (java.util.List): 3; ImmutableBitSet (org.apache.calcite.util.ImmutableBitSet): 3; AggrStats (org.apache.hadoop.hive.metastore.api.AggrStats): 3; ReduceSinkOperator (org.apache.hadoop.hive.ql.exec.ReduceSinkOperator): 3; RelMetadataQuery (org.apache.calcite.rel.metadata.RelMetadataQuery): 2; ColumnStatisticsData (org.apache.hadoop.hive.metastore.api.ColumnStatisticsData): 2; Partition (org.apache.hadoop.hive.ql.metadata.Partition): 2; PartitionIterable (org.apache.hadoop.hive.ql.metadata.PartitionIterable): 2; AnnotateWithStatistics (org.apache.hadoop.hive.ql.optimizer.stats.annotation.AnnotateWithStatistics): 2; ColumnStatsList (org.apache.hadoop.hive.ql.parse.ColumnStatsList): 2