Search in sources :

Example 11 with ColStatistics

use of org.apache.hadoop.hive.ql.plan.ColStatistics in project hive by apache.

the class StatsRulesProcFactory method updateStats.

/**
 * Rescales {@code stats} in place so that it describes {@code newNumRows} rows.
 *
 * <p>The scaling ratio is {@code newNumRows / stats.getNumRows()}, computed before the row
 * count is overwritten. A negative {@code newNumRows} is passed through
 * {@code StatsUtils.getMaxIfOverflow} (the log message indicates this yields Long.MAX_VALUE),
 * and a value of 0 is bumped to 1.
 *
 * @param stats statistics object to mutate
 * @param newNumRows estimated output row count
 * @param useColStats when true, per-column null counts (and optionally NDVs) are scaled and the
 *        data size is recomputed from the column statistics; when false only the existing data
 *        size is multiplied by the ratio
 * @param op operator used only for the debug log prefix
 * @param updateNDV when true (and {@code useColStats} is true), distinct-value counts are scaled
 *        down whenever the ratio is at most 1.0
 */
static void updateStats(Statistics stats, long newNumRows, boolean useColStats, Operator<? extends OperatorDesc> op, boolean updateNDV) {
    long targetRows = newNumRows;
    if (targetRows < 0) {
        LOG.debug("STATS-" + op.toString() + ": Overflow in number of rows. " + targetRows + " rows will be set to Long.MAX_VALUE");
        targetRows = StatsUtils.getMaxIfOverflow(targetRows);
    }
    if (targetRows == 0) {
        LOG.debug("STATS-" + op.toString() + ": Equals 0 in number of rows. " + targetRows + " rows will be set to 1");
        targetRows = 1;
    }

    // The previous row count must be read before it is replaced below.
    final long previousRows = stats.getNumRows();
    final double ratio = (double) targetRows / (double) previousRows;
    stats.setNumRows(targetRows);

    if (!useColStats) {
        // No column statistics: scale the existing data size by the row ratio.
        final long scaledSize = (long) (ratio * stats.getDataSize());
        stats.setDataSize(StatsUtils.getMaxIfOverflow(scaledSize));
        return;
    }

    final List<ColStatistics> columns = stats.getColumnStats();
    for (ColStatistics column : columns) {
        final long previousNulls = column.getNumNulls();
        final long previousNdv = column.getCountDistint();
        column.setNumNulls(Math.round(ratio * previousNulls));
        if (updateNDV) {
            // Only shrink the NDV when the output has no more rows than the input.
            final long scaledNdv = (ratio <= 1.0) ? (long) Math.ceil(ratio * previousNdv) : previousNdv;
            column.setCountDistint(scaledNdv);
        }
    }
    stats.setColumnStats(columns);
    final long sizeFromColumns = StatsUtils.getDataSizeFromColumnStats(targetRows, columns);
    stats.setDataSize(StatsUtils.getMaxIfOverflow(sizeFromColumns));
}
Also used : ColStatistics(org.apache.hadoop.hive.ql.plan.ColStatistics)

Example 12 with ColStatistics

use of org.apache.hadoop.hive.ql.plan.ColStatistics in project hive by apache.

the class HiveRelMdSize method averageColumnSizes.

//~ Methods ----------------------------------------------------------------
/**
 * Returns one average-size entry for every field of the scan's row type.
 *
 * <p>Fields that are not in the scan's needed-column list get {@code 0}. Needed fields get the
 * average column length from their column statistics, or a type-based default from
 * {@code averageTypeValueSize} when no statistics object is available for that column.
 *
 * @param scan table scan whose columns are sized
 * @param mq metadata query (not used by this method)
 * @return immutable list with {@code scan.getRowType().getFieldCount()} entries
 */
public List<Double> averageColumnSizes(HiveTableScan scan, RelMetadataQuery mq) {
    List<Integer> neededcolsLst = scan.getNeededColIndxsFrmReloptHT();
    // Obtain list of col stats; entries may be null, in which case a default is used below.
    List<ColStatistics> columnStatistics = ((RelOptHiveTable) scan.getTable()).getColStat(neededcolsLst, true);
    final ImmutableList.Builder<Double> list = ImmutableList.builder();
    // Position inside columnStatistics; advanced once per needed field encountered.
    // NOTE(review): this assumes getColStat returns stats in ascending field-index order — confirm.
    int indxRqdCol = 0;
    int nFields = scan.getRowType().getFieldCount();
    for (int i = 0; i < nFields; i++) {
        if (neededcolsLst.contains(i)) {
            ColStatistics columnStatistic = columnStatistics.get(indxRqdCol);
            indxRqdCol++;
            if (columnStatistic == null) {
                RelDataTypeField field = scan.getRowType().getFieldList().get(i);
                list.add(averageTypeValueSize(field.getType()));
            } else {
                list.add(columnStatistic.getAvgColLen());
            }
        } else {
            // Double.valueOf replaces the deprecated Double(double) constructor; same value.
            list.add(Double.valueOf(0d));
        }
    }
    return list.build();
}
Also used : RelDataTypeField(org.apache.calcite.rel.type.RelDataTypeField) RelOptHiveTable(org.apache.hadoop.hive.ql.optimizer.calcite.RelOptHiveTable) ImmutableList(com.google.common.collect.ImmutableList) ColStatistics(org.apache.hadoop.hive.ql.plan.ColStatistics)

Example 13 with ColStatistics

use of org.apache.hadoop.hive.ql.plan.ColStatistics in project hive by apache.

the class HiveRelMdUniqueKeys method getUniqueKeys.

/*
 * Infers unique keys for a Project.
 *
 * When a HiveTableScan is found below the Project, a projected column is
 * reported as a single-column key if either
 *   - ndv(col) >= rowCount, or
 *   - max(col) - min(col) + 1 equals rowCount (within RelOptUtil.EPSILON),
 *     when the column statistics carry a range with both bounds.
 *
 * Why are we intercepting Project and not TableScan? Because if we
 * have a method for TableScan, it will not know which columns to check for.
 * Inferring uniqueness for all columns is very expensive right now. The flip
 * side of doing this is, it only works post Field Trimming.
 */
public Set<ImmutableBitSet> getUniqueKeys(Project rel, RelMetadataQuery mq, boolean ignoreNulls) {
    final HiveTableScan tScan = getTableScan(rel.getInput(), false);
    if (tScan == null) {
        // No HiveTableScan found (e.g. the input is not a plain sequence of
        // Project and Filter operators): fall back to deriving keys from the
        // child. A project maps rows to a different set of rows, so without
        // knowing whether an expression preserves uniqueness it is only safe
        // to carry over child keys through identity mappings f(a) => a.
        // The child's key bitsets must also be renumbered to project output
        // positions.
        final Set<ImmutableBitSet> projectedKeys = new HashSet<>();
        final Map<Integer, Integer> inputToOutput = new HashMap<>();
        int outputPos = 0;
        for (RexNode expr : rel.getProjects()) {
            if (expr instanceof RexInputRef) {
                inputToOutput.put(((RexInputRef) expr).getIndex(), outputPos);
            }
            outputPos++;
        }
        if (inputToOutput.isEmpty()) {
            // Nothing is passed through unchanged, so no key can be derived.
            return projectedKeys;
        }
        final Set<ImmutableBitSet> childKeys = mq.getUniqueKeys(rel.getInput(), ignoreNulls);
        if (childKeys == null) {
            return projectedKeys;
        }
        for (ImmutableBitSet childKey : childKeys) {
            final ImmutableBitSet.Builder mapped = ImmutableBitSet.builder();
            boolean fullyProjected = true;
            for (int inputPos : childKey) {
                final Integer target = inputToOutput.get(inputPos);
                if (target == null) {
                    // Part of this child key is not projected; drop the key.
                    fullyProjected = false;
                    break;
                }
                mapped.set(target);
            }
            if (fullyProjected) {
                projectedKeys.add(mapped.build());
            }
        }
        return projectedKeys;
    }

    // Statistics-based inference. statPosToProjectPos maps the n-th input
    // reference seen in the project list to its output position.
    final Map<Integer, Integer> statPosToProjectPos = new HashMap<Integer, Integer>();
    final BitSet referencedInputs = new BitSet();
    int refCount = 0;
    int projectIdx = 0;
    for (RexNode expr : rel.getProjects()) {
        if (expr instanceof RexInputRef) {
            referencedInputs.set(((RexInputRef) expr).getIndex());
            statPosToProjectPos.put(refCount, projectIdx);
            refCount++;
        }
        projectIdx++;
    }

    final double rowCount = tScan.getRows();
    final List<ColStatistics> statsPerColumn = tScan.getColStat(BitSets.toList(referencedInputs));
    final Set<ImmutableBitSet> keys = new HashSet<ImmutableBitSet>();
    for (int statIdx = 0; statIdx < statsPerColumn.size(); statIdx++) {
        final ColStatistics stat = statsPerColumn.get(statIdx);
        boolean unique = stat.getCountDistint() >= rowCount;
        if (!unique) {
            if (stat.getRange() != null && stat.getRange().maxValue != null && stat.getRange().minValue != null) {
                // A dense value range spanning exactly rowCount values is treated as a key.
                final double span = stat.getRange().maxValue.doubleValue() - stat.getRange().minValue.doubleValue() + 1;
                unique = Math.abs(rowCount - span) < RelOptUtil.EPSILON;
            }
        }
        if (unique) {
            keys.add(ImmutableBitSet.of(statPosToProjectPos.get(statIdx)));
        }
    }
    return keys;
}
Also used : ImmutableBitSet(org.apache.calcite.util.ImmutableBitSet) HashMap(java.util.HashMap) ImmutableBitSet(org.apache.calcite.util.ImmutableBitSet) BitSet(java.util.BitSet) ColStatistics(org.apache.hadoop.hive.ql.plan.ColStatistics) RexInputRef(org.apache.calcite.rex.RexInputRef) HiveTableScan(org.apache.hadoop.hive.ql.optimizer.calcite.reloperators.HiveTableScan) RexNode(org.apache.calcite.rex.RexNode) HashSet(java.util.HashSet)

Example 14 with ColStatistics

use of org.apache.hadoop.hive.ql.plan.ColStatistics in project hive by apache.

the class HiveRelMdDistinctRowCount method getDistinctRowCount.

/**
 * Estimates the number of distinct rows of a table scan for the given group key.
 *
 * <p>The estimate is the product of the distinct-value counts of the columns selected by
 * {@code groupKey}, capped at the scan's row count.
 *
 * @param htRel table scan providing column statistics and the row count
 * @param mq metadata query (not used by this method)
 * @param groupKey bit set translated to the column index list via
 *        {@code HiveCalciteUtil.translateBitSetToProjIndx}
 * @param predicate not used by this method
 * @return {@code min(product of NDVs, htRel.getRows())}
 */
private Double getDistinctRowCount(HiveTableScan htRel, RelMetadataQuery mq, ImmutableBitSet groupKey, RexNode predicate) {
    List<Integer> projIndxLst = HiveCalciteUtil.translateBitSetToProjIndx(groupKey);
    List<ColStatistics> colStats = htRel.getColStat(projIndxLst);
    // Primitive accumulator avoids unboxing/reboxing a Double on every iteration.
    double noDistinctRows = 1.0;
    for (ColStatistics cStat : colStats) {
        noDistinctRows *= cStat.getCountDistint();
    }
    return Math.min(noDistinctRows, htRel.getRows());
}
Also used : ColStatistics(org.apache.hadoop.hive.ql.plan.ColStatistics)

Example 15 with ColStatistics

use of org.apache.hadoop.hive.ql.plan.ColStatistics in project hive by apache.

the class StatsUtils method collectStatistics.

/**
 * Builds a {@link Statistics} object (row count, data size and, optionally, column statistics)
 * for a table.
 *
 * <p>Unpartitioned tables are handled in the first branch. Partitioned tables are handled only
 * when {@code partList} is non-null; for a partitioned table with a null {@code partList} the
 * freshly created, unpopulated {@code Statistics} is returned.
 *
 * @param conf configuration; read here for HIVE_STATS_DESERIALIZATION_FACTOR and passed to helpers
 * @param partList pruned partition list; consulted only for partitioned tables
 * @param table table whose statistics are collected
 * @param schema column schema passed to the row-count / row-size / column-stat helpers
 * @param neededColumns columns for which column statistics are requested
 * @param referencedColumns columns used when deriving the column-stats state in the
 *        partitioned branch
 * @param fetchColStats whether column statistics should be fetched
 * @param fetchPartStats whether per-partition basic stats (row count, data size) should be fetched
 * @return the populated statistics object
 * @throws HiveException declared by this method; presumably raised by the metastore-facing
 *         helpers — confirm against their signatures
 */
public static Statistics collectStatistics(HiveConf conf, PrunedPartitionList partList, Table table, List<ColumnInfo> schema, List<String> neededColumns, List<String> referencedColumns, boolean fetchColStats, boolean fetchPartStats) throws HiveException {
    Statistics stats = new Statistics();
    // Only applied to the data size in the partitioned branch below.
    float deserFactor = HiveConf.getFloatVar(conf, HiveConf.ConfVars.HIVE_STATS_DESERIALIZATION_FACTOR);
    if (!table.isPartitioned()) {
        long ds = getDataSize(conf, table);
        long nr = getNumRows(conf, schema, neededColumns, table, ds);
        stats.setNumRows(nr);
        List<ColStatistics> colStats = Lists.newArrayList();
        if (fetchColStats) {
            colStats = getTableColumnStats(table, schema, neededColumns);
            // Prefer the size derived from column stats unless it is < 1 or no stats came back.
            long betterDS = getDataSizeFromColumnStats(nr, colStats);
            ds = (betterDS < 1 || colStats.isEmpty()) ? ds : betterDS;
        }
        stats.setDataSize(ds);
        // infer if any column can be primary key based on column statistics
        inferAndSetPrimaryKey(stats.getNumRows(), colStats);
        stats.setColumnStatsState(deriveStatType(colStats, neededColumns));
        stats.addToColumnStats(colStats);
    } else if (partList != null) {
        // For partitioned tables, get the size of all the partitions after pruning
        // the partitions that are not required
        long nr = 0;
        long ds = 0;
        List<Long> rowCounts = Lists.newArrayList();
        List<Long> dataSizes = Lists.newArrayList();
        if (fetchPartStats) {
            rowCounts = getBasicStatForPartitions(table, partList.getNotDeniedPartns(), StatsSetupConst.ROW_COUNT);
            dataSizes = getBasicStatForPartitions(table, partList.getNotDeniedPartns(), StatsSetupConst.RAW_DATA_SIZE);
            nr = getSumIgnoreNegatives(rowCounts);
            ds = getSumIgnoreNegatives(dataSizes);
            // Raw data size unavailable: fall back to the TOTAL_SIZE basic stat.
            if (ds <= 0) {
                dataSizes = getBasicStatForPartitions(table, partList.getNotDeniedPartns(), StatsSetupConst.TOTAL_SIZE);
                ds = getSumIgnoreNegatives(dataSizes);
            }
        }
        // If the data size is still not positive, fall back to per-partition file sizes
        // obtained via getFileSizeForPartitions.
        if (ds <= 0) {
            dataSizes = getFileSizeForPartitions(conf, partList.getNotDeniedPartns());
        }
        ds = getSumIgnoreNegatives(dataSizes);
        ds = (long) (ds * deserFactor);
        int avgRowSize = estimateRowSizeFromSchema(conf, schema, neededColumns);
        if (avgRowSize > 0) {
            setUnknownRcDsToAverage(rowCounts, dataSizes, avgRowSize);
            nr = getSumIgnoreNegatives(rowCounts);
            // NOTE(review): ds is recomputed from dataSizes here, which replaces the
            // deserFactor-scaled value computed above — confirm this is intended.
            ds = getSumIgnoreNegatives(dataSizes);
            // number of rows -1 means that statistics from metastore is not reliable
            if (nr <= 0) {
                nr = ds / avgRowSize;
            }
        }
        // Never report zero rows; use 1 as the minimum.
        if (nr == 0) {
            nr = 1;
        }
        stats.addToNumRows(nr);
        stats.addToDataSize(ds);
        // if at least a partition does not contain row count then mark basic stats state as PARTIAL
        if (containsNonPositives(rowCounts) && stats.getBasicStatsState().equals(State.COMPLETE)) {
            stats.setBasicStatsState(State.PARTIAL);
        }
        if (fetchColStats) {
            List<String> partNames = new ArrayList<String>(partList.getNotDeniedPartns().size());
            for (Partition part : partList.getNotDeniedPartns()) {
                partNames.add(part.getName());
            }
            neededColumns = processNeededColumns(schema, neededColumns);
            AggrStats aggrStats = null;
            // Only query the metastore when there is at least one needed column and one
            // partition name; otherwise aggrStats stays null and the metastore call is skipped.
            if (neededColumns.size() > 0 && partNames.size() > 0) {
                aggrStats = Hive.get().getAggrColStatsFor(table.getDbName(), table.getTableName(), neededColumns, partNames);
            }
            if (null == aggrStats || null == aggrStats.getColStats() || aggrStats.getColStatsSize() == 0) {
                // There are some partitions with no state (or we didn't fetch any state).
                // Update the stats with empty list to reflect that in the
                // state/initialize structures.
                List<ColStatistics> emptyStats = Lists.newArrayList();
                // add partition column stats
                addParitionColumnStats(conf, neededColumns, referencedColumns, schema, table, partList, emptyStats);
                stats.addToColumnStats(emptyStats);
                // Added on top of the ds already accumulated into stats above.
                stats.addToDataSize(getDataSizeFromColumnStats(nr, emptyStats));
                stats.updateColumnStatsState(deriveStatType(emptyStats, referencedColumns));
            } else {
                List<ColumnStatisticsObj> colStats = aggrStats.getColStats();
                if (colStats.size() != neededColumns.size()) {
                    LOG.debug("Column stats requested for : " + neededColumns.size() + " columns. Able to" + " retrieve for " + colStats.size() + " columns");
                }
                List<ColStatistics> columnStats = convertColStats(colStats, table.getTableName());
                addParitionColumnStats(conf, neededColumns, referencedColumns, schema, table, partList, columnStats);
                // Prefer the size derived from column stats unless it is < 1 or the list is empty.
                long betterDS = getDataSizeFromColumnStats(nr, columnStats);
                stats.setDataSize((betterDS < 1 || columnStats.isEmpty()) ? ds : betterDS);
                // infer if any column can be primary key based on column statistics
                inferAndSetPrimaryKey(stats.getNumRows(), columnStats);
                stats.addToColumnStats(columnStats);
                State colState = deriveStatType(columnStats, referencedColumns);
                // Stats were found for only some of the requested partitions: downgrade to PARTIAL
                // unless the derived state is already NONE.
                if (aggrStats.getPartsFound() != partNames.size() && colState != State.NONE) {
                    LOG.debug("Column stats requested for : " + partNames.size() + " partitions. " + "Able to retrieve for " + aggrStats.getPartsFound() + " partitions");
                    colState = State.PARTIAL;
                }
                stats.setColumnStatsState(colState);
            }
        }
    }
    return stats;
}
Also used : Partition(org.apache.hadoop.hive.ql.metadata.Partition) AggrStats(org.apache.hadoop.hive.metastore.api.AggrStats) State(org.apache.hadoop.hive.ql.plan.Statistics.State) ColStatistics(org.apache.hadoop.hive.ql.plan.ColStatistics) PrunedPartitionList(org.apache.hadoop.hive.ql.parse.PrunedPartitionList) List(java.util.List) ArrayList(java.util.ArrayList) Statistics(org.apache.hadoop.hive.ql.plan.Statistics) ColStatistics(org.apache.hadoop.hive.ql.plan.ColStatistics)

Aggregations

ColStatistics (org.apache.hadoop.hive.ql.plan.ColStatistics)20 ArrayList (java.util.ArrayList)6 Statistics (org.apache.hadoop.hive.ql.plan.Statistics)4 HashSet (java.util.HashSet)3 HashMap (java.util.HashMap)2 List (java.util.List)2 AtomicInteger (java.util.concurrent.atomic.AtomicInteger)2 ImmutableBitSet (org.apache.calcite.util.ImmutableBitSet)2 AggrStats (org.apache.hadoop.hive.metastore.api.AggrStats)2 ColumnStatisticsData (org.apache.hadoop.hive.metastore.api.ColumnStatisticsData)2 ColumnInfo (org.apache.hadoop.hive.ql.exec.ColumnInfo)2 ReduceSinkOperator (org.apache.hadoop.hive.ql.exec.ReduceSinkOperator)2 HiveException (org.apache.hadoop.hive.ql.metadata.HiveException)2 Partition (org.apache.hadoop.hive.ql.metadata.Partition)2 ExprNodeDesc (org.apache.hadoop.hive.ql.plan.ExprNodeDesc)2 ImmutableList (com.google.common.collect.ImmutableList)1 ImmutableMap (com.google.common.collect.ImmutableMap)1 DataOutputStream (java.io.DataOutputStream)1 BigDecimal (java.math.BigDecimal)1 BigInteger (java.math.BigInteger)1