Search in sources :

Example 1 with State

use of org.apache.hadoop.hive.ql.plan.Statistics.State in project hive by apache.

In class StatsUtils, method deriveStatType.

/**
 * Classifies how complete a list of column statistics is relative to the
 * columns that were asked for.
 *
 * <p>An entry counts as "missing" when it is {@code null} or reports
 * {@code isEstimated()}; the whole list counts as having a gap when it is
 * {@code null} or shorter than {@code neededColumns}.
 *
 * @param colStats      column statistics to inspect; may be {@code null}
 * @param neededColumns columns for which statistics were requested
 * @return {@code COMPLETE} when at least one real entry exists and nothing is
 *         missing, {@code PARTIAL} when real and missing entries are mixed,
 *         and otherwise {@code COMPLETE} if no columns were needed or
 *         {@code NONE} if some were
 */
private static Statistics.State deriveStatType(List<ColStatistics> colStats, List<String> neededColumns) {
    // A missing list, or one shorter than the request, is already a gap.
    boolean sawGap = colStats == null || colStats.size() < neededColumns.size();
    boolean sawReal = false;
    if (colStats != null) {
        for (ColStatistics entry : colStats) {
            if (entry == null || entry.isEstimated()) {
                sawGap = true;
            } else {
                sawReal = true;
            }
            // Once both kinds have been seen the answer is fixed; stop scanning.
            if (sawGap && sawReal) {
                break;
            }
        }
    }
    if (sawReal) {
        return sawGap ? Statistics.State.PARTIAL : Statistics.State.COMPLETE;
    }
    // No real statistics at all: that is only "complete" if nothing was needed.
    return neededColumns.isEmpty() ? Statistics.State.COMPLETE : Statistics.State.NONE;
}
Also used : State(org.apache.hadoop.hive.ql.plan.Statistics.State) ColStatistics(org.apache.hadoop.hive.ql.plan.ColStatistics)

Example 2 with State

use of org.apache.hadoop.hive.ql.plan.Statistics.State in project hive by apache.

In class StatsUtils, method collectStatistics.

/**
 * Builds a {@link Statistics} object (row count, data size and, optionally,
 * column statistics) for a table.
 *
 * <p>Two paths exist: one for unpartitioned tables, and one for partitioned
 * tables when a pruned partition list is supplied. If the table is partitioned
 * and {@code partList} is {@code null}, the freshly constructed
 * {@code Statistics} is returned without further changes.
 *
 * @param conf              configuration; read here for
 *                          {@code HIVE_STATS_DESERIALIZATION_FACTOR} and passed to helpers
 * @param partList          pruned partitions; only consulted when the table is partitioned
 * @param table             table whose statistics are collected
 * @param schema            column schema, passed through to the estimation helpers
 * @param neededColumns     columns for which column statistics are fetched
 * @param referencedColumns columns used when deriving the column-stats state and adding
 *                          partition column stats in the partitioned path
 * @param fetchColStats     whether column statistics should be fetched
 * @param fetchPartStats    whether per-partition basic stats (row count, raw data size,
 *                          total size) are read; when {@code false} the data size comes
 *                          from {@code getFileSizeForPartitions}
 * @return the populated statistics object
 * @throws HiveException declared by the signature; presumably raised by the helper or
 *                       metastore calls (the throwing call is not visible here)
 */
public static Statistics collectStatistics(HiveConf conf, PrunedPartitionList partList, Table table, List<ColumnInfo> schema, List<String> neededColumns, List<String> referencedColumns, boolean fetchColStats, boolean fetchPartStats) throws HiveException {
    Statistics stats = new Statistics();
    // Scaling factor applied to the summed data size in the partitioned path below.
    float deserFactor = HiveConf.getFloatVar(conf, HiveConf.ConfVars.HIVE_STATS_DESERIALIZATION_FACTOR);
    if (!table.isPartitioned()) {
        long ds = getDataSize(conf, table);
        long nr = getNumRows(conf, schema, neededColumns, table, ds);
        stats.setNumRows(nr);
        List<ColStatistics> colStats = Lists.newArrayList();
        if (fetchColStats) {
            colStats = getTableColumnStats(table, schema, neededColumns);
            long betterDS = getDataSizeFromColumnStats(nr, colStats);
            // Prefer the size derived from column stats unless it is < 1 or no stats came back.
            ds = (betterDS < 1 || colStats.isEmpty()) ? ds : betterDS;
        }
        stats.setDataSize(ds);
        // infer if any column can be primary key based on column statistics
        inferAndSetPrimaryKey(stats.getNumRows(), colStats);
        stats.setColumnStatsState(deriveStatType(colStats, neededColumns));
        stats.addToColumnStats(colStats);
    } else if (partList != null) {
        // For partitioned tables, get the size of all the partitions after pruning
        // the partitions that are not required
        long nr = 0;
        long ds = 0;
        List<Long> rowCounts = Lists.newArrayList();
        List<Long> dataSizes = Lists.newArrayList();
        if (fetchPartStats) {
            rowCounts = getBasicStatForPartitions(table, partList.getNotDeniedPartns(), StatsSetupConst.ROW_COUNT);
            dataSizes = getBasicStatForPartitions(table, partList.getNotDeniedPartns(), StatsSetupConst.RAW_DATA_SIZE);
            nr = getSumIgnoreNegatives(rowCounts);
            ds = getSumIgnoreNegatives(dataSizes);
            // Raw data size unavailable: fall back to the TOTAL_SIZE basic stat.
            if (ds <= 0) {
                dataSizes = getBasicStatForPartitions(table, partList.getNotDeniedPartns(), StatsSetupConst.TOTAL_SIZE);
                ds = getSumIgnoreNegatives(dataSizes);
            }
        }
        // if the data size still could not be determined (or partition stats were not
        // fetched), fall back to getFileSizeForPartitions for the per-partition sizes
        if (ds <= 0) {
            dataSizes = getFileSizeForPartitions(conf, partList.getNotDeniedPartns());
        }
        ds = getSumIgnoreNegatives(dataSizes);
        ds = (long) (ds * deserFactor);
        // NOTE(review): when avgRowSize > 0 below, ds is recomputed from dataSizes, which
        // overwrites the deserFactor-scaled value assigned above - confirm this is intended.
        int avgRowSize = estimateRowSizeFromSchema(conf, schema, neededColumns);
        if (avgRowSize > 0) {
            setUnknownRcDsToAverage(rowCounts, dataSizes, avgRowSize);
            nr = getSumIgnoreNegatives(rowCounts);
            ds = getSumIgnoreNegatives(dataSizes);
            // number of rows -1 means that statistics from metastore is not reliable
            if (nr <= 0) {
                nr = ds / avgRowSize;
            }
        }
        // Never report zero rows; use 1 as the floor.
        if (nr == 0) {
            nr = 1;
        }
        stats.addToNumRows(nr);
        stats.addToDataSize(ds);
        // if at least a partition does not contain row count then mark basic stats state as PARTIAL
        if (containsNonPositives(rowCounts) && stats.getBasicStatsState().equals(State.COMPLETE)) {
            stats.setBasicStatsState(State.PARTIAL);
        }
        if (fetchColStats) {
            List<String> partNames = new ArrayList<String>(partList.getNotDeniedPartns().size());
            for (Partition part : partList.getNotDeniedPartns()) {
                partNames.add(part.getName());
            }
            // From here on neededColumns refers to the list returned by processNeededColumns.
            neededColumns = processNeededColumns(schema, neededColumns);
            AggrStats aggrStats = null;
            // Only query the metastore when there is at least one column and one partition
            // to ask about; otherwise skip the step to connect to the metastore.
            if (neededColumns.size() > 0 && partNames.size() > 0) {
                aggrStats = Hive.get().getAggrColStatsFor(table.getDbName(), table.getTableName(), neededColumns, partNames);
            }
            if (null == aggrStats || null == aggrStats.getColStats() || aggrStats.getColStatsSize() == 0) {
                // There are some partitions with no state (or we didn't fetch any state).
                // Update the stats with empty list to reflect that in the
                // state/initialize structures.
                List<ColStatistics> emptyStats = Lists.newArrayList();
                // add partition column stats
                addParitionColumnStats(conf, neededColumns, referencedColumns, schema, table, partList, emptyStats);
                stats.addToColumnStats(emptyStats);
                stats.addToDataSize(getDataSizeFromColumnStats(nr, emptyStats));
                stats.updateColumnStatsState(deriveStatType(emptyStats, referencedColumns));
            } else {
                List<ColumnStatisticsObj> colStats = aggrStats.getColStats();
                if (colStats.size() != neededColumns.size()) {
                    LOG.debug("Column stats requested for : " + neededColumns.size() + " columns. Able to" + " retrieve for " + colStats.size() + " columns");
                }
                List<ColStatistics> columnStats = convertColStats(colStats, table.getTableName());
                addParitionColumnStats(conf, neededColumns, referencedColumns, schema, table, partList, columnStats);
                long betterDS = getDataSizeFromColumnStats(nr, columnStats);
                // Same preference rule as the unpartitioned path: keep ds unless the
                // column-stats-derived size is usable.
                stats.setDataSize((betterDS < 1 || columnStats.isEmpty()) ? ds : betterDS);
                // infer if any column can be primary key based on column statistics
                inferAndSetPrimaryKey(stats.getNumRows(), columnStats);
                stats.addToColumnStats(columnStats);
                State colState = deriveStatType(columnStats, referencedColumns);
                // Stats were found for only some of the requested partitions: downgrade any
                // non-NONE state to PARTIAL.
                if (aggrStats.getPartsFound() != partNames.size() && colState != State.NONE) {
                    LOG.debug("Column stats requested for : " + partNames.size() + " partitions. " + "Able to retrieve for " + aggrStats.getPartsFound() + " partitions");
                    colState = State.PARTIAL;
                }
                stats.setColumnStatsState(colState);
            }
        }
    }
    return stats;
}
Also used : Partition(org.apache.hadoop.hive.ql.metadata.Partition) AggrStats(org.apache.hadoop.hive.metastore.api.AggrStats) State(org.apache.hadoop.hive.ql.plan.Statistics.State) ColStatistics(org.apache.hadoop.hive.ql.plan.ColStatistics) PrunedPartitionList(org.apache.hadoop.hive.ql.parse.PrunedPartitionList) List(java.util.List) ArrayList(java.util.ArrayList) Statistics(org.apache.hadoop.hive.ql.plan.Statistics)

Aggregations

ColStatistics (org.apache.hadoop.hive.ql.plan.ColStatistics)2 State (org.apache.hadoop.hive.ql.plan.Statistics.State)2 ArrayList (java.util.ArrayList)1 List (java.util.List)1 AggrStats (org.apache.hadoop.hive.metastore.api.AggrStats)1 Partition (org.apache.hadoop.hive.ql.metadata.Partition)1 PrunedPartitionList (org.apache.hadoop.hive.ql.parse.PrunedPartitionList)1 Statistics (org.apache.hadoop.hive.ql.plan.Statistics)1