Search in sources :

Example 31 with AggrStats

use of org.apache.hadoop.hive.metastore.api.AggrStats in project hive by apache.

the class StatsUtils method collectStatistics.

/**
 * Builds a {@link Statistics} object (row count, data size and, optionally,
 * column statistics) for a table, or for the pruned partitions of a
 * partitioned table.
 *
 * <p>Non-partitioned table: data size and row count come from
 * {@code getDataSize} / {@code getNumRows}. When {@code fetchColStats} is set,
 * the data size derived from the column statistics replaces the basic one,
 * provided it is at least 1 and column statistics were found.
 *
 * <p>Partitioned table with a non-null {@code partList}: per-partition row
 * counts and data sizes are gathered (only when {@code fetchPartStats} is set),
 * with fallbacks for a non-positive data size, and then summed. When
 * {@code fetchColStats} is set, aggregated column statistics are requested
 * through {@code Hive.get().getAggrColStatsFor} and merged into the result.
 *
 * <p>Partitioned table with a {@code null} {@code partList}: the freshly
 * constructed {@code Statistics} is returned untouched.
 *
 * @param conf              configuration; supplies
 *                          {@code HIVE_STATS_DESERIALIZATION_FACTOR} and is passed to the helpers
 * @param partList          pruned partition list; only consulted for partitioned tables, may be null
 * @param table             table whose statistics are collected
 * @param schema            column schema passed to the row-size / column-stats helpers
 * @param neededColumns     columns for which column statistics are requested
 * @param referencedColumns columns used when deriving the column-stats state in the partitioned path
 * @param fetchColStats     whether column statistics should be fetched
 * @param fetchPartStats    whether per-partition basic statistics should be read
 * @return the populated statistics object (never null)
 * @throws HiveException declared by this method; presumably raised by the metastore-backed helpers
 */
public static Statistics collectStatistics(HiveConf conf, PrunedPartitionList partList, Table table, List<ColumnInfo> schema, List<String> neededColumns, List<String> referencedColumns, boolean fetchColStats, boolean fetchPartStats) throws HiveException {
    Statistics stats = new Statistics();
    // Only used in the partitioned path below.
    float deserFactor = HiveConf.getFloatVar(conf, HiveConf.ConfVars.HIVE_STATS_DESERIALIZATION_FACTOR);
    if (!table.isPartitioned()) {
        long ds = getDataSize(conf, table);
        long nr = getNumRows(conf, schema, neededColumns, table, ds);
        stats.setNumRows(nr);
        List<ColStatistics> colStats = Lists.newArrayList();
        if (fetchColStats) {
            colStats = getTableColumnStats(table, schema, neededColumns);
            // Prefer the data size derived from column statistics, but keep the
            // basic data size when that estimate is below 1 or no column stats exist.
            long betterDS = getDataSizeFromColumnStats(nr, colStats);
            ds = (betterDS < 1 || colStats.isEmpty()) ? ds : betterDS;
        }
        stats.setDataSize(ds);
        // infer if any column can be primary key based on column statistics
        inferAndSetPrimaryKey(stats.getNumRows(), colStats);
        stats.setColumnStatsState(deriveStatType(colStats, neededColumns));
        stats.addToColumnStats(colStats);
    } else if (partList != null) {
        // For partitioned tables, get the size of all the partitions after pruning
        // the partitions that are not required
        long nr = 0;
        long ds = 0;
        // Stay empty (so nr and ds stay 0 here) when fetchPartStats is false.
        List<Long> rowCounts = Lists.newArrayList();
        List<Long> dataSizes = Lists.newArrayList();
        if (fetchPartStats) {
            rowCounts = getBasicStatForPartitions(table, partList.getNotDeniedPartns(), StatsSetupConst.ROW_COUNT);
            dataSizes = getBasicStatForPartitions(table, partList.getNotDeniedPartns(), StatsSetupConst.RAW_DATA_SIZE);
            nr = getSumIgnoreNegatives(rowCounts);
            ds = getSumIgnoreNegatives(dataSizes);
            // First fallback: no positive RAW_DATA_SIZE total, so use TOTAL_SIZE instead.
            if (ds <= 0) {
                dataSizes = getBasicStatForPartitions(table, partList.getNotDeniedPartns(), StatsSetupConst.TOTAL_SIZE);
                ds = getSumIgnoreNegatives(dataSizes);
            }
        }
        // Second fallback: if the data size still could not be determined, ask
        // getFileSizeForPartitions for the per-partition sizes
        // (presumably read from the file system; confirm against the helper).
        if (ds <= 0) {
            dataSizes = getFileSizeForPartitions(conf, partList.getNotDeniedPartns());
        }
        ds = getSumIgnoreNegatives(dataSizes);
        ds = (long) (ds * deserFactor);
        int avgRowSize = estimateRowSizeFromSchema(conf, schema, neededColumns);
        if (avgRowSize > 0) {
            // NOTE(review): ds is re-summed from dataSizes below, which discards the
            // deserFactor scaling applied just above whenever avgRowSize > 0.
            // Confirm whether the factor is meant to survive on this path.
            setUnknownRcDsToAverage(rowCounts, dataSizes, avgRowSize);
            nr = getSumIgnoreNegatives(rowCounts);
            ds = getSumIgnoreNegatives(dataSizes);
            // number of rows -1 means that statistics from metastore is not reliable
            if (nr <= 0) {
                nr = ds / avgRowSize;
            }
        }
        // Never report zero rows; use 1 as the minimum.
        if (nr == 0) {
            nr = 1;
        }
        stats.addToNumRows(nr);
        stats.addToDataSize(ds);
        // if at least a partition does not contain row count then mark basic stats state as PARTIAL
        if (containsNonPositives(rowCounts) && stats.getBasicStatsState().equals(State.COMPLETE)) {
            stats.setBasicStatsState(State.PARTIAL);
        }
        if (fetchColStats) {
            List<String> partNames = new ArrayList<String>(partList.getNotDeniedPartns().size());
            for (Partition part : partList.getNotDeniedPartns()) {
                partNames.add(part.getName());
            }
            // The parameter is reassigned: from here on neededColumns is the processed list.
            neededColumns = processNeededColumns(schema, neededColumns);
            AggrStats aggrStats = null;
            // Only query aggregated column stats when there is at least one column and
            // one partition to ask about; otherwise skip the getAggrColStatsFor call
            // and leave aggrStats null.
            if (neededColumns.size() > 0 && partNames.size() > 0) {
                aggrStats = Hive.get().getAggrColStatsFor(table.getDbName(), table.getTableName(), neededColumns, partNames);
            }
            if (null == aggrStats || null == aggrStats.getColStats() || aggrStats.getColStatsSize() == 0) {
                // There are some partitions with no state (or we didn't fetch any state).
                // Update the stats with empty list to reflect that in the
                // state/initialize structures.
                List<ColStatistics> emptyStats = Lists.newArrayList();
                // add partition column stats
                addParitionColumnStats(conf, neededColumns, referencedColumns, schema, table, partList, emptyStats);
                stats.addToColumnStats(emptyStats);
                // Added on top of the ds already added above (addToDataSize, not setDataSize).
                stats.addToDataSize(getDataSizeFromColumnStats(nr, emptyStats));
                stats.updateColumnStatsState(deriveStatType(emptyStats, referencedColumns));
            } else {
                List<ColumnStatisticsObj> colStats = aggrStats.getColStats();
                if (colStats.size() != neededColumns.size()) {
                    LOG.debug("Column stats requested for : " + neededColumns.size() + " columns. Able to" + " retrieve for " + colStats.size() + " columns");
                }
                List<ColStatistics> columnStats = convertColStats(colStats, table.getTableName());
                addParitionColumnStats(conf, neededColumns, referencedColumns, schema, table, partList, columnStats);
                // Replace (not add to) the data size with the column-stats estimate when
                // it is at least 1 and column stats are present; otherwise keep ds.
                long betterDS = getDataSizeFromColumnStats(nr, columnStats);
                stats.setDataSize((betterDS < 1 || columnStats.isEmpty()) ? ds : betterDS);
                // infer if any column can be primary key based on column statistics
                inferAndSetPrimaryKey(stats.getNumRows(), columnStats);
                stats.addToColumnStats(columnStats);
                State colState = deriveStatType(columnStats, referencedColumns);
                // Stats were found for only some of the requested partitions:
                // downgrade any non-NONE state to PARTIAL.
                if (aggrStats.getPartsFound() != partNames.size() && colState != State.NONE) {
                    LOG.debug("Column stats requested for : " + partNames.size() + " partitions. " + "Able to retrieve for " + aggrStats.getPartsFound() + " partitions");
                    colState = State.PARTIAL;
                }
                stats.setColumnStatsState(colState);
            }
        }
    }
    return stats;
}
Also used : Partition(org.apache.hadoop.hive.ql.metadata.Partition) AggrStats(org.apache.hadoop.hive.metastore.api.AggrStats) State(org.apache.hadoop.hive.ql.plan.Statistics.State) ColStatistics(org.apache.hadoop.hive.ql.plan.ColStatistics) PrunedPartitionList(org.apache.hadoop.hive.ql.parse.PrunedPartitionList) List(java.util.List) ArrayList(java.util.ArrayList) Statistics(org.apache.hadoop.hive.ql.plan.Statistics)

Example 32 with AggrStats

use of org.apache.hadoop.hive.metastore.api.AggrStats in project hive by apache.

the class TestHiveMetaStore method testStatsFastTrivial.

/**
 * Tests that in the absence of stats for partitions, and/or absence of columns
 * to get stats for, the metastore does not break. See HIVE-12083 for motivation.
 *
 * <p>Every combination of empty / non-empty column and partition name lists is
 * passed to {@code client.getAggrColStatsFor}; each call is expected to return a
 * non-null {@code AggrStats} with zero parts found and an empty, non-null
 * column-stats list.
 *
 * <p>The emptiness checks use JUnit assertions rather than the Java
 * {@code assert} keyword, so they still run when the JVM is started without
 * {@code -ea}.
 *
 * @throws Throwable if any metastore call or assertion fails
 */
public void testStatsFastTrivial() throws Throwable {
    String dbName = "tstatsfast";
    String tblName = "t1";
    String typeName = "Person";
    cleanUp(dbName, tblName, typeName);
    // Four partitions, none of which has column statistics populated.
    List<List<String>> values = new ArrayList<List<String>>();
    values.add(makeVals("2008-07-01 14:13:12", "14"));
    values.add(makeVals("2008-07-01 14:13:12", "15"));
    values.add(makeVals("2008-07-02 14:13:12", "15"));
    values.add(makeVals("2008-07-03 14:13:12", "151"));
    createMultiPartitionTableSchema(dbName, tblName, typeName, values);
    List<String> emptyColNames = new ArrayList<String>();
    List<String> emptyPartNames = new ArrayList<String>();
    List<String> colNames = new ArrayList<String>();
    colNames.add("name");
    colNames.add("income");
    List<String> partNames = client.listPartitionNames(dbName, tblName, (short) -1);
    // Sanity-check the fixtures before exercising the API.
    assertEquals(0, emptyColNames.size());
    assertEquals(0, emptyPartNames.size());
    assertEquals(2, colNames.size());
    assertEquals(4, partNames.size());
    // Test for both colNames and partNames being empty:
    AggrStats aggrStatsEmpty = client.getAggrColStatsFor(dbName, tblName, emptyColNames, emptyPartNames);
    // short-circuited on client-side, verifying that it's an empty object, not null
    assertNotNull(aggrStatsEmpty);
    assertEquals(0, aggrStatsEmpty.getPartsFound());
    assertNotNull(aggrStatsEmpty.getColStats());
    assertEquals(0, aggrStatsEmpty.getColStats().size());
    // Test for only colNames being empty
    AggrStats aggrStatsOnlyParts = client.getAggrColStatsFor(dbName, tblName, emptyColNames, partNames);
    // short-circuited on client-side, verifying that it's an empty object, not null
    assertNotNull(aggrStatsOnlyParts);
    assertEquals(0, aggrStatsOnlyParts.getPartsFound());
    assertNotNull(aggrStatsOnlyParts.getColStats());
    assertEquals(0, aggrStatsOnlyParts.getColStats().size());
    // Test for only partNames being empty
    AggrStats aggrStatsOnlyCols = client.getAggrColStatsFor(dbName, tblName, colNames, emptyPartNames);
    // short-circuited on client-side, verifying that it's an empty object, not null
    assertNotNull(aggrStatsOnlyCols);
    assertEquals(0, aggrStatsOnlyCols.getPartsFound());
    assertNotNull(aggrStatsOnlyCols.getColStats());
    assertEquals(0, aggrStatsOnlyCols.getColStats().size());
    // Test for valid values for both.
    AggrStats aggrStatsFull = client.getAggrColStatsFor(dbName, tblName, colNames, partNames);
    assertNotNull(aggrStatsFull);
    // would still be empty, because no stats are actually populated.
    assertEquals(0, aggrStatsFull.getPartsFound());
    assertNotNull(aggrStatsFull.getColStats());
    assertEquals(0, aggrStatsFull.getColStats().size());
}
Also used : AggrStats(org.apache.hadoop.hive.metastore.api.AggrStats) ArrayList(java.util.ArrayList) List(java.util.List)

Aggregations

AggrStats (org.apache.hadoop.hive.metastore.api.AggrStats)32 ArrayList (java.util.ArrayList)31 ColumnStatisticsObj (org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj)27 FieldSchema (org.apache.hadoop.hive.metastore.api.FieldSchema)27 SerDeInfo (org.apache.hadoop.hive.metastore.api.SerDeInfo)26 StorageDescriptor (org.apache.hadoop.hive.metastore.api.StorageDescriptor)26 Table (org.apache.hadoop.hive.metastore.api.Table)26 Test (org.junit.Test)26 ColumnStatisticsData (org.apache.hadoop.hive.metastore.api.ColumnStatisticsData)25 Partition (org.apache.hadoop.hive.metastore.api.Partition)25 ColumnStatistics (org.apache.hadoop.hive.metastore.api.ColumnStatistics)24 ColumnStatisticsDesc (org.apache.hadoop.hive.metastore.api.ColumnStatisticsDesc)24 List (java.util.List)19 LongColumnStatsData (org.apache.hadoop.hive.metastore.api.LongColumnStatsData)11 BooleanColumnStatsData (org.apache.hadoop.hive.metastore.api.BooleanColumnStatsData)5 DoubleColumnStatsData (org.apache.hadoop.hive.metastore.api.DoubleColumnStatsData)5 IOException (java.io.IOException)2 DecimalColumnStatsData (org.apache.hadoop.hive.metastore.api.DecimalColumnStatsData)2 StringColumnStatsData (org.apache.hadoop.hive.metastore.api.StringColumnStatsData)2 Partition (org.apache.hadoop.hive.ql.metadata.Partition)2