Search in sources :

Example 76 with ColumnStatistics

use of org.apache.hadoop.hive.metastore.api.ColumnStatistics in project hive by apache.

the class TestHBaseStore method stringTableStatistics.

@Test
public void stringTableStatistics() throws Exception {
    createMockTable(STRING_TYPE);
    // Add a string table stats for STRING_COL to DB
    // Because of the way our mock implementation works we actually need to not create the table
    // before we set statistics on it.
    ColumnStatistics stats = new ColumnStatistics();
    // Get a default ColumnStatisticsDesc for table level stats
    ColumnStatisticsDesc desc = getMockTblColStatsDesc();
    stats.setStatsDesc(desc);
    // Get one of the pre-created ColumnStatisticsObj
    ColumnStatisticsObj obj = stringColStatsObjs.get(0);
    StringColumnStatsData stringData = obj.getStatsData().getStringStats();
    // Add to DB
    stats.addToStatsObj(obj);
    store.updateTableColumnStatistics(stats);
    // Get from DB
    ColumnStatistics statsFromDB = store.getTableColumnStatistics(DB, TBL, Arrays.asList(STRING_COL));
    // Compare ColumnStatisticsDesc
    Assert.assertEquals(desc.getLastAnalyzed(), statsFromDB.getStatsDesc().getLastAnalyzed());
    Assert.assertEquals(DB, statsFromDB.getStatsDesc().getDbName());
    Assert.assertEquals(TBL, statsFromDB.getStatsDesc().getTableName());
    Assert.assertTrue(statsFromDB.getStatsDesc().isIsTblLevel());
    // Compare ColumnStatisticsObj
    Assert.assertEquals(1, statsFromDB.getStatsObjSize());
    ColumnStatisticsObj objFromDB = statsFromDB.getStatsObj().get(0);
    ColumnStatisticsData dataFromDB = objFromDB.getStatsData();
    // Compare ColumnStatisticsData
    Assert.assertEquals(ColumnStatisticsData._Fields.STRING_STATS, dataFromDB.getSetField());
    // Compare StringColumnStatsData
    StringColumnStatsData stringDataFromDB = dataFromDB.getStringStats();
    Assert.assertEquals(stringData.getMaxColLen(), stringDataFromDB.getMaxColLen());
    Assert.assertEquals(stringData.getAvgColLen(), stringDataFromDB.getAvgColLen(), 0.01);
    Assert.assertEquals(stringData.getNumNulls(), stringDataFromDB.getNumNulls());
    Assert.assertEquals(stringData.getNumDVs(), stringDataFromDB.getNumDVs());
}
Also used : ColumnStatistics(org.apache.hadoop.hive.metastore.api.ColumnStatistics) ColumnStatisticsObj(org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj) ColumnStatisticsDesc(org.apache.hadoop.hive.metastore.api.ColumnStatisticsDesc) StringColumnStatsData(org.apache.hadoop.hive.metastore.api.StringColumnStatsData) ColumnStatisticsData(org.apache.hadoop.hive.metastore.api.ColumnStatisticsData) Test(org.junit.Test)

Example 77 with ColumnStatistics

use of org.apache.hadoop.hive.metastore.api.ColumnStatistics in project hive by apache.

the class TestHBaseStore method doubleTableStatistics.

@Test
public void doubleTableStatistics() throws Exception {
    createMockTable(DOUBLE_TYPE);
    // Add a double table stats for DOUBLE_COL to DB
    // Because of the way our mock implementation works we actually need to not create the table
    // before we set statistics on it.
    ColumnStatistics stats = new ColumnStatistics();
    // Get a default ColumnStatisticsDesc for table level stats
    ColumnStatisticsDesc desc = getMockTblColStatsDesc();
    stats.setStatsDesc(desc);
    // Get one of the pre-created ColumnStatisticsObj
    ColumnStatisticsObj obj = doubleColStatsObjs.get(0);
    DoubleColumnStatsData doubleData = obj.getStatsData().getDoubleStats();
    // Add to DB
    stats.addToStatsObj(obj);
    store.updateTableColumnStatistics(stats);
    // Get from DB
    ColumnStatistics statsFromDB = store.getTableColumnStatistics(DB, TBL, Arrays.asList(DOUBLE_COL));
    // Compare ColumnStatisticsDesc
    Assert.assertEquals(desc.getLastAnalyzed(), statsFromDB.getStatsDesc().getLastAnalyzed());
    Assert.assertEquals(DB, statsFromDB.getStatsDesc().getDbName());
    Assert.assertEquals(TBL, statsFromDB.getStatsDesc().getTableName());
    Assert.assertTrue(statsFromDB.getStatsDesc().isIsTblLevel());
    // Compare ColumnStatisticsObj
    Assert.assertEquals(1, statsFromDB.getStatsObjSize());
    ColumnStatisticsObj objFromDB = statsFromDB.getStatsObj().get(0);
    ColumnStatisticsData dataFromDB = objFromDB.getStatsData();
    // Compare ColumnStatisticsData
    Assert.assertEquals(ColumnStatisticsData._Fields.DOUBLE_STATS, dataFromDB.getSetField());
    // Compare DoubleColumnStatsData
    DoubleColumnStatsData doubleDataFromDB = dataFromDB.getDoubleStats();
    Assert.assertEquals(doubleData.getHighValue(), doubleDataFromDB.getHighValue(), 0.01);
    Assert.assertEquals(doubleData.getLowValue(), doubleDataFromDB.getLowValue(), 0.01);
    Assert.assertEquals(doubleData.getNumNulls(), doubleDataFromDB.getNumNulls());
    Assert.assertEquals(doubleData.getNumDVs(), doubleDataFromDB.getNumDVs());
}
Also used : ColumnStatistics(org.apache.hadoop.hive.metastore.api.ColumnStatistics) ColumnStatisticsObj(org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj) DoubleColumnStatsData(org.apache.hadoop.hive.metastore.api.DoubleColumnStatsData) ColumnStatisticsDesc(org.apache.hadoop.hive.metastore.api.ColumnStatisticsDesc) ColumnStatisticsData(org.apache.hadoop.hive.metastore.api.ColumnStatisticsData) Test(org.junit.Test)

Example 78 with ColumnStatistics

use of org.apache.hadoop.hive.metastore.api.ColumnStatistics in project hive by apache.

the class SessionHiveMetaStoreClient method setPartitionColumnStatistics.

/**
 * {@inheritDoc}
 */
@Override
public boolean setPartitionColumnStatistics(SetPartitionsStatsRequest request) throws NoSuchObjectException, InvalidObjectException, MetaException, TException, InvalidInputException {
    if (request.getColStatsSize() == 1) {
        ColumnStatistics colStats = request.getColStatsIterator().next();
        ColumnStatisticsDesc desc = colStats.getStatsDesc();
        String dbName = desc.getDbName().toLowerCase();
        String tableName = desc.getTableName().toLowerCase();
        if (getTempTable(dbName, tableName) != null) {
            return updateTempTableColumnStats(dbName, tableName, colStats);
        }
    }
    return super.setPartitionColumnStatistics(request);
}
Also used : ColumnStatistics(org.apache.hadoop.hive.metastore.api.ColumnStatistics) ColumnStatisticsDesc(org.apache.hadoop.hive.metastore.api.ColumnStatisticsDesc)

Example 79 with ColumnStatistics

use of org.apache.hadoop.hive.metastore.api.ColumnStatistics in project hive by apache.

the class ColStatsProcessor method constructColumnStatsFromPackedRows.

private List<ColumnStatistics> constructColumnStatsFromPackedRows(Table tbl1) throws HiveException, MetaException, IOException {
    Table tbl = tbl1;
    String partName = null;
    List<String> colName = colStatDesc.getColName();
    List<String> colType = colStatDesc.getColType();
    boolean isTblLevel = colStatDesc.isTblLevel();
    List<ColumnStatistics> stats = new ArrayList<ColumnStatistics>();
    InspectableObject packedRow;
    while ((packedRow = ftOp.getNextRow()) != null) {
        if (packedRow.oi.getCategory() != ObjectInspector.Category.STRUCT) {
            throw new HiveException("Unexpected object type encountered while unpacking row");
        }
        List<ColumnStatisticsObj> statsObjs = new ArrayList<ColumnStatisticsObj>();
        StructObjectInspector soi = (StructObjectInspector) packedRow.oi;
        List<? extends StructField> fields = soi.getAllStructFieldRefs();
        List<Object> list = soi.getStructFieldsDataAsList(packedRow.o);
        List<FieldSchema> partColSchema = tbl.getPartCols();
        // Partition columns are appended at end, we only care about stats column
        int numOfStatCols = isTblLevel ? fields.size() : fields.size() - partColSchema.size();
        assert list != null;
        for (int i = 0; i < numOfStatCols; i++) {
            StructField structField = fields.get(i);
            String columnName = colName.get(i);
            String columnType = colType.get(i);
            Object values = list.get(i);
            try {
                ColumnStatisticsObj statObj = ColumnStatisticsObjTranslator.readHiveStruct(columnName, columnType, structField, values);
                statsObjs.add(statObj);
            } catch (Exception e) {
                if (isStatsReliable) {
                    throw new HiveException("Statistics collection failed while (hive.stats.reliable)", e);
                } else {
                    LOG.debug("Because {} is infinite or NaN, we skip stats.", columnName, e);
                }
            }
        }
        if (!statsObjs.isEmpty()) {
            if (!isTblLevel) {
                List<String> partVals = new ArrayList<String>();
                // Iterate over partition columns to figure out partition name
                for (int i = fields.size() - partColSchema.size(); i < fields.size(); i++) {
                    Object partVal = ((PrimitiveObjectInspector) fields.get(i).getFieldObjectInspector()).getPrimitiveJavaObject(list.get(i));
                    partVals.add(// could be null for default partition
                    partVal == null ? this.conf.getVar(ConfVars.DEFAULTPARTITIONNAME) : partVal.toString());
                }
                partName = Warehouse.makePartName(partColSchema, partVals);
            }
            ColumnStatisticsDesc statsDesc = buildColumnStatsDesc(tbl, partName, isTblLevel);
            ColumnStatistics colStats = new ColumnStatistics();
            colStats.setStatsDesc(statsDesc);
            colStats.setStatsObj(statsObjs);
            stats.add(colStats);
        }
    }
    ftOp.clearFetchContext();
    return stats;
}
Also used : ColumnStatistics(org.apache.hadoop.hive.metastore.api.ColumnStatistics) Table(org.apache.hadoop.hive.ql.metadata.Table) HiveException(org.apache.hadoop.hive.ql.metadata.HiveException) FieldSchema(org.apache.hadoop.hive.metastore.api.FieldSchema) ArrayList(java.util.ArrayList) MetaException(org.apache.hadoop.hive.metastore.api.MetaException) IOException(java.io.IOException) HiveException(org.apache.hadoop.hive.ql.metadata.HiveException) InspectableObject(org.apache.hadoop.hive.serde2.objectinspector.InspectableObject) ColumnStatisticsObj(org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj) StructField(org.apache.hadoop.hive.serde2.objectinspector.StructField) ColumnStatisticsDesc(org.apache.hadoop.hive.metastore.api.ColumnStatisticsDesc) InspectableObject(org.apache.hadoop.hive.serde2.objectinspector.InspectableObject) PrimitiveObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector) StructObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector)

Example 80 with ColumnStatistics

use of org.apache.hadoop.hive.metastore.api.ColumnStatistics in project hive by apache.

the class MetaStoreUtils method aggrPartitionStats.

// Given a list of partStats, this function will give you an aggr stats
public static List<ColumnStatisticsObj> aggrPartitionStats(List<ColumnStatistics> partStats, String dbName, String tableName, List<String> partNames, List<String> colNames, boolean areAllPartsFound, boolean useDensityFunctionForNDVEstimation, double ndvTuner) throws MetaException {
    Map<ColumnStatsAggregator, List<ColStatsObjWithSourceInfo>> colStatsMap = new HashMap<ColumnStatsAggregator, List<ColStatsObjWithSourceInfo>>();
    // Group stats by colName for each partition
    Map<String, ColumnStatsAggregator> aliasToAggregator = new HashMap<String, ColumnStatsAggregator>();
    for (ColumnStatistics css : partStats) {
        List<ColumnStatisticsObj> objs = css.getStatsObj();
        for (ColumnStatisticsObj obj : objs) {
            String partName = css.getStatsDesc().getPartName();
            if (aliasToAggregator.get(obj.getColName()) == null) {
                aliasToAggregator.put(obj.getColName(), ColumnStatsAggregatorFactory.getColumnStatsAggregator(obj.getStatsData().getSetField(), useDensityFunctionForNDVEstimation, ndvTuner));
                colStatsMap.put(aliasToAggregator.get(obj.getColName()), new ArrayList<ColStatsObjWithSourceInfo>());
            }
            colStatsMap.get(aliasToAggregator.get(obj.getColName())).add(new ColStatsObjWithSourceInfo(obj, dbName, tableName, partName));
        }
    }
    if (colStatsMap.size() < 1) {
        LOG.debug("No stats data found for: dbName= {},  tblName= {}, partNames= {}, colNames= {}", dbName, tableName, partNames, colNames);
        return new ArrayList<ColumnStatisticsObj>();
    }
    return aggrPartitionStats(colStatsMap, partNames, areAllPartsFound, useDensityFunctionForNDVEstimation, ndvTuner);
}
Also used : ColumnStatistics(org.apache.hadoop.hive.metastore.api.ColumnStatistics) ColumnStatisticsObj(org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj) ColumnStatsAggregator(org.apache.hadoop.hive.metastore.columnstats.aggr.ColumnStatsAggregator) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) MachineList(org.apache.hadoop.util.MachineList) List(java.util.List) ArrayList(java.util.ArrayList)

Aggregations

ColumnStatistics (org.apache.hadoop.hive.metastore.api.ColumnStatistics)90 ColumnStatisticsObj (org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj)75 ColumnStatisticsDesc (org.apache.hadoop.hive.metastore.api.ColumnStatisticsDesc)67 ColumnStatisticsData (org.apache.hadoop.hive.metastore.api.ColumnStatisticsData)62 ArrayList (java.util.ArrayList)61 Test (org.junit.Test)53 FieldSchema (org.apache.hadoop.hive.metastore.api.FieldSchema)40 Table (org.apache.hadoop.hive.metastore.api.Table)38 Partition (org.apache.hadoop.hive.metastore.api.Partition)33 StorageDescriptor (org.apache.hadoop.hive.metastore.api.StorageDescriptor)32 SerDeInfo (org.apache.hadoop.hive.metastore.api.SerDeInfo)31 AggrStats (org.apache.hadoop.hive.metastore.api.AggrStats)29 List (java.util.List)26 LongColumnStatsData (org.apache.hadoop.hive.metastore.api.LongColumnStatsData)19 MetaException (org.apache.hadoop.hive.metastore.api.MetaException)14 BooleanColumnStatsData (org.apache.hadoop.hive.metastore.api.BooleanColumnStatsData)13 DoubleColumnStatsData (org.apache.hadoop.hive.metastore.api.DoubleColumnStatsData)12 StringColumnStatsData (org.apache.hadoop.hive.metastore.api.StringColumnStatsData)12 HashMap (java.util.HashMap)11 Database (org.apache.hadoop.hive.metastore.api.Database)9