Search in sources :

Example 71 with ColumnStatisticsObj

use of org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj in project hive by apache.

the class ColumnStatsUpdateTask method constructColumnStatsFromInput.

private ColumnStatistics constructColumnStatsFromInput() throws SemanticException, MetaException {
    // If we are replicating the stats, we don't need to construct those again.
    if (work.getColStats() != null) {
        ColumnStatistics colStats = work.getColStats();
        LOG.debug("Got stats through replication for " + colStats.getStatsDesc().getDbName() + "." + colStats.getStatsDesc().getTableName());
        return colStats;
    }
    String dbName = work.dbName();
    String tableName = work.getTableName();
    String partName = work.getPartName();
    String colName = work.getColName();
    String columnType = work.getColType();
    ColumnStatisticsObj statsObj = new ColumnStatisticsObj();
    // grammar prohibits more than 1 column so we are guaranteed to have only 1
    // element in this lists.
    statsObj.setColName(colName);
    statsObj.setColType(columnType);
    ColumnStatisticsData statsData = new ColumnStatisticsData();
    if (columnType.equalsIgnoreCase("long") || columnType.equalsIgnoreCase("tinyint") || columnType.equalsIgnoreCase("smallint") || columnType.equalsIgnoreCase("int") || columnType.equalsIgnoreCase("bigint")) {
        LongColumnStatsDataInspector longStats = new LongColumnStatsDataInspector();
        longStats.setNumNullsIsSet(false);
        longStats.setNumDVsIsSet(false);
        longStats.setLowValueIsSet(false);
        longStats.setHighValueIsSet(false);
        Map<String, String> mapProp = work.getMapProp();
        for (Entry<String, String> entry : mapProp.entrySet()) {
            String fName = entry.getKey();
            String value = entry.getValue();
            if (fName.equals("numNulls")) {
                longStats.setNumNulls(Long.parseLong(value));
            } else if (fName.equals("numDVs")) {
                longStats.setNumDVs(Long.parseLong(value));
            } else if (fName.equals("lowValue")) {
                longStats.setLowValue(Long.parseLong(value));
            } else if (fName.equals("highValue")) {
                longStats.setHighValue(Long.parseLong(value));
            } else {
                throw new SemanticException("Unknown stat");
            }
        }
        statsData.setLongStats(longStats);
        statsObj.setStatsData(statsData);
    } else if (columnType.equalsIgnoreCase("double") || columnType.equalsIgnoreCase("float")) {
        DoubleColumnStatsDataInspector doubleStats = new DoubleColumnStatsDataInspector();
        doubleStats.setNumNullsIsSet(false);
        doubleStats.setNumDVsIsSet(false);
        doubleStats.setLowValueIsSet(false);
        doubleStats.setHighValueIsSet(false);
        Map<String, String> mapProp = work.getMapProp();
        for (Entry<String, String> entry : mapProp.entrySet()) {
            String fName = entry.getKey();
            String value = entry.getValue();
            if (fName.equals("numNulls")) {
                doubleStats.setNumNulls(Long.parseLong(value));
            } else if (fName.equals("numDVs")) {
                doubleStats.setNumDVs(Long.parseLong(value));
            } else if (fName.equals("lowValue")) {
                doubleStats.setLowValue(Double.parseDouble(value));
            } else if (fName.equals("highValue")) {
                doubleStats.setHighValue(Double.parseDouble(value));
            } else {
                throw new SemanticException("Unknown stat");
            }
        }
        statsData.setDoubleStats(doubleStats);
        statsObj.setStatsData(statsData);
    } else if (columnType.equalsIgnoreCase("string") || columnType.toLowerCase().startsWith("char") || columnType.toLowerCase().startsWith("varchar")) {
        // char(x),varchar(x) types
        StringColumnStatsDataInspector stringStats = new StringColumnStatsDataInspector();
        stringStats.setMaxColLenIsSet(false);
        stringStats.setAvgColLenIsSet(false);
        stringStats.setNumNullsIsSet(false);
        stringStats.setNumDVsIsSet(false);
        Map<String, String> mapProp = work.getMapProp();
        for (Entry<String, String> entry : mapProp.entrySet()) {
            String fName = entry.getKey();
            String value = entry.getValue();
            if (fName.equals("numNulls")) {
                stringStats.setNumNulls(Long.parseLong(value));
            } else if (fName.equals("numDVs")) {
                stringStats.setNumDVs(Long.parseLong(value));
            } else if (fName.equals("avgColLen")) {
                stringStats.setAvgColLen(Double.parseDouble(value));
            } else if (fName.equals("maxColLen")) {
                stringStats.setMaxColLen(Long.parseLong(value));
            } else {
                throw new SemanticException("Unknown stat");
            }
        }
        statsData.setStringStats(stringStats);
        statsObj.setStatsData(statsData);
    } else if (columnType.equalsIgnoreCase("boolean")) {
        BooleanColumnStatsData booleanStats = new BooleanColumnStatsData();
        booleanStats.setNumNullsIsSet(false);
        booleanStats.setNumTruesIsSet(false);
        booleanStats.setNumFalsesIsSet(false);
        Map<String, String> mapProp = work.getMapProp();
        for (Entry<String, String> entry : mapProp.entrySet()) {
            String fName = entry.getKey();
            String value = entry.getValue();
            if (fName.equals("numNulls")) {
                booleanStats.setNumNulls(Long.parseLong(value));
            } else if (fName.equals("numTrues")) {
                booleanStats.setNumTrues(Long.parseLong(value));
            } else if (fName.equals("numFalses")) {
                booleanStats.setNumFalses(Long.parseLong(value));
            } else {
                throw new SemanticException("Unknown stat");
            }
        }
        statsData.setBooleanStats(booleanStats);
        statsObj.setStatsData(statsData);
    } else if (columnType.equalsIgnoreCase("binary")) {
        BinaryColumnStatsData binaryStats = new BinaryColumnStatsData();
        binaryStats.setNumNullsIsSet(false);
        binaryStats.setAvgColLenIsSet(false);
        binaryStats.setMaxColLenIsSet(false);
        Map<String, String> mapProp = work.getMapProp();
        for (Entry<String, String> entry : mapProp.entrySet()) {
            String fName = entry.getKey();
            String value = entry.getValue();
            if (fName.equals("numNulls")) {
                binaryStats.setNumNulls(Long.parseLong(value));
            } else if (fName.equals("avgColLen")) {
                binaryStats.setAvgColLen(Double.parseDouble(value));
            } else if (fName.equals("maxColLen")) {
                binaryStats.setMaxColLen(Long.parseLong(value));
            } else {
                throw new SemanticException("Unknown stat");
            }
        }
        statsData.setBinaryStats(binaryStats);
        statsObj.setStatsData(statsData);
    } else if (columnType.toLowerCase().startsWith("decimal")) {
        // decimal(a,b) type
        DecimalColumnStatsDataInspector decimalStats = new DecimalColumnStatsDataInspector();
        decimalStats.setNumNullsIsSet(false);
        decimalStats.setNumDVsIsSet(false);
        decimalStats.setLowValueIsSet(false);
        decimalStats.setHighValueIsSet(false);
        Map<String, String> mapProp = work.getMapProp();
        for (Entry<String, String> entry : mapProp.entrySet()) {
            String fName = entry.getKey();
            String value = entry.getValue();
            if (fName.equals("numNulls")) {
                decimalStats.setNumNulls(Long.parseLong(value));
            } else if (fName.equals("numDVs")) {
                decimalStats.setNumDVs(Long.parseLong(value));
            } else if (fName.equals("lowValue")) {
                BigDecimal d = new BigDecimal(value);
                decimalStats.setLowValue(DecimalUtils.getDecimal(ByteBuffer.wrap(d.unscaledValue().toByteArray()), (short) d.scale()));
            } else if (fName.equals("highValue")) {
                BigDecimal d = new BigDecimal(value);
                decimalStats.setHighValue(DecimalUtils.getDecimal(ByteBuffer.wrap(d.unscaledValue().toByteArray()), (short) d.scale()));
            } else {
                throw new SemanticException("Unknown stat");
            }
        }
        statsData.setDecimalStats(decimalStats);
        statsObj.setStatsData(statsData);
    } else if (columnType.equalsIgnoreCase("date")) {
        DateColumnStatsDataInspector dateStats = new DateColumnStatsDataInspector();
        Map<String, String> mapProp = work.getMapProp();
        for (Entry<String, String> entry : mapProp.entrySet()) {
            String fName = entry.getKey();
            String value = entry.getValue();
            if (fName.equals("numNulls")) {
                dateStats.setNumNulls(Long.parseLong(value));
            } else if (fName.equals("numDVs")) {
                dateStats.setNumDVs(Long.parseLong(value));
            } else if (fName.equals("lowValue")) {
                // Date high/low value is stored as long in stats DB, but allow users to set high/low
                // value using either date format (yyyy-mm-dd) or numeric format (days since epoch)
                dateStats.setLowValue(readDateValue(value));
            } else if (fName.equals("highValue")) {
                dateStats.setHighValue(readDateValue(value));
            } else {
                throw new SemanticException("Unknown stat");
            }
        }
        statsData.setDateStats(dateStats);
        statsObj.setStatsData(statsData);
    } else if (columnType.equalsIgnoreCase("timestamp")) {
        TimestampColumnStatsDataInspector timestampStats = new TimestampColumnStatsDataInspector();
        Map<String, String> mapProp = work.getMapProp();
        for (Entry<String, String> entry : mapProp.entrySet()) {
            String fName = entry.getKey();
            String value = entry.getValue();
            if (fName.equals("numNulls")) {
                timestampStats.setNumNulls(Long.parseLong(value));
            } else if (fName.equals("numDVs")) {
                timestampStats.setNumDVs(Long.parseLong(value));
            } else if (fName.equals("lowValue")) {
                timestampStats.setLowValue(readTimestampValue(value));
            } else if (fName.equals("highValue")) {
                timestampStats.setHighValue(readTimestampValue(value));
            } else {
                throw new SemanticException("Unknown stat");
            }
        }
        statsData.setTimestampStats(timestampStats);
        statsObj.setStatsData(statsData);
    } else {
        throw new SemanticException("Unsupported type");
    }
    ColumnStatisticsDesc statsDesc = getColumnStatsDesc(dbName, tableName, partName, partName == null);
    ColumnStatistics colStat = new ColumnStatistics();
    colStat.setStatsDesc(statsDesc);
    colStat.addToStatsObj(statsObj);
    colStat.setEngine(Constants.HIVE_ENGINE);
    return colStat;
}
Also used : ColumnStatistics(org.apache.hadoop.hive.metastore.api.ColumnStatistics) BooleanColumnStatsData(org.apache.hadoop.hive.metastore.api.BooleanColumnStatsData) DateColumnStatsDataInspector(org.apache.hadoop.hive.metastore.columnstats.cache.DateColumnStatsDataInspector) BinaryColumnStatsData(org.apache.hadoop.hive.metastore.api.BinaryColumnStatsData) BigDecimal(java.math.BigDecimal) ColumnStatisticsObj(org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj) Entry(java.util.Map.Entry) DecimalColumnStatsDataInspector(org.apache.hadoop.hive.metastore.columnstats.cache.DecimalColumnStatsDataInspector) DoubleColumnStatsDataInspector(org.apache.hadoop.hive.metastore.columnstats.cache.DoubleColumnStatsDataInspector) ColumnStatisticsDesc(org.apache.hadoop.hive.metastore.api.ColumnStatisticsDesc) LongColumnStatsDataInspector(org.apache.hadoop.hive.metastore.columnstats.cache.LongColumnStatsDataInspector) TimestampColumnStatsDataInspector(org.apache.hadoop.hive.metastore.columnstats.cache.TimestampColumnStatsDataInspector) Map(java.util.Map) StringColumnStatsDataInspector(org.apache.hadoop.hive.metastore.columnstats.cache.StringColumnStatsDataInspector) ColumnStatisticsData(org.apache.hadoop.hive.metastore.api.ColumnStatisticsData) SemanticException(org.apache.hadoop.hive.ql.parse.SemanticException)

Example 72 with ColumnStatisticsObj

use of org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj in project hive by apache.

the class StatsUpdaterThread method getExistingStatsToUpdate.

private List<String> getExistingStatsToUpdate(ColumnStatistics existingStats, Map<String, String> params, boolean isTxnValid) {
    boolean hasAnyAccurate = isTxnValid && StatsSetupConst.areBasicStatsUptoDate(params);
    List<String> colsToUpdate = new ArrayList<>();
    for (ColumnStatisticsObj obj : existingStats.getStatsObj()) {
        String col = obj.getColName();
        if (!hasAnyAccurate || !StatsSetupConst.areColumnStatsUptoDate(params, col)) {
            colsToUpdate.add(col);
        }
    }
    return colsToUpdate;
}
Also used : ColumnStatisticsObj(org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj) ArrayList(java.util.ArrayList)

Example 73 with ColumnStatisticsObj

use of org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj in project hive by apache.

the class ColStatsProcessor method constructColumnStatsFromPackedRows.

private boolean constructColumnStatsFromPackedRows(Table tbl, List<ColumnStatistics> stats, long maxNumStats) throws HiveException, MetaException, IOException {
    String partName = null;
    List<String> colName = colStatDesc.getColName();
    List<String> colType = colStatDesc.getColType();
    boolean isTblLevel = colStatDesc.isTblLevel();
    InspectableObject packedRow;
    long numStats = 0;
    while ((packedRow = ftOp.getNextRow()) != null) {
        if (packedRow.oi.getCategory() != ObjectInspector.Category.STRUCT) {
            throw new HiveException("Unexpected object type encountered while unpacking row");
        }
        final List<ColumnStatisticsObj> statsObjs = new ArrayList<>();
        final StructObjectInspector soi = (StructObjectInspector) packedRow.oi;
        final List<? extends StructField> fields = soi.getAllStructFieldRefs();
        final List<Object> values = soi.getStructFieldsDataAsList(packedRow.o);
        // Partition columns are appended at end, we only care about stats column
        int pos = 0;
        for (int i = 0; i < colName.size(); i++) {
            String columnName = colName.get(i);
            String columnType = colType.get(i);
            PrimitiveTypeInfo typeInfo = (PrimitiveTypeInfo) TypeInfoUtils.getTypeInfoFromTypeString(columnType);
            List<ColumnStatsField> columnStatsFields = ColumnStatsType.getColumnStats(typeInfo);
            try {
                ColumnStatisticsObj statObj = ColumnStatisticsObjTranslator.readHiveColumnStatistics(columnName, columnType, columnStatsFields, pos, fields, values);
                statsObjs.add(statObj);
                numStats++;
            } catch (Exception e) {
                if (isStatsReliable) {
                    throw new HiveException("Statistics collection failed while (hive.stats.reliable)", e);
                } else {
                    LOG.debug("Because {} is infinite or NaN, we skip stats.", columnName, e);
                }
            }
            pos += columnStatsFields.size();
        }
        if (!statsObjs.isEmpty()) {
            if (!isTblLevel) {
                List<FieldSchema> partColSchema = tbl.getPartCols();
                List<String> partVals = new ArrayList<>();
                // Iterate over partition columns to figure out partition name
                for (int i = pos; i < pos + partColSchema.size(); i++) {
                    Object partVal = ((PrimitiveObjectInspector) fields.get(i).getFieldObjectInspector()).getPrimitiveJavaObject(values.get(i));
                    partVals.add(// could be null for default partition
                    partVal == null ? this.conf.getVar(ConfVars.DEFAULTPARTITIONNAME) : partVal.toString());
                }
                partName = Warehouse.makePartName(partColSchema, partVals);
            }
            ColumnStatisticsDesc statsDesc = buildColumnStatsDesc(tbl, partName, isTblLevel);
            ColumnStatistics colStats = new ColumnStatistics();
            colStats.setStatsDesc(statsDesc);
            colStats.setStatsObj(statsObjs);
            colStats.setEngine(Constants.HIVE_ENGINE);
            stats.add(colStats);
            if (numStats >= maxNumStats) {
                return false;
            }
        }
    }
    ftOp.clearFetchContext();
    return true;
}
Also used : ColumnStatistics(org.apache.hadoop.hive.metastore.api.ColumnStatistics) HiveException(org.apache.hadoop.hive.ql.metadata.HiveException) FieldSchema(org.apache.hadoop.hive.metastore.api.FieldSchema) ArrayList(java.util.ArrayList) PrimitiveTypeInfo(org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo) MetaException(org.apache.hadoop.hive.metastore.api.MetaException) SemanticException(org.apache.hadoop.hive.ql.parse.SemanticException) IOException(java.io.IOException) HiveException(org.apache.hadoop.hive.ql.metadata.HiveException) InspectableObject(org.apache.hadoop.hive.serde2.objectinspector.InspectableObject) ColumnStatisticsObj(org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj) ColumnStatisticsDesc(org.apache.hadoop.hive.metastore.api.ColumnStatisticsDesc) InspectableObject(org.apache.hadoop.hive.serde2.objectinspector.InspectableObject) PrimitiveObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.PrimitiveObjectInspector) StructObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.StructObjectInspector)

Example 74 with ColumnStatisticsObj

use of org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj in project hive by apache.

the class TestTxnCommands method testParallelInsertStats.

@Test
public void testParallelInsertStats() throws Exception {
    final int TASK_COUNT = 4;
    String tableName = "mm_table";
    List<ColumnStatisticsObj> stats;
    IMetaStoreClient msClient = prepareParallelTest(tableName, 0);
    String[] queries = new String[TASK_COUNT];
    for (int i = 0; i < queries.length; ++i) {
        queries[i] = String.format("insert into %s (a) values (" + i + ")", tableName);
    }
    runParallelQueries(queries);
    // Verify stats are either invalid, or valid and correct.
    stats = getTxnTableStats(msClient, tableName);
    boolean hasStats = 0 != stats.size();
    if (hasStats) {
        verifyLongStats(TASK_COUNT, 0, TASK_COUNT - 1, stats);
    }
    runStatementOnDriver(String.format("insert into %s (a) values (" + TASK_COUNT + ")", tableName));
    if (!hasStats) {
        // Stats should still be invalid if they were invalid.
        stats = getTxnTableStats(msClient, tableName);
        Assert.assertEquals(0, stats.size());
    }
    // Stats should be valid after analyze.
    runStatementOnDriver(String.format("analyze table %s compute statistics for columns", tableName));
    verifyLongStats(TASK_COUNT + 1, 0, TASK_COUNT, getTxnTableStats(msClient, tableName));
}
Also used : ColumnStatisticsObj(org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj) IMetaStoreClient(org.apache.hadoop.hive.metastore.IMetaStoreClient) Test(org.junit.Test)

Example 75 with ColumnStatisticsObj

use of org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj in project hive by apache.

the class TestTxnCommands method testTxnStatsOnOff.

@Test
public void testTxnStatsOnOff() throws Exception {
    String tableName = "mm_table";
    hiveConf.setBoolean("hive.stats.autogather", true);
    hiveConf.setBoolean("hive.stats.column.autogather", true);
    // Need to close the thread local Hive object so that configuration change is reflected to HMS.
    Hive.closeCurrent();
    runStatementOnDriver("drop table if exists " + tableName);
    runStatementOnDriver(String.format("create table %s (a int) stored as orc " + "TBLPROPERTIES ('transactional'='true', 'transactional_properties'='insert_only')", tableName));
    runStatementOnDriver(String.format("insert into %s (a) values (1)", tableName));
    IMetaStoreClient msClient = new HiveMetaStoreClient(hiveConf);
    List<ColumnStatisticsObj> stats = getTxnTableStats(msClient, tableName);
    Assert.assertEquals(1, stats.size());
    runStatementOnDriver(String.format("insert into %s (a) values (1)", tableName));
    stats = getTxnTableStats(msClient, tableName);
    Assert.assertEquals(1, stats.size());
    msClient.close();
    hiveConf.setBoolean(MetastoreConf.ConfVars.HIVE_TXN_STATS_ENABLED.getVarname(), false);
    msClient = new HiveMetaStoreClient(hiveConf);
    // Even though the stats are valid in metastore, txn stats are disabled.
    stats = getTxnTableStats(msClient, tableName);
    Assert.assertEquals(0, stats.size());
    msClient.close();
    hiveConf.setBoolean(MetastoreConf.ConfVars.HIVE_TXN_STATS_ENABLED.getVarname(), true);
    msClient = new HiveMetaStoreClient(hiveConf);
    stats = getTxnTableStats(msClient, tableName);
    // Now the stats are visible again.
    Assert.assertEquals(1, stats.size());
    msClient.close();
    hiveConf.setBoolean(MetastoreConf.ConfVars.HIVE_TXN_STATS_ENABLED.getVarname(), false);
    // Need to close the thread local Hive object so that configuration change is reflected to HMS.
    Hive.closeCurrent();
    // Running the query with stats disabled will cause stats in metastore itself to become invalid.
    runStatementOnDriver(String.format("insert into %s (a) values (1)", tableName));
    hiveConf.setBoolean(MetastoreConf.ConfVars.HIVE_TXN_STATS_ENABLED.getVarname(), true);
    msClient = new HiveMetaStoreClient(hiveConf);
    stats = getTxnTableStats(msClient, tableName);
    Assert.assertEquals(0, stats.size());
    msClient.close();
}
Also used : ColumnStatisticsObj(org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj) HiveMetaStoreClient(org.apache.hadoop.hive.metastore.HiveMetaStoreClient) IMetaStoreClient(org.apache.hadoop.hive.metastore.IMetaStoreClient) Test(org.junit.Test)

Aggregations

ColumnStatisticsObj (org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj)219 ColumnStatisticsData (org.apache.hadoop.hive.metastore.api.ColumnStatisticsData)104 ArrayList (java.util.ArrayList)98 ColumnStatistics (org.apache.hadoop.hive.metastore.api.ColumnStatistics)82 Test (org.junit.Test)79 ColumnStatisticsDesc (org.apache.hadoop.hive.metastore.api.ColumnStatisticsDesc)68 FieldSchema (org.apache.hadoop.hive.metastore.api.FieldSchema)43 Table (org.apache.hadoop.hive.metastore.api.Table)43 LongColumnStatsData (org.apache.hadoop.hive.metastore.api.LongColumnStatsData)35 Partition (org.apache.hadoop.hive.metastore.api.Partition)35 List (java.util.List)34 BooleanColumnStatsData (org.apache.hadoop.hive.metastore.api.BooleanColumnStatsData)30 AggrStats (org.apache.hadoop.hive.metastore.api.AggrStats)29 StorageDescriptor (org.apache.hadoop.hive.metastore.api.StorageDescriptor)29 HashMap (java.util.HashMap)28 SerDeInfo (org.apache.hadoop.hive.metastore.api.SerDeInfo)28 DoubleColumnStatsData (org.apache.hadoop.hive.metastore.api.DoubleColumnStatsData)27 StringColumnStatsData (org.apache.hadoop.hive.metastore.api.StringColumnStatsData)25 MetaException (org.apache.hadoop.hive.metastore.api.MetaException)23 BinaryColumnStatsData (org.apache.hadoop.hive.metastore.api.BinaryColumnStatsData)22