
Example 66 with ColumnStatisticsObj

use of org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj in project flink by apache.

the class HiveCatalog method getPartitionColumnStatistics.

@Override
public CatalogColumnStatistics getPartitionColumnStatistics(ObjectPath tablePath, CatalogPartitionSpec partitionSpec) throws PartitionNotExistException, CatalogException {
    try {
        Partition partition = getHivePartition(tablePath, partitionSpec);
        Table hiveTable = getHiveTable(tablePath);
        String partName = getEscapedPartitionName(tablePath, partitionSpec, hiveTable);
        List<String> partNames = new ArrayList<>();
        partNames.add(partName);
        Map<String, List<ColumnStatisticsObj>> partitionColumnStatistics = client.getPartitionColumnStatistics(partition.getDbName(), partition.getTableName(), partNames, getFieldNames(partition.getSd().getCols()));
        List<ColumnStatisticsObj> columnStatisticsObjs = partitionColumnStatistics.get(partName);
        if (columnStatisticsObjs != null && !columnStatisticsObjs.isEmpty()) {
            return new CatalogColumnStatistics(HiveStatsUtil.createCatalogColumnStats(columnStatisticsObjs, hiveVersion));
        } else {
            return CatalogColumnStatistics.UNKNOWN;
        }
    } catch (TableNotExistException | PartitionSpecInvalidException e) {
        throw new PartitionNotExistException(getName(), tablePath, partitionSpec);
    } catch (TException e) {
        throw new CatalogException(String.format("Failed to get column stats of table %s's partition %s", tablePath.getFullName(), String.valueOf(partitionSpec)), e);
    }
}
Also used : TException(org.apache.thrift.TException) Partition(org.apache.hadoop.hive.metastore.api.Partition) CatalogPartition(org.apache.flink.table.catalog.CatalogPartition) CatalogTable(org.apache.flink.table.catalog.CatalogTable) SqlCreateHiveTable(org.apache.flink.sql.parser.hive.ddl.SqlCreateHiveTable) Table(org.apache.hadoop.hive.metastore.api.Table) CatalogBaseTable(org.apache.flink.table.catalog.CatalogBaseTable) TableNotExistException(org.apache.flink.table.catalog.exceptions.TableNotExistException) ArrayList(java.util.ArrayList) CatalogException(org.apache.flink.table.catalog.exceptions.CatalogException) CatalogColumnStatistics(org.apache.flink.table.catalog.stats.CatalogColumnStatistics) ColumnStatisticsObj(org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj) List(java.util.List) PartitionNotExistException(org.apache.flink.table.catalog.exceptions.PartitionNotExistException) PartitionSpecInvalidException(org.apache.flink.table.catalog.exceptions.PartitionSpecInvalidException)
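For orientation, here is a minimal sketch of consuming the map such a call returns. The map is keyed by the escaped partition name, and each ColumnStatisticsObj pairs a column name and type with a ColumnStatisticsData Thrift union, which must be checked with its isSet* methods before reading a concrete branch. The class and method names below are hypothetical; the accessors are the standard Thrift-generated ones.

import java.util.List;
import java.util.Map;
import org.apache.hadoop.hive.metastore.api.ColumnStatisticsData;
import org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj;

public final class PartitionStatsPrinter {

    // Hypothetical helper: walks a partition-name -> stats map, as returned by
    // getPartitionColumnStatistics, and prints one line per column.
    static void printStats(Map<String, List<ColumnStatisticsObj>> statsByPartName) {
        for (Map.Entry<String, List<ColumnStatisticsObj>> entry : statsByPartName.entrySet()) {
            for (ColumnStatisticsObj obj : entry.getValue()) {
                ColumnStatisticsData data = obj.getStatsData();
                // ColumnStatisticsData is a union: exactly one branch is set.
                if (data.isSetLongStats()) {
                    System.out.printf("%s %s(%s): low=%d high=%d nulls=%d ndv=%d%n",
                            entry.getKey(), obj.getColName(), obj.getColType(),
                            data.getLongStats().getLowValue(),
                            data.getLongStats().getHighValue(),
                            data.getLongStats().getNumNulls(),
                            data.getLongStats().getNumDVs());
                } else if (data.isSetStringStats()) {
                    System.out.printf("%s %s(%s): avgLen=%.1f maxLen=%d nulls=%d%n",
                            entry.getKey(), obj.getColName(), obj.getColType(),
                            data.getStringStats().getAvgColLen(),
                            data.getStringStats().getMaxColLen(),
                            data.getStringStats().getNumNulls());
                }
            }
        }
    }
}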

Example 67 with ColumnStatisticsObj

use of org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj in project hive by apache.

the class TestMRCompactorOnTez method testCompactorGatherStats.

@Test
public void testCompactorGatherStats() throws Exception {
    conf.setBoolVar(HiveConf.ConfVars.HIVE_WRITE_ACID_VERSION_FILE, true);
    conf.setVar(HiveConf.ConfVars.COMPACTOR_JOB_QUEUE, CUSTOM_COMPACTION_QUEUE);
    conf.setBoolVar(HiveConf.ConfVars.HIVE_MR_COMPACTOR_GATHER_STATS, true);
    conf.setBoolVar(HiveConf.ConfVars.HIVESTATSAUTOGATHER, false);
    String tmpFolder = folder.newFolder().getAbsolutePath();
    conf.setVar(HiveConf.ConfVars.HIVE_PROTO_EVENTS_BASE_PATH, tmpFolder);
    String dbName = "default";
    String tableName = "stats_comp_test";
    List<String> colNames = Arrays.asList("a");
    executeStatementOnDriver("drop table if exists " + dbName + "." + tableName, driver);
    executeStatementOnDriver("create table " + dbName + "." + tableName + " (a INT) STORED AS ORC TBLPROPERTIES ('transactional'='true')", driver);
    executeStatementOnDriver("insert into " + dbName + "." + tableName + " values(1)", driver);
    // The compactor regenerates column statistics only for columns that already
    // have them, so compute initial statistics first with the analyze below.
    IMetaStoreClient msClient = new HiveMetaStoreClient(conf);
    executeStatementOnDriver("analyze table " + dbName + "." + tableName + " compute statistics for columns", driver);
    executeStatementOnDriver("insert into " + dbName + "." + tableName + " values(2)", driver);
    conf.setVar(HiveConf.ConfVars.PREEXECHOOKS, HiveProtoLoggingHook.class.getName());
    // Run major compaction and cleaner
    CompactorTestUtil.runCompaction(conf, dbName, tableName, CompactionType.MAJOR, false);
    conf.setVar(HiveConf.ConfVars.PREEXECHOOKS, StringUtils.EMPTY);
    CompactorTestUtil.runCleaner(conf);
    verifySuccessfulCompaction(1);
    List<ColumnStatisticsObj> colStats = msClient.getTableColumnStatistics(dbName, tableName, colNames, Constants.HIVE_ENGINE);
    assertEquals("Stats should be there", 1, colStats.size());
    assertEquals("Value should contain new data", 2, colStats.get(0).getStatsData().getLongStats().getHighValue());
    assertEquals("Value should contain new data", 1, colStats.get(0).getStatsData().getLongStats().getLowValue());
    executeStatementOnDriver("insert into " + dbName + "." + tableName + " values(3)", driver);
    executeStatementOnDriver("alter table " + dbName + "." + tableName + " set tblproperties('compactor.mapred.job.queue.name'='" + CUSTOM_COMPACTION_QUEUE + "')", driver);
    conf.setVar(HiveConf.ConfVars.PREEXECHOOKS, HiveProtoLoggingHook.class.getName());
    // Run major compaction and cleaner
    CompactorTestUtil.runCompaction(conf, dbName, tableName, CompactionType.MAJOR, false);
    conf.setVar(HiveConf.ConfVars.PREEXECHOOKS, StringUtils.EMPTY);
    CompactorTestUtil.runCleaner(conf);
    verifySuccessfulCompaction(2);
    colStats = msClient.getTableColumnStatistics(dbName, tableName, colNames, Constants.HIVE_ENGINE);
    assertEquals("Stats should be there", 1, colStats.size());
    assertEquals("Value should contain new data", 3, colStats.get(0).getStatsData().getLongStats().getHighValue());
    assertEquals("Value should contain new data", 1, colStats.get(0).getStatsData().getLongStats().getLowValue());
    executeStatementOnDriver("insert into " + dbName + "." + tableName + " values(4)", driver);
    conf.setVar(HiveConf.ConfVars.PREEXECHOOKS, HiveProtoLoggingHook.class.getName());
    CompactorTestUtil.runCompaction(conf, dbName, tableName, CompactionType.MAJOR, false, Collections.singletonMap("compactor.mapred.job.queue.name", CUSTOM_COMPACTION_QUEUE));
    conf.setVar(HiveConf.ConfVars.PREEXECHOOKS, StringUtils.EMPTY);
    CompactorTestUtil.runCleaner(conf);
    verifySuccessfulCompaction(3);
    colStats = msClient.getTableColumnStatistics(dbName, tableName, colNames, Constants.HIVE_ENGINE);
    assertEquals("Stats should be there", 1, colStats.size());
    assertEquals("Value should contain new data", 4, colStats.get(0).getStatsData().getLongStats().getHighValue());
    assertEquals("Value should contain new data", 1, colStats.get(0).getStatsData().getLongStats().getLowValue());
}
Also used : ColumnStatisticsObj(org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj) HiveMetaStoreClient(org.apache.hadoop.hive.metastore.HiveMetaStoreClient) IMetaStoreClient(org.apache.hadoop.hive.metastore.IMetaStoreClient) HiveProtoLoggingHook(org.apache.hadoop.hive.ql.hooks.HiveProtoLoggingHook) Test(org.junit.Test)
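The assertions above read high and low values through getStatsData().getLongStats(). As a self-contained sketch of the shape of object the metastore hands back (values here are illustrative, not what the compactor actually writes):

import org.apache.hadoop.hive.metastore.api.ColumnStatisticsData;
import org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj;
import org.apache.hadoop.hive.metastore.api.LongColumnStatsData;

public final class LongStatsShape {
    public static void main(String[] args) {
        // numNulls and numDVs are the required Thrift fields; low/high are optional.
        LongColumnStatsData longStats = new LongColumnStatsData();
        longStats.setNumNulls(0);
        longStats.setNumDVs(2);
        longStats.setLowValue(1);
        longStats.setHighValue(2);
        // ColumnStatisticsData is a union; setting a branch selects it.
        ColumnStatisticsData data = new ColumnStatisticsData();
        data.setLongStats(longStats);
        ColumnStatisticsObj obj = new ColumnStatisticsObj("a", "int", data);
        // Same accessors the test assertions use:
        System.out.println(obj.getStatsData().getLongStats().getHighValue()); // 2
        System.out.println(obj.getStatsData().getLongStats().getLowValue());  // 1
    }
}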

Example 68 with ColumnStatisticsObj

use of org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj in project hive by apache.

the class DescTableOperation method getColumnDataColPathSpecified.

private void getColumnDataColPathSpecified(Table table, Partition part, List<FieldSchema> cols, List<ColumnStatisticsObj> colStats, Deserializer deserializer) throws SemanticException, HiveException, MetaException {
    // When a column name is specified in DESCRIBE TABLE DDL, colPath has the form db_name.table_name.column_name.
    String colName = desc.getColumnPath().split("\\.")[2];
    List<String> colNames = Lists.newArrayList(colName.toLowerCase());
    TableName tableName = HiveTableName.of(desc.getDbTableName());
    if (null == part) {
        if (table.isPartitioned()) {
            Map<String, String> tableProps = table.getParameters() == null ? new HashMap<String, String>() : table.getParameters();
            if (table.isPartitionKey(colNames.get(0))) {
                getColumnDataForPartitionKeyColumn(table, cols, colStats, colNames, tableProps);
            } else {
                getColumnsForNotPartitionKeyColumn(cols, colStats, deserializer, colNames, tableName, tableProps);
            }
            table.setParameters(tableProps);
        } else {
            cols.addAll(Hive.getFieldsFromDeserializer(desc.getColumnPath(), deserializer));
            colStats.addAll(context.getDb().getTableColumnStatistics(tableName.getDb().toLowerCase(), tableName.getTable().toLowerCase(), colNames, false));
        }
    } else {
        List<String> partitions = new ArrayList<String>();
        // The partition name is converted to lowercase before generating the stats. So we should use the same
        // lower case name to get the stats.
        String partName = HMSHandler.lowerCaseConvertPartName(part.getName());
        partitions.add(partName);
        cols.addAll(Hive.getFieldsFromDeserializer(desc.getColumnPath(), deserializer));
        Map<String, List<ColumnStatisticsObj>> partitionColumnStatistics = context.getDb().getPartitionColumnStatistics(tableName.getDb().toLowerCase(), tableName.getTable().toLowerCase(), partitions, colNames, false);
        List<ColumnStatisticsObj> partitionColStat = partitionColumnStatistics.get(partName);
        if (partitionColStat != null) {
            colStats.addAll(partitionColStat);
        }
    }
}
Also used : TableName(org.apache.hadoop.hive.common.TableName) HiveTableName(org.apache.hadoop.hive.ql.parse.HiveTableName) ColumnStatisticsObj(org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj) ArrayList(java.util.ArrayList) List(java.util.List)
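Two conventions in this method are easy to trip over: the colPath format and the lowercased partition name. A tiny illustration (the values are made up):

import java.util.Locale;

public final class DescConventions {
    public static void main(String[] args) {
        // colPath arrives as db_name.table_name.column_name when a column is named.
        String colPath = "default.stats_comp_test.a";
        System.out.println(colPath.split("\\.")[2]); // a
        // Partition names are lowercased before stats are stored, so the lookup
        // key must be lowercased too (the role lowerCaseConvertPartName plays).
        System.out.println("DS=2021-01-01".toLowerCase(Locale.ROOT)); // ds=2021-01-01
    }
}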

Example 69 with ColumnStatisticsObj

use of org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj in project hive by apache.

the class DescTableOperation method execute.

@Override
public int execute() throws Exception {
    Table table = getTable();
    Partition part = getPartition(table);
    final String dbTableName = desc.getDbTableName();
    try (DataOutputStream outStream = ShowUtils.getOutputStream(new Path(desc.getResFile()), context)) {
        LOG.debug("DDLTask: got data for {}", dbTableName);
        List<FieldSchema> cols = new ArrayList<>();
        List<ColumnStatisticsObj> colStats = new ArrayList<>();
        Deserializer deserializer = getDeserializer(table);
        if (desc.getColumnPath() == null) {
            getColumnsNoColumnPath(table, part, cols);
        } else {
            if (desc.isFormatted()) {
                getColumnDataColPathSpecified(table, part, cols, colStats, deserializer);
            } else {
                cols.addAll(Hive.getFieldsFromDeserializer(desc.getColumnPath(), deserializer));
            }
        }
        fixDecimalColumnTypeName(cols);
        setConstraintsAndStorageHandlerInfo(table);
        handleMaterializedView(table);
        // In case the query is served by HiveServer2, don't pad it with spaces,
        // as HiveServer2 output is consumed by JDBC/ODBC clients.
        boolean isOutputPadded = !SessionState.get().isHiveServerQuery();
        DescTableFormatter formatter = DescTableFormatter.getFormatter(context.getConf());
        formatter.describeTable(context.getConf(), outStream, desc.getColumnPath(), dbTableName, table, part, cols, desc.isFormatted(), desc.isExtended(), isOutputPadded, colStats);
        LOG.debug("DDLTask: written data for {}", dbTableName);
    } catch (SQLException e) {
        throw new HiveException(e, ErrorMsg.GENERIC_ERROR, dbTableName);
    }
    return 0;
}
Also used : Path(org.apache.hadoop.fs.Path) Partition(org.apache.hadoop.hive.ql.metadata.Partition) Table(org.apache.hadoop.hive.ql.metadata.Table) HiveException(org.apache.hadoop.hive.ql.metadata.HiveException) SQLException(java.sql.SQLException) DataOutputStream(java.io.DataOutputStream) FieldSchema(org.apache.hadoop.hive.metastore.api.FieldSchema) ArrayList(java.util.ArrayList) ColumnStatisticsObj(org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj) Deserializer(org.apache.hadoop.hive.serde2.Deserializer) DescTableFormatter(org.apache.hadoop.hive.ql.ddl.table.info.desc.formatter.DescTableFormatter)
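For context, these are the kinds of statements that reach the three branches of execute() above; the table and column names are made up, and the statements are printed rather than executed so the sketch stays self-contained:

public final class DescribeBranches {
    public static void main(String[] args) {
        // columnPath == null: schema only, no column statistics.
        System.out.println("DESCRIBE stats_comp_test;");
        // Formatted with a column path: getColumnDataColPathSpecified runs and
        // column statistics are fetched.
        System.out.println("DESCRIBE FORMATTED stats_comp_test a;");
        // Column path without FORMATTED: fields come from the deserializer only.
        System.out.println("DESCRIBE stats_comp_test a;");
    }
}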

Example 70 with ColumnStatisticsObj

use of org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj in project hive by apache.

the class DDLPlanUtils method getAlterTableStmtTableStatsColsAll.

/**
 * Parses the ColumnStatistics for all the columns in a given table and adds the alter table update
 * statistics command for each column.
 *
 * @param tbl the table whose column statistics are rendered into statements
 */
public List<String> getAlterTableStmtTableStatsColsAll(Table tbl) throws HiveException {
    List<String> alterTblStmt = new ArrayList<String>();
    List<String> accessedColumns = getTableColumnNames(tbl);
    List<ColumnStatisticsObj> tableColumnStatistics = Hive.get().getTableColumnStatistics(tbl.getDbName(), tbl.getTableName(), accessedColumns, true);
    ColumnStatisticsObj[] columnStatisticsObj = tableColumnStatistics.toArray(new ColumnStatisticsObj[0]);
    for (int i = 0; i < columnStatisticsObj.length; i++) {
        alterTblStmt.add(getAlterTableStmtCol(columnStatisticsObj[i].getStatsData(), columnStatisticsObj[i].getColName(), tbl.getTableName(), tbl.getDbName()));
        String base64 = checkBitVectors(columnStatisticsObj[i].getStatsData());
        if (base64 != null) {
            ST command = new ST(EXIST_BIT_VECTORS);
            command.add(DATABASE_NAME, tbl.getDbName());
            command.add(TABLE_NAME, tbl.getTableName());
            command.add(COLUMN_NAME, columnStatisticsObj[i].getColName());
            command.add(BASE_64_VALUE, base64);
            alterTblStmt.add(command.render());
        }
    }
    return alterTblStmt;
}
Also used : ColumnStatisticsObj(org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj) ST(org.stringtemplate.v4.ST) ArrayList(java.util.ArrayList) CheckConstraint(org.apache.hadoop.hive.ql.metadata.CheckConstraint) NotNullConstraint(org.apache.hadoop.hive.ql.metadata.NotNullConstraint) DefaultConstraint(org.apache.hadoop.hive.ql.metadata.DefaultConstraint) UniqueConstraint(org.apache.hadoop.hive.ql.metadata.UniqueConstraint)
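The exact templates live in DDLPlanUtils' string constants; as a rough sketch of the kind of statement getAlterTableStmtCol produces for a long column, based on Hive's documented ALTER TABLE ... UPDATE STATISTICS syntax (the helper name and the chosen property set are assumptions):

import org.apache.hadoop.hive.metastore.api.LongColumnStatsData;

public final class AlterStatsSketch {
    // Hypothetical renderer covering long-column stats only.
    static String longStatsStmt(String db, String tbl, String col, LongColumnStatsData ls) {
        return String.format(
                "ALTER TABLE %s.%s UPDATE STATISTICS FOR COLUMN %s SET "
                        + "('lowValue'='%d','highValue'='%d','numNulls'='%d','numDVs'='%d');",
                db, tbl, col, ls.getLowValue(), ls.getHighValue(), ls.getNumNulls(), ls.getNumDVs());
    }

    public static void main(String[] args) {
        LongColumnStatsData ls = new LongColumnStatsData();
        ls.setLowValue(1);
        ls.setHighValue(4);
        ls.setNumNulls(0);
        ls.setNumDVs(4);
        System.out.println(longStatsStmt("default", "stats_comp_test", "a", ls));
    }
}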

Aggregations

ColumnStatisticsObj (org.apache.hadoop.hive.metastore.api.ColumnStatisticsObj) 219
ColumnStatisticsData (org.apache.hadoop.hive.metastore.api.ColumnStatisticsData) 104
ArrayList (java.util.ArrayList) 98
ColumnStatistics (org.apache.hadoop.hive.metastore.api.ColumnStatistics) 82
Test (org.junit.Test) 79
ColumnStatisticsDesc (org.apache.hadoop.hive.metastore.api.ColumnStatisticsDesc) 68
FieldSchema (org.apache.hadoop.hive.metastore.api.FieldSchema) 43
Table (org.apache.hadoop.hive.metastore.api.Table) 43
LongColumnStatsData (org.apache.hadoop.hive.metastore.api.LongColumnStatsData) 35
Partition (org.apache.hadoop.hive.metastore.api.Partition) 35
List (java.util.List) 34
BooleanColumnStatsData (org.apache.hadoop.hive.metastore.api.BooleanColumnStatsData) 30
AggrStats (org.apache.hadoop.hive.metastore.api.AggrStats) 29
StorageDescriptor (org.apache.hadoop.hive.metastore.api.StorageDescriptor) 29
HashMap (java.util.HashMap) 28
SerDeInfo (org.apache.hadoop.hive.metastore.api.SerDeInfo) 28
DoubleColumnStatsData (org.apache.hadoop.hive.metastore.api.DoubleColumnStatsData) 27
StringColumnStatsData (org.apache.hadoop.hive.metastore.api.StringColumnStatsData) 25
MetaException (org.apache.hadoop.hive.metastore.api.MetaException) 23
BinaryColumnStatsData (org.apache.hadoop.hive.metastore.api.BinaryColumnStatsData) 22