Search in sources :

Example 1 with EnvironmentContext

use of org.apache.hadoop.hive.metastore.api.EnvironmentContext in project hive by apache.

the class DDLTask method alterTableOrSinglePartition.

/**
 * Applies one ALTER TABLE operation, selected by {@code alterTbl.getOp()}, to the in-memory
 * {@code tbl} (or to {@code part} when a partition is supplied).
 *
 * <p>Before dispatching, an {@link EnvironmentContext} is ensured on {@code alterTbl} and
 * {@code DO_NOT_UPDATE_STATS} is defaulted to TRUE. The ALTERLOCATION and ALTERSKEWEDLOCATION
 * branches always remove that flag again; ADDPROPS and DROPPROPS remove it only when the
 * context marks the stats as user generated.
 *
 * @param alterTbl descriptor of the requested alteration
 * @param tbl table being altered; mutated in place
 * @param part partition being altered, or {@code null} when the whole table is targeted
 * @return 0 on every path that does not throw
 * @throws HiveException if validation of the requested change fails or the operation type is
 *         not handled here
 */
private int alterTableOrSinglePartition(AlterTableDesc alterTbl, Table tbl, Partition part) throws HiveException {
    // Make sure the descriptor carries an environment context we can add properties to.
    EnvironmentContext environmentContext = alterTbl.getEnvironmentContext();
    if (environmentContext == null) {
        environmentContext = new EnvironmentContext();
        alterTbl.setEnvironmentContext(environmentContext);
    }
    // Alter table/partition operations do not need a stats update by default; only set the flag
    // when the caller has not already provided a value for it.
    if (environmentContext.getProperties() == null || environmentContext.getProperties().get(StatsSetupConst.DO_NOT_UPDATE_STATS) == null) {
        environmentContext.putToProperties(StatsSetupConst.DO_NOT_UPDATE_STATS, StatsSetupConst.TRUE);
    }
    if (alterTbl.getOp() == AlterTableDesc.AlterTableTypes.RENAME) {
        // The new name is split into its database and table parts.
        tbl.setDbName(Utilities.getDatabaseName(alterTbl.getNewName()));
        tbl.setTableName(Utilities.getTableName(alterTbl.getNewName()));
    } else if (alterTbl.getOp() == AlterTableDesc.AlterTableTypes.ADDCOLS) {
        // NOTE(review): retrieveStorageDescriptor is defined elsewhere; presumably it returns the
        // partition's descriptor when part != null and the table's otherwise -- confirm.
        StorageDescriptor sd = retrieveStorageDescriptor(tbl, part);
        String serializationLib = sd.getSerdeInfo().getSerializationLib();
        AvroSerdeUtils.handleAlterTableForAvro(conf, serializationLib, tbl.getTTable().getParameters());
        List<FieldSchema> oldCols = (part == null ? tbl.getColsForMetastore() : part.getColsForMetastore());
        List<FieldSchema> newCols = alterTbl.getNewCols();
        if (serializationLib.equals("org.apache.hadoop.hive.serde.thrift.columnsetSerDe")) {
            // For columnsetSerDe the new columns replace the old ones and the SerDe is switched.
            console.printInfo("Replacing columns for columnsetSerDe and changing to LazySimpleSerDe");
            sd.getSerdeInfo().setSerializationLib(LazySimpleSerDe.class.getName());
            sd.setCols(newCols);
        } else {
            // Make sure none of the new columns already exists (case-insensitive comparison).
            Iterator<FieldSchema> iterNewCols = newCols.iterator();
            while (iterNewCols.hasNext()) {
                FieldSchema newCol = iterNewCols.next();
                String newColName = newCol.getName();
                Iterator<FieldSchema> iterOldCols = oldCols.iterator();
                while (iterOldCols.hasNext()) {
                    String oldColName = iterOldCols.next().getName();
                    if (oldColName.equalsIgnoreCase(newColName)) {
                        throw new HiveException(ErrorMsg.DUPLICATE_COLUMN_NAMES, newColName);
                    }
                }
                // Appending here means later new columns are also checked against this one.
                oldCols.add(newCol);
            }
            sd.setCols(oldCols);
        }
    } else if (alterTbl.getOp() == AlterTableDesc.AlterTableTypes.RENAMECOLUMN) {
        StorageDescriptor sd = retrieveStorageDescriptor(tbl, part);
        String serializationLib = sd.getSerdeInfo().getSerializationLib();
        AvroSerdeUtils.handleAlterTableForAvro(conf, serializationLib, tbl.getTTable().getParameters());
        List<FieldSchema> oldCols = (part == null ? tbl.getColsForMetastore() : part.getColsForMetastore());
        List<FieldSchema> newCols = new ArrayList<FieldSchema>();
        Iterator<FieldSchema> iterOldCols = oldCols.iterator();
        String oldName = alterTbl.getOldColName();
        String newName = alterTbl.getNewColName();
        String type = alterTbl.getNewColType();
        String comment = alterTbl.getNewColComment();
        boolean first = alterTbl.getFirst();
        String afterCol = alterTbl.getAfterCol();
        // if orc table, restrict reordering columns as it will break schema evolution
        boolean isOrcSchemaEvolution = sd.getInputFormat().equals(OrcInputFormat.class.getName()) && isSchemaEvolutionEnabled(tbl);
        if (isOrcSchemaEvolution && (first || (afterCol != null && !afterCol.trim().isEmpty()))) {
            throw new HiveException(ErrorMsg.CANNOT_REORDER_COLUMNS, alterTbl.getOldName());
        }
        // 'column' holds the renamed column when it has to be moved; 'position' is the index in
        // newCols at which it will be re-inserted (-1 means "do not move").
        FieldSchema column = null;
        boolean found = false;
        int position = -1;
        if (first) {
            position = 0;
        }
        // i is always newCols.size() + 1, i.e. the index just after the column about to be added.
        int i = 1;
        while (iterOldCols.hasNext()) {
            FieldSchema col = iterOldCols.next();
            String oldColName = col.getName();
            if (oldColName.equalsIgnoreCase(newName) && !oldColName.equalsIgnoreCase(oldName)) {
                // A different column already uses the requested new name.
                throw new HiveException(ErrorMsg.DUPLICATE_COLUMN_NAMES, newName);
            } else if (oldColName.equalsIgnoreCase(oldName)) {
                col.setName(newName);
                if (type != null && !type.trim().equals("")) {
                    col.setType(type);
                }
                if (comment != null) {
                    col.setComment(comment);
                }
                found = true;
                if (first || (afterCol != null && !afterCol.trim().equals(""))) {
                    // The column is being moved: hold it back instead of appending it in place.
                    // Because of this 'continue', afterCol naming the renamed column itself is
                    // never matched below and ends up reported as an invalid column.
                    column = col;
                    continue;
                }
            }
            if (afterCol != null && !afterCol.trim().equals("") && oldColName.equalsIgnoreCase(afterCol)) {
                position = i;
            }
            i++;
            newCols.add(col);
        }
        // did not find the column
        if (!found) {
            throw new HiveException(ErrorMsg.INVALID_COLUMN, oldName);
        }
        // after column is not null, but we did not find it.
        if ((afterCol != null && !afterCol.trim().equals("")) && position < 0) {
            throw new HiveException(ErrorMsg.INVALID_COLUMN, afterCol);
        }
        if (position >= 0) {
            newCols.add(position, column);
        }
        sd.setCols(newCols);
    } else if (alterTbl.getOp() == AlterTableDesc.AlterTableTypes.REPLACECOLS) {
        StorageDescriptor sd = retrieveStorageDescriptor(tbl, part);
        // change SerDe to LazySimpleSerDe if it is columnsetSerDe
        String serializationLib = sd.getSerdeInfo().getSerializationLib();
        if (serializationLib.equals("org.apache.hadoop.hive.serde.thrift.columnsetSerDe")) {
            console.printInfo("Replacing columns for columnsetSerDe and changing to LazySimpleSerDe");
            sd.getSerdeInfo().setSerializationLib(LazySimpleSerDe.class.getName());
        } else if (!serializationLib.equals(MetadataTypedColumnsetSerDe.class.getName()) && !serializationLib.equals(LazySimpleSerDe.class.getName()) && !serializationLib.equals(ColumnarSerDe.class.getName()) && !serializationLib.equals(DynamicSerDe.class.getName()) && !serializationLib.equals(ParquetHiveSerDe.class.getName()) && !serializationLib.equals(OrcSerde.class.getName())) {
            // Only the SerDes listed above accept REPLACE COLUMNS.
            throw new HiveException(ErrorMsg.CANNOT_REPLACE_COLUMNS, alterTbl.getOldName());
        }
        final boolean isOrcSchemaEvolution = serializationLib.equals(OrcSerde.class.getName()) && isSchemaEvolutionEnabled(tbl);
        // adding columns and limited integer type promotion is supported for ORC schema evolution
        if (isOrcSchemaEvolution) {
            final List<FieldSchema> existingCols = sd.getCols();
            final List<FieldSchema> replaceCols = alterTbl.getNewCols();
            // Only the column count is compared here: fewer columns than before is rejected.
            if (replaceCols.size() < existingCols.size()) {
                throw new HiveException(ErrorMsg.REPLACE_CANNOT_DROP_COLUMNS, alterTbl.getOldName());
            }
        }
        sd.setCols(alterTbl.getNewCols());
    } else if (alterTbl.getOp() == AlterTableDesc.AlterTableTypes.ADDPROPS) {
        // When the context marks the stats as user generated, clear the "do not update" flag.
        if (StatsSetupConst.USER.equals(environmentContext.getProperties().get(StatsSetupConst.STATS_GENERATED))) {
            environmentContext.getProperties().remove(StatsSetupConst.DO_NOT_UPDATE_STATS);
        }
        if (part != null) {
            part.getTPartition().getParameters().putAll(alterTbl.getProps());
        } else {
            tbl.getTTable().getParameters().putAll(alterTbl.getProps());
        }
    } else if (alterTbl.getOp() == AlterTableDesc.AlterTableTypes.DROPPROPS) {
        Iterator<String> keyItr = alterTbl.getProps().keySet().iterator();
        if (StatsSetupConst.USER.equals(environmentContext.getProperties().get(StatsSetupConst.STATS_GENERATED))) {
            // drop a stats parameter, which triggers recompute stats update automatically
            environmentContext.getProperties().remove(StatsSetupConst.DO_NOT_UPDATE_STATS);
        }
        // Remove each listed key from the partition's parameters, or the table's if no partition.
        while (keyItr.hasNext()) {
            if (part != null) {
                part.getTPartition().getParameters().remove(keyItr.next());
            } else {
                tbl.getTTable().getParameters().remove(keyItr.next());
            }
        }
    } else if (alterTbl.getOp() == AlterTableDesc.AlterTableTypes.ADDSERDEPROPS) {
        StorageDescriptor sd = retrieveStorageDescriptor(tbl, part);
        sd.getSerdeInfo().getParameters().putAll(alterTbl.getProps());
    } else if (alterTbl.getOp() == AlterTableDesc.AlterTableTypes.ADDSERDE) {
        StorageDescriptor sd = retrieveStorageDescriptor(tbl, part);
        String serdeName = alterTbl.getSerdeName();
        String oldSerdeName = sd.getSerdeInfo().getSerializationLib();
        // if orc table, restrict changing the serde as it can break schema evolution
        if (isSchemaEvolutionEnabled(tbl) && oldSerdeName.equalsIgnoreCase(OrcSerde.class.getName()) && !serdeName.equalsIgnoreCase(OrcSerde.class.getName())) {
            throw new HiveException(ErrorMsg.CANNOT_CHANGE_SERDE, OrcSerde.class.getSimpleName(), alterTbl.getOldName());
        }
        sd.getSerdeInfo().setSerializationLib(serdeName);
        if ((alterTbl.getProps() != null) && (alterTbl.getProps().size() > 0)) {
            sd.getSerdeInfo().getParameters().putAll(alterTbl.getProps());
        }
        if (part != null) {
            // TODO: this sets the partition descriptor's columns to the value just read from the
            // same descriptor, which looks like a no-op; confirm whether it can be removed.
            part.getTPartition().getSd().setCols(part.getTPartition().getSd().getCols());
        } else {
            if (Table.shouldStoreFieldsInMetastore(conf, serdeName, tbl.getParameters()) && !Table.hasMetastoreBasedSchema(conf, oldSerdeName)) {
                // The new SerDe keeps its fields in the metastore but the old one did not, so
                // copy the fields reported by the old deserializer onto the table. This may fail
                // if fields from old SerDe are too long to be stored in metastore, but there's
                // nothing we can do.
                try {
                    Deserializer oldSerde = MetaStoreUtils.getDeserializer(conf, tbl.getTTable(), false, oldSerdeName);
                    tbl.setFields(Hive.getFieldsFromDeserializer(tbl.getTableName(), oldSerde));
                } catch (MetaException ex) {
                    throw new HiveException(ex);
                }
            }
        }
    } else if (alterTbl.getOp() == AlterTableDesc.AlterTableTypes.ADDFILEFORMAT) {
        StorageDescriptor sd = retrieveStorageDescriptor(tbl, part);
        // if orc table, restrict changing the file format as it can break schema evolution
        if (isSchemaEvolutionEnabled(tbl) && sd.getInputFormat().equals(OrcInputFormat.class.getName()) && !alterTbl.getInputFormat().equals(OrcInputFormat.class.getName())) {
            throw new HiveException(ErrorMsg.CANNOT_CHANGE_FILEFORMAT, "ORC", alterTbl.getOldName());
        }
        sd.setInputFormat(alterTbl.getInputFormat());
        sd.setOutputFormat(alterTbl.getOutputFormat());
        // The SerDe is only replaced when the descriptor names one.
        if (alterTbl.getSerdeName() != null) {
            sd.getSerdeInfo().setSerializationLib(alterTbl.getSerdeName());
        }
    } else if (alterTbl.getOp() == AlterTableDesc.AlterTableTypes.ADDCLUSTERSORTCOLUMN) {
        StorageDescriptor sd = retrieveStorageDescriptor(tbl, part);
        // validate sort columns and bucket columns
        List<String> columns = Utilities.getColumnNamesFromFieldSchema(tbl.getCols());
        if (!alterTbl.isTurnOffSorting()) {
            Utilities.validateColumnNames(columns, alterTbl.getBucketColumns());
        }
        if (alterTbl.getSortColumns() != null) {
            Utilities.validateColumnNames(columns, Utilities.getColumnNamesFromSortCols(alterTbl.getSortColumns()));
        }
        if (alterTbl.isTurnOffSorting()) {
            // Sorting is turned off; bucketing settings are left untouched.
            sd.setSortCols(new ArrayList<Order>());
        } else if (alterTbl.getNumberBuckets() == -1) {
            // -1 buckets means to turn off bucketing
            sd.setBucketCols(new ArrayList<String>());
            sd.setNumBuckets(-1);
            sd.setSortCols(new ArrayList<Order>());
        } else {
            sd.setBucketCols(alterTbl.getBucketColumns());
            sd.setNumBuckets(alterTbl.getNumberBuckets());
            sd.setSortCols(alterTbl.getSortColumns());
        }
    } else if (alterTbl.getOp() == AlterTableDesc.AlterTableTypes.ALTERLOCATION) {
        StorageDescriptor sd = retrieveStorageDescriptor(tbl, part);
        String newLocation = alterTbl.getNewLocation();
        try {
            // The location must parse as a URI and resolve to an absolute path.
            URI locUri = new URI(newLocation);
            if (!new Path(locUri).isAbsolute()) {
                throw new HiveException(ErrorMsg.BAD_LOCATION_VALUE, newLocation);
            }
            sd.setLocation(newLocation);
        } catch (URISyntaxException e) {
            throw new HiveException(e);
        }
        // A location change clears the "do not update stats" flag set at the top of the method.
        environmentContext.getProperties().remove(StatsSetupConst.DO_NOT_UPDATE_STATS);
    } else if (alterTbl.getOp() == AlterTableDesc.AlterTableTypes.ADDSKEWEDBY) {
        // Validation's been done at compile time. no validation is needed here.
        List<String> skewedColNames = null;
        List<List<String>> skewedValues = null;
        if (alterTbl.isTurnOffSkewed()) {
            // Convert skewed table to non-skewed table.
            skewedColNames = new ArrayList<String>();
            skewedValues = new ArrayList<List<String>>();
        } else {
            skewedColNames = alterTbl.getSkewedColNames();
            skewedValues = alterTbl.getSkewedColValues();
        }
        if (null == tbl.getSkewedInfo()) {
            // Convert non-skewed table to skewed table.
            SkewedInfo skewedInfo = new SkewedInfo();
            skewedInfo.setSkewedColNames(skewedColNames);
            skewedInfo.setSkewedColValues(skewedValues);
            tbl.setSkewedInfo(skewedInfo);
        } else {
            tbl.setSkewedColNames(skewedColNames);
            tbl.setSkewedColValues(skewedValues);
        }
        tbl.setStoredAsSubDirectories(alterTbl.isStoredAsSubDirectories());
    } else if (alterTbl.getOp() == AlterTableDesc.AlterTableTypes.ALTERSKEWEDLOCATION) {
        // process location one-by-one
        Map<List<String>, String> locMaps = alterTbl.getSkewedLocations();
        Set<List<String>> keys = locMaps.keySet();
        for (List<String> key : keys) {
            String newLocation = locMaps.get(key);
            try {
                // Parsing as a URI validates the location; a copy of the key list is stored.
                URI locUri = new URI(newLocation);
                if (part != null) {
                    List<String> slk = new ArrayList<String>(key);
                    part.setSkewedValueLocationMap(slk, locUri.toString());
                } else {
                    List<String> slk = new ArrayList<String>(key);
                    tbl.setSkewedValueLocationMap(slk, locUri.toString());
                }
            } catch (URISyntaxException e) {
                throw new HiveException(e);
            }
        }
        // As with ALTERLOCATION, clear the "do not update stats" flag.
        environmentContext.getProperties().remove(StatsSetupConst.DO_NOT_UPDATE_STATS);
    } else if (alterTbl.getOp() == AlterTableTypes.ALTERBUCKETNUM) {
        // Return early when the bucket count is already the requested value.
        if (part != null) {
            if (part.getBucketCount() == alterTbl.getNumberBuckets()) {
                return 0;
            }
            part.setBucketCount(alterTbl.getNumberBuckets());
        } else {
            if (tbl.getNumBuckets() == alterTbl.getNumberBuckets()) {
                return 0;
            }
            tbl.setNumBuckets(alterTbl.getNumberBuckets());
        }
    } else {
        throw new HiveException(ErrorMsg.UNSUPPORTED_ALTER_TBL_OP, alterTbl.getOp().toString());
    }
    return 0;
}
Also used : HiveException(org.apache.hadoop.hive.ql.metadata.HiveException) LazySimpleSerDe(org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe) FieldSchema(org.apache.hadoop.hive.metastore.api.FieldSchema) StorageDescriptor(org.apache.hadoop.hive.metastore.api.StorageDescriptor) ArrayList(java.util.ArrayList) URISyntaxException(java.net.URISyntaxException) URI(java.net.URI) EnvironmentContext(org.apache.hadoop.hive.metastore.api.EnvironmentContext) SkewedInfo(org.apache.hadoop.hive.metastore.api.SkewedInfo) Iterator(java.util.Iterator) ArrayList(java.util.ArrayList) AbstractList(java.util.AbstractList) List(java.util.List) MetaException(org.apache.hadoop.hive.metastore.api.MetaException) Order(org.apache.hadoop.hive.metastore.api.Order) Path(org.apache.hadoop.fs.Path) DynamicSerDe(org.apache.hadoop.hive.serde2.dynamic_type.DynamicSerDe) MetadataTypedColumnsetSerDe(org.apache.hadoop.hive.serde2.MetadataTypedColumnsetSerDe) OrcSerde(org.apache.hadoop.hive.ql.io.orc.OrcSerde) OrcInputFormat(org.apache.hadoop.hive.ql.io.orc.OrcInputFormat) Deserializer(org.apache.hadoop.hive.serde2.Deserializer)

Example 2 with EnvironmentContext

use of org.apache.hadoop.hive.metastore.api.EnvironmentContext in project hive by apache.

the class DDLTask method touch.

/**
 * Rewrites the metadata of a table, or of one of its partitions, without changing it, so
 * that the pre/post execute hooks are fired.
 *
 * <p>The alter call is issued with {@code DO_NOT_UPDATE_STATS} set to TRUE in its
 * {@link EnvironmentContext}. The touched table or partition is registered as both a read
 * input and a {@code DDL_NO_LOCK} write output of the current work.
 *
 * @param db Hive handle used to look up and alter the table or partition
 * @param touchDesc names the table and, optionally, the partition spec to touch; a
 *        {@code null} partition spec means the table itself is touched
 * @return 0 on success
 * @throws HiveException if the partition named by the spec does not exist, or if the
 *         alter call is rejected (the rejection is attached as the cause)
 */
private int touch(Hive db, AlterTableSimpleDesc touchDesc) throws HiveException {
    Table tbl = db.getTable(touchDesc.getTableName());
    EnvironmentContext environmentContext = new EnvironmentContext();
    environmentContext.putToProperties(StatsSetupConst.DO_NOT_UPDATE_STATS, StatsSetupConst.TRUE);
    if (touchDesc.getPartSpec() == null) {
        try {
            db.alterTable(touchDesc.getTableName(), tbl, environmentContext);
        } catch (InvalidOperationException e) {
            // Keep the underlying failure as the cause, as the partition branch below does.
            throw new HiveException("Unable to update table", e);
        }
        work.getInputs().add(new ReadEntity(tbl));
        addIfAbsentByName(new WriteEntity(tbl, WriteEntity.WriteType.DDL_NO_LOCK));
    } else {
        // The 'false' argument is presumably "do not create if missing" -- a null result is
        // treated as "partition does not exist" below.
        Partition part = db.getPartition(tbl, touchDesc.getPartSpec(), false);
        if (part == null) {
            throw new HiveException("Specified partition does not exist");
        }
        try {
            db.alterPartition(touchDesc.getTableName(), part, environmentContext);
        } catch (InvalidOperationException e) {
            throw new HiveException(e);
        }
        work.getInputs().add(new ReadEntity(part));
        addIfAbsentByName(new WriteEntity(part, WriteEntity.WriteType.DDL_NO_LOCK));
    }
    return 0;
}
Also used : EnvironmentContext(org.apache.hadoop.hive.metastore.api.EnvironmentContext) ReadEntity(org.apache.hadoop.hive.ql.hooks.ReadEntity) Partition(org.apache.hadoop.hive.ql.metadata.Partition) AlterTableExchangePartition(org.apache.hadoop.hive.ql.plan.AlterTableExchangePartition) Table(org.apache.hadoop.hive.ql.metadata.Table) HiveException(org.apache.hadoop.hive.ql.metadata.HiveException) InvalidOperationException(org.apache.hadoop.hive.metastore.api.InvalidOperationException) WriteEntity(org.apache.hadoop.hive.ql.hooks.WriteEntity)

Example 3 with EnvironmentContext

use of org.apache.hadoop.hive.metastore.api.EnvironmentContext in project hive by apache.

the class DDLSemanticAnalyzer method analyzeAlterTableProps.

/**
 * Analyzes {@code ALTER TABLE ... SET/UNSET TBLPROPERTIES} (and the update-statistics
 * variants) and queues a DDL task carrying the resulting {@link AlterTableDesc}.
 *
 * <p>Values supplied for {@code StatsSetupConst.ROW_COUNT} and
 * {@code StatsSetupConst.RAW_DATA_SIZE} must parse as a {@code long}. When at least one of
 * them is present, the descriptor's {@link EnvironmentContext} is marked with
 * {@code STATS_GENERATED = USER}; otherwise the descriptor gets a {@code null} context.
 *
 * @param qualified qualified table name parts, joined via {@code getDotName}
 * @param partSpec partition spec passed through to the descriptor
 * @param ast statement node; its first grandchild holds the property list, and a second
 *        child on an UNSET statement turns on the descriptor's drop-if-exists flag
 * @param expectView passed through to the {@link AlterTableDesc} constructor
 * @param isUnset {@code true} to build a DROPPROPS descriptor, {@code false} for ADDPROPS
 * @throws SemanticException if a stats value is not a valid {@code long}, or if an
 *         update-statistics command names a key other than the two stats keys
 */
private void analyzeAlterTableProps(String[] qualified, HashMap<String, String> partSpec, ASTNode ast, boolean expectView, boolean isUnset) throws SemanticException {
    String tableName = getDotName(qualified);
    HashMap<String, String> mapProp = getProps((ASTNode) (ast.getChild(0)).getChild(0));
    // we need to check if the properties are valid, especially for stats.
    // they might be changed via alter table .. update statistics or
    // alter table .. set tblproperties. If the property is not row_count
    // or raw_data_size, it could not be changed through update statistics
    boolean changeStatsSucceeded = false;
    for (Entry<String, String> entry : mapProp.entrySet()) {
        String key = entry.getKey();
        if (key.equals(StatsSetupConst.ROW_COUNT) || key.equals(StatsSetupConst.RAW_DATA_SIZE)) {
            // Stats values must be numeric; reject anything Long.parseLong cannot read.
            try {
                Long.parseLong(entry.getValue());
                changeStatsSucceeded = true;
            } catch (NumberFormatException e) {
                throw new SemanticException("AlterTable " + key + " failed with value " + entry.getValue(), e);
            }
        } else {
            if (queryState.getCommandType().equals(HiveOperation.ALTERTABLE_UPDATETABLESTATS.getOperationName()) || queryState.getCommandType().equals(HiveOperation.ALTERTABLE_UPDATEPARTSTATS.getOperationName())) {
                throw new SemanticException("AlterTable UpdateStats " + key + " failed because the only valid keys are " + StatsSetupConst.ROW_COUNT + " and " + StatsSetupConst.RAW_DATA_SIZE);
            }
        }
    }
    // Build the context once, after validation, instead of re-creating it per property.
    EnvironmentContext environmentContext = null;
    if (changeStatsSucceeded) {
        environmentContext = new EnvironmentContext();
        environmentContext.putToProperties(StatsSetupConst.STATS_GENERATED, StatsSetupConst.USER);
    }
    AlterTableDesc alterTblDesc = null;
    if (isUnset) {
        alterTblDesc = new AlterTableDesc(AlterTableTypes.DROPPROPS, partSpec, expectView);
        if (ast.getChild(1) != null) {
            alterTblDesc.setDropIfExists(true);
        }
    } else {
        alterTblDesc = new AlterTableDesc(AlterTableTypes.ADDPROPS, partSpec, expectView);
    }
    alterTblDesc.setProps(mapProp);
    alterTblDesc.setEnvironmentContext(environmentContext);
    alterTblDesc.setOldName(tableName);
    addInputsOutputsAlterTable(tableName, partSpec, alterTblDesc);
    rootTasks.add(TaskFactory.get(new DDLWork(getInputs(), getOutputs(), alterTblDesc), conf));
}
Also used : EnvironmentContext(org.apache.hadoop.hive.metastore.api.EnvironmentContext) AlterTableDesc(org.apache.hadoop.hive.ql.plan.AlterTableDesc) DDLWork(org.apache.hadoop.hive.ql.plan.DDLWork) LockException(org.apache.hadoop.hive.ql.lockmgr.LockException) InvocationTargetException(java.lang.reflect.InvocationTargetException) NoSuchObjectException(org.apache.hadoop.hive.metastore.api.NoSuchObjectException) MetaException(org.apache.hadoop.hive.metastore.api.MetaException) URISyntaxException(java.net.URISyntaxException) FileNotFoundException(java.io.FileNotFoundException) HiveException(org.apache.hadoop.hive.ql.metadata.HiveException) InvalidTableException(org.apache.hadoop.hive.ql.metadata.InvalidTableException)

Example 4 with EnvironmentContext

use of org.apache.hadoop.hive.metastore.api.EnvironmentContext in project hive by apache.

the class StatsTask method aggregateStats.

/**
 * Updates the basic statistics stored in the parameters of the target table, or of each of
 * its partitions, and writes them back through the metastore alter calls.
 *
 * <p>Any exception raised while doing so is reported as a warning on the console; it only
 * turns into a failure code when {@code work.isStatsReliable()} is true.
 *
 * @param db Hive handle used to list partitions and to alter partitions
 * @return 0 on success (or on a tolerated failure), 1 when stats must be reliable and the
 *         update failed or was interrupted
 */
private int aggregateStats(Hive db) {
    StatsAggregator statsAggregator = null;
    int ret = 0;
    StatsCollectionContext scc = null;
    // Stays null unless stats were written by this task; passed as-is to the alter calls.
    EnvironmentContext environmentContext = null;
    try {
        // Stats setup:
        final Warehouse wh = new Warehouse(conf);
        if (!getWork().getNoStatsAggregator() && !getWork().isNoScanAnalyzeCommand()) {
            try {
                scc = getContext();
                statsAggregator = createStatsAggregator(scc, conf);
            } catch (HiveException e) {
                // Failing to create the aggregator is fatal only when stats must be reliable;
                // otherwise continue without an aggregator.
                if (HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_STATS_RELIABLE)) {
                    throw e;
                }
                console.printError(ErrorMsg.STATS_SKIPPING_BY_ERROR.getErrorCodedMsg(e.toString()));
            }
        }
        List<Partition> partitions = getPartitionsList(db);
        boolean atomic = HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_STATS_ATOMIC);
        String tableFullName = table.getDbName() + "." + table.getTableName();
        // A null partition list is handled as the non-partitioned table case.
        if (partitions == null) {
            org.apache.hadoop.hive.metastore.api.Table tTable = table.getTTable();
            Map<String, String> parameters = tTable.getParameters();
            // acidTable will not have accurate stats unless it is set through analyze command.
            if (work.getTableSpecs() == null && AcidUtils.isAcidTable(table)) {
                StatsSetupConst.setBasicStatsState(parameters, StatsSetupConst.FALSE);
            } else if (work.getTableSpecs() != null || (work.getLoadTableDesc() != null && work.getLoadTableDesc().getReplace()) || (work.getLoadFileDesc() != null && !work.getLoadFileDesc().getDestinationCreateTable().isEmpty())) {
                StatsSetupConst.setBasicStatsState(parameters, StatsSetupConst.TRUE);
            }
            // non-partitioned tables: in atomic mode, do nothing when no stats exist yet.
            if (!existStats(parameters) && atomic) {
                return 0;
            }
            // The collectable stats for the aggregator need to be cleared.
            // For eg. if a file is being loaded, the old number of rows are not valid
            if (work.isClearAggregatorStats()) {
                // we choose to keep the invalid stats and only change the setting.
                StatsSetupConst.setBasicStatsState(parameters, StatsSetupConst.FALSE);
            }
            // NOTE(review): updateQuickStats is defined elsewhere; presumably it refreshes the
            // file-system derived stats in 'parameters' from the table location -- confirm.
            updateQuickStats(wh, parameters, tTable.getSd());
            if (StatsSetupConst.areBasicStatsUptoDate(parameters)) {
                if (statsAggregator != null) {
                    String prefix = getAggregationPrefix(table, null);
                    updateStats(statsAggregator, parameters, prefix, atomic);
                }
                // write table stats to metastore
                if (!getWork().getNoStatsAggregator()) {
                    // Mark the stats as generated by this task.
                    environmentContext = new EnvironmentContext();
                    environmentContext.putToProperties(StatsSetupConst.STATS_GENERATED, StatsSetupConst.TASK);
                }
            }
            getHive().alterTable(tableFullName, new Table(tTable), environmentContext);
            if (conf.getBoolVar(ConfVars.TEZ_EXEC_SUMMARY)) {
                console.printInfo("Table " + tableFullName + " stats: [" + toString(parameters) + ']');
            }
            LOG.info("Table " + tableFullName + " stats: [" + toString(parameters) + ']');
        } else {
            // Partitioned table:
            // Need to get the old stats of the partition
            // and update the table stats based on the old and new stats.
            List<Partition> updates = new ArrayList<Partition>();
            //Get the file status up-front for all partitions. Beneficial in cases of blob storage systems
            final Map<String, FileStatus[]> fileStatusMap = new ConcurrentHashMap<String, FileStatus[]>();
            int poolSize = conf.getInt(ConfVars.HIVE_MOVE_FILES_THREAD_COUNT.varname, 1);
            // In case thread count is set to 0, use single thread.
            poolSize = Math.max(poolSize, 1);
            final ExecutorService pool = Executors.newFixedThreadPool(poolSize, new ThreadFactoryBuilder().setDaemon(true).setNameFormat("stats-updater-thread-%d").build());
            final List<Future<Void>> futures = Lists.newLinkedList();
            LOG.debug("Getting file stats of all partitions. threadpool size:" + poolSize);
            try {
                for (final Partition partn : partitions) {
                    final String partitionName = partn.getName();
                    final org.apache.hadoop.hive.metastore.api.Partition tPart = partn.getTPartition();
                    Map<String, String> parameters = tPart.getParameters();
                    // In atomic mode, partitions without existing stats get no file-status
                    // lookup and are therefore skipped by the update loop further down.
                    if (!existStats(parameters) && atomic) {
                        continue;
                    }
                    futures.add(pool.submit(new Callable<Void>() {

                        @Override
                        public Void call() throws Exception {
                            // Runs on a pool thread; the result is published via the
                            // concurrent map keyed by partition name.
                            FileStatus[] partfileStatus = wh.getFileStatusesForSD(tPart.getSd());
                            fileStatusMap.put(partitionName, partfileStatus);
                            return null;
                        }
                    }));
                }
                pool.shutdown();
                // Wait for every lookup. A failure inside a task surfaces here as an exception
                // that is not an InterruptedException, so it reaches the outer catch block.
                for (Future<Void> future : futures) {
                    future.get();
                }
            } catch (InterruptedException e) {
                // NOTE(review): the thread's interrupt status is not restored here, and
                // processing continues below with whatever lookups already completed.
                LOG.debug("Cancelling " + futures.size() + " file stats lookup tasks");
                //cancel other futures
                for (Future future : futures) {
                    future.cancel(true);
                }
                // Fail the query if the stats are supposed to be reliable
                if (work.isStatsReliable()) {
                    ret = 1;
                }
            } finally {
                if (pool != null) {
                    pool.shutdownNow();
                }
                LOG.debug("Finished getting file stats of all partitions");
            }
            for (Partition partn : partitions) {
                //
                // get the old partition stats
                //
                org.apache.hadoop.hive.metastore.api.Partition tPart = partn.getTPartition();
                Map<String, String> parameters = tPart.getParameters();
                // Same basic-stats state rules as in the non-partitioned branch above.
                if (work.getTableSpecs() == null && AcidUtils.isAcidTable(table)) {
                    StatsSetupConst.setBasicStatsState(parameters, StatsSetupConst.FALSE);
                } else if (work.getTableSpecs() != null || (work.getLoadTableDesc() != null && work.getLoadTableDesc().getReplace()) || (work.getLoadFileDesc() != null && !work.getLoadFileDesc().getDestinationCreateTable().isEmpty())) {
                    StatsSetupConst.setBasicStatsState(parameters, StatsSetupConst.TRUE);
                }
                //only when the stats exist, it is added to fileStatusMap
                if (!fileStatusMap.containsKey(partn.getName())) {
                    continue;
                }
                // The collectable stats for the aggregator need to be cleared.
                // For eg. if a file is being loaded, the old number of rows are not valid
                if (work.isClearAggregatorStats()) {
                    // we choose to keep the invalid stats and only change the setting.
                    StatsSetupConst.setBasicStatsState(parameters, StatsSetupConst.FALSE);
                }
                updateQuickStats(parameters, fileStatusMap.get(partn.getName()));
                if (StatsSetupConst.areBasicStatsUptoDate(parameters)) {
                    if (statsAggregator != null) {
                        String prefix = getAggregationPrefix(table, partn);
                        updateStats(statsAggregator, parameters, prefix, atomic);
                    }
                    if (!getWork().getNoStatsAggregator()) {
                        // NOTE(review): a single context is shared by the whole batch below,
                        // so once any partition sets it, it applies to all updated partitions.
                        environmentContext = new EnvironmentContext();
                        environmentContext.putToProperties(StatsSetupConst.STATS_GENERATED, StatsSetupConst.TASK);
                    }
                }
                updates.add(new Partition(table, tPart));
                if (conf.getBoolVar(ConfVars.TEZ_EXEC_SUMMARY)) {
                    console.printInfo("Partition " + tableFullName + partn.getSpec() + " stats: [" + toString(parameters) + ']');
                }
                LOG.info("Partition " + tableFullName + partn.getSpec() + " stats: [" + toString(parameters) + ']');
            }
            // Write all updated partitions back in one call.
            if (!updates.isEmpty()) {
                db.alterPartitions(tableFullName, updates, environmentContext);
            }
        }
    } catch (Exception e) {
        console.printInfo("[Warning] could not update stats.", "Failed with exception " + e.getMessage() + "\n" + StringUtils.stringifyException(e));
        // Fail the query if the stats are supposed to be reliable
        if (work.isStatsReliable()) {
            ret = 1;
        }
    } finally {
        if (statsAggregator != null) {
            statsAggregator.closeConnection(scc);
        }
    }
    // The return value of 0 indicates success,
    // anything else indicates failure
    return ret;
}
Also used : Warehouse(org.apache.hadoop.hive.metastore.Warehouse) HiveException(org.apache.hadoop.hive.ql.metadata.HiveException) FileStatus(org.apache.hadoop.fs.FileStatus) ArrayList(java.util.ArrayList) Callable(java.util.concurrent.Callable) EnvironmentContext(org.apache.hadoop.hive.metastore.api.EnvironmentContext) ThreadFactoryBuilder(com.google.common.util.concurrent.ThreadFactoryBuilder) ConcurrentHashMap(java.util.concurrent.ConcurrentHashMap) StatsCollectionContext(org.apache.hadoop.hive.ql.stats.StatsCollectionContext) Partition(org.apache.hadoop.hive.ql.metadata.Partition) Table(org.apache.hadoop.hive.ql.metadata.Table) StatsAggregator(org.apache.hadoop.hive.ql.stats.StatsAggregator) MetaException(org.apache.hadoop.hive.metastore.api.MetaException) HiveException(org.apache.hadoop.hive.ql.metadata.HiveException) ExecutorService(java.util.concurrent.ExecutorService) Future(java.util.concurrent.Future)

Example 5 with EnvironmentContext

use of org.apache.hadoop.hive.metastore.api.EnvironmentContext in project hive by apache.

the class StatsNoJobTask method aggregateStats.

/**
 * Gathers basic statistics for the target table and writes them to the metastore.
 *
 * <p>When no partition list is available (non-partitioned table), every regular file under the
 * table location is inspected in the calling thread: empty files only increase the file count,
 * while non-empty files contribute row count, raw data size and file size if their record reader
 * is a {@link StatsProvidingRecordReader}. If at least one file yielded stats, the table
 * parameters are updated through {@code db.alterTable}. Any failure on this path is reported as
 * a warning on the console and does not change the return value.
 *
 * <p>When a partition list is available, one {@code StatsCollection} task per partition is
 * submitted to {@code threadPool}, the pool is handed to {@code shutdownAndAwaitTermination},
 * and the result of {@code updatePartitions(db)} is returned.
 *
 * @param threadPool executor used to run the per-partition stats collection tasks
 * @param db         metastore handle used to persist the updated statistics
 * @return 0 on success; -1 if an exception reached the outer handler and
 *         {@code work.isStatsReliable()} is set; otherwise whatever {@code updatePartitions}
 *         returned
 */
private int aggregateStats(ExecutorService threadPool, Hive db) {
    int ret = 0;
    try {
        Collection<Partition> partitions = null;
        if (work.getPrunedPartitionList() == null) {
            partitions = getPartitionsList();
        } else {
            partitions = work.getPrunedPartitionList().getPartitions();
        }
        // non-partitioned table
        if (partitions == null) {
            org.apache.hadoop.hive.metastore.api.Table tTable = table.getTTable();
            Map<String, String> parameters = tTable.getParameters();
            try {
                Path dir = new Path(tTable.getSd().getLocation());
                long numRows = 0;
                long rawDataSize = 0;
                long fileSize = 0;
                long numFiles = 0;
                FileSystem fs = dir.getFileSystem(conf);
                FileStatus[] fileList = HiveStatsUtils.getFileStatusRecurse(dir, -1, fs);
                boolean statsAvailable = false;
                for (FileStatus file : fileList) {
                    if (!file.isDir()) {
                        InputFormat<?, ?> inputFormat = ReflectionUtil.newInstance(table.getInputFormatClass(), jc);
                        InputSplit dummySplit = new FileSplit(file.getPath(), 0, 0, new String[] { table.getDataLocation().toString() });
                        if (file.getLen() == 0) {
                            // An empty file has no rows to read; it only counts as a file.
                            numFiles += 1;
                            statsAvailable = true;
                        } else {
                            org.apache.hadoop.mapred.RecordReader<?, ?> recordReader = inputFormat.getRecordReader(dummySplit, jc, Reporter.NULL);
                            try {
                                // Only readers that expose stats contribute; other files are ignored.
                                if (recordReader instanceof StatsProvidingRecordReader) {
                                    StatsProvidingRecordReader statsRR = (StatsProvidingRecordReader) recordReader;
                                    numRows += statsRR.getStats().getRowCount();
                                    rawDataSize += statsRR.getStats().getRawDataSize();
                                    fileSize += file.getLen();
                                    numFiles += 1;
                                    statsAvailable = true;
                                }
                            } finally {
                                // Release the reader even if reading its stats failed.
                                recordReader.close();
                            }
                        }
                    }
                }
                if (statsAvailable) {
                    parameters.put(StatsSetupConst.ROW_COUNT, String.valueOf(numRows));
                    parameters.put(StatsSetupConst.RAW_DATA_SIZE, String.valueOf(rawDataSize));
                    parameters.put(StatsSetupConst.TOTAL_SIZE, String.valueOf(fileSize));
                    parameters.put(StatsSetupConst.NUM_FILES, String.valueOf(numFiles));
                    // Mark the alter call as originating from a stats task.
                    EnvironmentContext environmentContext = new EnvironmentContext();
                    environmentContext.putToProperties(StatsSetupConst.STATS_GENERATED, StatsSetupConst.TASK);
                    db.alterTable(tableFullName, new Table(tTable), environmentContext);
                    String msg = "Table " + tableFullName + " stats: [" + toString(parameters) + ']';
                    LOG.debug(msg);
                    console.printInfo(msg);
                } else {
                    String msg = "Table " + tableFullName + " does not provide stats.";
                    LOG.debug(msg);
                }
            } catch (Exception e) {
                // Best effort: a failure here is only reported, it does not fail the task.
                console.printInfo("[Warning] could not update stats for " + tableFullName + ".", "Failed with exception " + e.getMessage() + "\n" + StringUtils.stringifyException(e));
            }
        } else {
            // Partitioned table
            for (Partition partn : partitions) {
                threadPool.execute(new StatsCollection(partn));
            }
            LOG.debug("Stats collection waiting for threadpool to shutdown..");
            shutdownAndAwaitTermination(threadPool);
            LOG.debug("Stats collection threadpool shutdown successful.");
            ret = updatePartitions(db);
        }
    } catch (Exception e) {
        // Report the failure instead of dropping it silently, so the cause is diagnosable.
        console.printInfo("[Warning] could not update stats.", "Failed with exception " + e.getMessage() + "\n" + StringUtils.stringifyException(e));
        // Fail the query if the stats are supposed to be reliable
        if (work.isStatsReliable()) {
            ret = -1;
        }
    }
    // The return value of 0 indicates success,
    // anything else indicates failure
    return ret;
}
Also used : FileStatus(org.apache.hadoop.fs.FileStatus) StatsProvidingRecordReader(org.apache.hadoop.hive.ql.io.StatsProvidingRecordReader) FileSplit(org.apache.hadoop.mapred.FileSplit) EnvironmentContext(org.apache.hadoop.hive.metastore.api.EnvironmentContext) FileSystem(org.apache.hadoop.fs.FileSystem) InputSplit(org.apache.hadoop.mapred.InputSplit) Path(org.apache.hadoop.fs.Path) Partition(org.apache.hadoop.hive.ql.metadata.Partition) Table(org.apache.hadoop.hive.ql.metadata.Table) InvalidOperationException(org.apache.hadoop.hive.metastore.api.InvalidOperationException) HiveException(org.apache.hadoop.hive.ql.metadata.HiveException)

Aggregations

EnvironmentContext (org.apache.hadoop.hive.metastore.api.EnvironmentContext): 11; ArrayList (java.util.ArrayList): 5; HiveException (org.apache.hadoop.hive.ql.metadata.HiveException): 5; Partition (org.apache.hadoop.hive.ql.metadata.Partition): 5; Path (org.apache.hadoop.fs.Path): 4; MetaException (org.apache.hadoop.hive.metastore.api.MetaException): 4; InvalidOperationException (org.apache.hadoop.hive.metastore.api.InvalidOperationException): 3; Table (org.apache.hadoop.hive.ql.metadata.Table): 3; URISyntaxException (java.net.URISyntaxException): 2; List (java.util.List): 2; FileStatus (org.apache.hadoop.fs.FileStatus): 2; FileSystem (org.apache.hadoop.fs.FileSystem): 2; HiveConf (org.apache.hadoop.hive.conf.HiveConf): 2; FieldSchema (org.apache.hadoop.hive.metastore.api.FieldSchema): 2; SkewedInfo (org.apache.hadoop.hive.metastore.api.SkewedInfo): 2; StorageDescriptor (org.apache.hadoop.hive.metastore.api.StorageDescriptor): 2; AlterTableExchangePartition (org.apache.hadoop.hive.ql.plan.AlterTableExchangePartition): 2; ThreadFactoryBuilder (com.google.common.util.concurrent.ThreadFactoryBuilder): 1; FileNotFoundException (java.io.FileNotFoundException): 1; IOException (java.io.IOException): 1