Examples with StatsWork - org.apache.hadoop.hive.ql.plan.StatsWork

Example 1 with StatsWork

use of org.apache.hadoop.hive.ql.plan.StatsWork in project hive by apache.

the class GenMapRedUtils method addStatsTask.

/**
   * Add the StatsTask as a dependent task of the MoveTask
   * because StatsTask will change the Table/Partition metadata. For atomicity, we
   * should not change it before the data is actually there done by MoveTask.
   *
   * @param nd
   *          the FileSinkOperator whose results are taken care of by the MoveTask.
   * @param mvTask
   *          The MoveTask that moves the FileSinkOperator's results.
   * @param currTask
   *          The MapRedTask that the FileSinkOperator belongs to.
   * @param hconf
   *          HiveConf
   */
public static void addStatsTask(FileSinkOperator nd, MoveTask mvTask, Task<? extends Serializable> currTask, HiveConf hconf) {
    MoveWork mvWork = mvTask.getWork();
    StatsWork statsWork = null;
    if (mvWork.getLoadTableWork() != null) {
        statsWork = new StatsWork(mvWork.getLoadTableWork());
    } else if (mvWork.getLoadFileWork() != null) {
        statsWork = new StatsWork(mvWork.getLoadFileWork());
    }
    assert statsWork != null : "Error when generating StatsTask";
    statsWork.setSourceTask(currTask);
    statsWork.setStatsReliable(hconf.getBoolVar(ConfVars.HIVE_STATS_RELIABLE));
    statsWork.setStatsTmpDir(nd.getConf().getStatsTmpDir());
    if (currTask.getWork() instanceof MapredWork) {
        MapredWork mrWork = (MapredWork) currTask.getWork();
        mrWork.getMapWork().setGatheringStats(true);
        if (mrWork.getReduceWork() != null) {
            mrWork.getReduceWork().setGatheringStats(true);
        }
    } else if (currTask.getWork() instanceof SparkWork) {
        SparkWork work = (SparkWork) currTask.getWork();
        for (BaseWork w : work.getAllWork()) {
            w.setGatheringStats(true);
        }
    } else {
        // must be TezWork
        TezWork work = (TezWork) currTask.getWork();
        for (BaseWork w : work.getAllWork()) {
            w.setGatheringStats(true);
        }
    }
    // AggKey in StatsWork is used for stats aggregation while StatsAggPrefix
    // in FileSinkDesc is used for stats publishing. They should be consistent.
    statsWork.setAggKey(nd.getConf().getStatsAggPrefix());
    Task<? extends Serializable> statsTask = TaskFactory.get(statsWork, hconf);
    // subscribe feeds from the MoveTask so that MoveTask can forward the list
    // of dynamic partition list to the StatsTask
    mvTask.addDependentTask(statsTask);
    statsTask.subscribeFeed(mvTask);
}

Also used : MoveWork(org.apache.hadoop.hive.ql.plan.MoveWork) StatsWork(org.apache.hadoop.hive.ql.plan.StatsWork) MapredWork(org.apache.hadoop.hive.ql.plan.MapredWork) SparkWork(org.apache.hadoop.hive.ql.plan.SparkWork) BaseWork(org.apache.hadoop.hive.ql.plan.BaseWork) TezWork(org.apache.hadoop.hive.ql.plan.TezWork)

Example 2 with StatsWork

use of org.apache.hadoop.hive.ql.plan.StatsWork in project hive by apache.

the class DDLSemanticAnalyzer method analyzeAlterTablePartMergeFiles.

private void analyzeAlterTablePartMergeFiles(ASTNode ast, String tableName, HashMap<String, String> partSpec) throws SemanticException {
    AlterTablePartMergeFilesDesc mergeDesc = new AlterTablePartMergeFilesDesc(tableName, partSpec);
    List<Path> inputDir = new ArrayList<Path>();
    Path oldTblPartLoc = null;
    Path newTblPartLoc = null;
    Table tblObj = null;
    ListBucketingCtx lbCtx = null;
    try {
        tblObj = getTable(tableName);
        List<String> bucketCols = null;
        Class<? extends InputFormat> inputFormatClass = null;
        boolean isArchived = false;
        boolean checkIndex = HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVE_CONCATENATE_CHECK_INDEX);
        if (checkIndex) {
            List<Index> indexes = db.getIndexes(tblObj.getDbName(), tblObj.getTableName(), Short.MAX_VALUE);
            if (indexes != null && indexes.size() > 0) {
                throw new SemanticException("can not do merge because source table " + tableName + " is indexed.");
            }
        }
        if (tblObj.isPartitioned()) {
            if (partSpec == null) {
                throw new SemanticException("source table " + tableName + " is partitioned but no partition desc found.");
            } else {
                Partition part = getPartition(tblObj, partSpec, false);
                if (part == null) {
                    throw new SemanticException("source table " + tableName + " is partitioned but partition not found.");
                }
                bucketCols = part.getBucketCols();
                inputFormatClass = part.getInputFormatClass();
                isArchived = ArchiveUtils.isArchived(part);
                Path tabPath = tblObj.getPath();
                Path partPath = part.getDataLocation();
                // if the table is in a different dfs than the partition,
                // replace the partition's dfs with the table's dfs.
                newTblPartLoc = new Path(tabPath.toUri().getScheme(), tabPath.toUri().getAuthority(), partPath.toUri().getPath());
                oldTblPartLoc = partPath;
                lbCtx = constructListBucketingCtx(part.getSkewedColNames(), part.getSkewedColValues(), part.getSkewedColValueLocationMaps(), part.isStoredAsSubDirectories(), conf);
            }
        } else {
            inputFormatClass = tblObj.getInputFormatClass();
            bucketCols = tblObj.getBucketCols();
            // input and output are the same
            oldTblPartLoc = tblObj.getPath();
            newTblPartLoc = tblObj.getPath();
            lbCtx = constructListBucketingCtx(tblObj.getSkewedColNames(), tblObj.getSkewedColValues(), tblObj.getSkewedColValueLocationMaps(), tblObj.isStoredAsSubDirectories(), conf);
        }
        // throw a HiveException for other than rcfile and orcfile.
        if (!((inputFormatClass.equals(RCFileInputFormat.class) || (inputFormatClass.equals(OrcInputFormat.class))))) {
            throw new SemanticException("Only RCFile and ORCFile Formats are supported right now.");
        }
        mergeDesc.setInputFormatClass(inputFormatClass);
        // throw a HiveException if the table/partition is bucketized
        if (bucketCols != null && bucketCols.size() > 0) {
            throw new SemanticException("Merge can not perform on bucketized partition/table.");
        }
        // throw a HiveException if the table/partition is archived
        if (isArchived) {
            throw new SemanticException("Merge can not perform on archived partitions.");
        }
        inputDir.add(oldTblPartLoc);
        mergeDesc.setInputDir(inputDir);
        mergeDesc.setLbCtx(lbCtx);
        addInputsOutputsAlterTable(tableName, partSpec, AlterTableTypes.MERGEFILES);
        DDLWork ddlWork = new DDLWork(getInputs(), getOutputs(), mergeDesc);
        ddlWork.setNeedLock(true);
        Task<? extends Serializable> mergeTask = TaskFactory.get(ddlWork, conf);
        TableDesc tblDesc = Utilities.getTableDesc(tblObj);
        Path queryTmpdir = ctx.getExternalTmpPath(newTblPartLoc);
        mergeDesc.setOutputDir(queryTmpdir);
        LoadTableDesc ltd = new LoadTableDesc(queryTmpdir, tblDesc, partSpec == null ? new HashMap<String, String>() : partSpec);
        ltd.setLbCtx(lbCtx);
        Task<MoveWork> moveTsk = TaskFactory.get(new MoveWork(null, null, ltd, null, false), conf);
        mergeTask.addDependentTask(moveTsk);
        if (conf.getBoolVar(HiveConf.ConfVars.HIVESTATSAUTOGATHER)) {
            StatsWork statDesc;
            if (oldTblPartLoc.equals(newTblPartLoc)) {
                // If we're merging to the same location, we can avoid some metastore calls
                TableSpec tablepart = new TableSpec(db, conf, tableName, partSpec);
                statDesc = new StatsWork(tablepart);
            } else {
                statDesc = new StatsWork(ltd);
            }
            statDesc.setNoStatsAggregator(true);
            statDesc.setClearAggregatorStats(true);
            statDesc.setStatsReliable(conf.getBoolVar(HiveConf.ConfVars.HIVE_STATS_RELIABLE));
            Task<? extends Serializable> statTask = TaskFactory.get(statDesc, conf);
            moveTsk.addDependentTask(statTask);
        }
        rootTasks.add(mergeTask);
    } catch (Exception e) {
        throw new SemanticException(e);
    }
}

Also used : MoveWork(org.apache.hadoop.hive.ql.plan.MoveWork) LinkedHashMap(java.util.LinkedHashMap) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) HiveIndex(org.apache.hadoop.hive.ql.index.HiveIndex) Index(org.apache.hadoop.hive.metastore.api.Index) StatsWork(org.apache.hadoop.hive.ql.plan.StatsWork) ListBucketingCtx(org.apache.hadoop.hive.ql.plan.ListBucketingCtx) Path(org.apache.hadoop.fs.Path) Partition(org.apache.hadoop.hive.ql.metadata.Partition) AlterTableExchangePartition(org.apache.hadoop.hive.ql.plan.AlterTableExchangePartition) Table(org.apache.hadoop.hive.ql.metadata.Table) LockException(org.apache.hadoop.hive.ql.lockmgr.LockException) InvocationTargetException(java.lang.reflect.InvocationTargetException) NoSuchObjectException(org.apache.hadoop.hive.metastore.api.NoSuchObjectException) MetaException(org.apache.hadoop.hive.metastore.api.MetaException) URISyntaxException(java.net.URISyntaxException) FileNotFoundException(java.io.FileNotFoundException) HiveException(org.apache.hadoop.hive.ql.metadata.HiveException) InvalidTableException(org.apache.hadoop.hive.ql.metadata.InvalidTableException) LoadTableDesc(org.apache.hadoop.hive.ql.plan.LoadTableDesc) DDLWork(org.apache.hadoop.hive.ql.plan.DDLWork) OrcInputFormat(org.apache.hadoop.hive.ql.io.orc.OrcInputFormat) DescTableDesc(org.apache.hadoop.hive.ql.plan.DescTableDesc) LoadTableDesc(org.apache.hadoop.hive.ql.plan.LoadTableDesc) AlterTableDesc(org.apache.hadoop.hive.ql.plan.AlterTableDesc) UnlockTableDesc(org.apache.hadoop.hive.ql.plan.UnlockTableDesc) DropTableDesc(org.apache.hadoop.hive.ql.plan.DropTableDesc) ShowCreateTableDesc(org.apache.hadoop.hive.ql.plan.ShowCreateTableDesc) TableDesc(org.apache.hadoop.hive.ql.plan.TableDesc) LockTableDesc(org.apache.hadoop.hive.ql.plan.LockTableDesc) TruncateTableDesc(org.apache.hadoop.hive.ql.plan.TruncateTableDesc)

Example 3 with StatsWork

use of org.apache.hadoop.hive.ql.plan.StatsWork in project hive by apache.

the class SparkProcessAnalyzeTable method process.

@SuppressWarnings("unchecked")
@Override
public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procContext, Object... nodeOutputs) throws SemanticException {
    GenSparkProcContext context = (GenSparkProcContext) procContext;
    TableScanOperator tableScan = (TableScanOperator) nd;
    ParseContext parseContext = context.parseContext;
    @SuppressWarnings("rawtypes") Class<? extends InputFormat> inputFormat = tableScan.getConf().getTableMetadata().getInputFormatClass();
    if (parseContext.getQueryProperties().isAnalyzeCommand()) {
        Preconditions.checkArgument(tableScan.getChildOperators() == null || tableScan.getChildOperators().size() == 0, "AssertionError: expected tableScan.getChildOperators() to be null, " + "or tableScan.getChildOperators().size() to be 0");
        String alias = null;
        for (String a : parseContext.getTopOps().keySet()) {
            if (tableScan == parseContext.getTopOps().get(a)) {
                alias = a;
            }
        }
        Preconditions.checkArgument(alias != null, "AssertionError: expected alias to be not null");
        SparkWork sparkWork = context.currentTask.getWork();
        boolean partialScan = parseContext.getQueryProperties().isPartialScanAnalyzeCommand();
        boolean noScan = parseContext.getQueryProperties().isNoScanAnalyzeCommand();
        if (inputFormat.equals(OrcInputFormat.class) && (noScan || partialScan)) {
            // ANALYZE TABLE T [PARTITION (...)] COMPUTE STATISTICS partialscan;
            // ANALYZE TABLE T [PARTITION (...)] COMPUTE STATISTICS noscan;
            // There will not be any Spark job above this task
            StatsNoJobWork snjWork = new StatsNoJobWork(tableScan.getConf().getTableMetadata().getTableSpec());
            snjWork.setStatsReliable(parseContext.getConf().getBoolVar(HiveConf.ConfVars.HIVE_STATS_RELIABLE));
            Task<StatsNoJobWork> snjTask = TaskFactory.get(snjWork, parseContext.getConf());
            snjTask.setParentTasks(null);
            context.rootTasks.remove(context.currentTask);
            context.rootTasks.add(snjTask);
            return true;
        } else {
            // ANALYZE TABLE T [PARTITION (...)] COMPUTE STATISTICS;
            // The plan consists of a simple SparkTask followed by a StatsTask.
            // The Spark task is just a simple TableScanOperator
            StatsWork statsWork = new StatsWork(tableScan.getConf().getTableMetadata().getTableSpec());
            statsWork.setAggKey(tableScan.getConf().getStatsAggPrefix());
            statsWork.setStatsTmpDir(tableScan.getConf().getTmpStatsDir());
            statsWork.setSourceTask(context.currentTask);
            statsWork.setStatsReliable(parseContext.getConf().getBoolVar(HiveConf.ConfVars.HIVE_STATS_RELIABLE));
            Task<StatsWork> statsTask = TaskFactory.get(statsWork, parseContext.getConf());
            context.currentTask.addDependentTask(statsTask);
            // The plan consists of a StatsTask only.
            if (parseContext.getQueryProperties().isNoScanAnalyzeCommand()) {
                statsTask.setParentTasks(null);
                statsWork.setNoScanAnalyzeCommand(true);
                context.rootTasks.remove(context.currentTask);
                context.rootTasks.add(statsTask);
            }
            // ANALYZE TABLE T [PARTITION (...)] COMPUTE STATISTICS partialscan;
            if (parseContext.getQueryProperties().isPartialScanAnalyzeCommand()) {
                handlePartialScanCommand(tableScan, parseContext, statsWork, context, statsTask);
            }
            // NOTE: here we should use the new partition predicate pushdown API to get a list of pruned list,
            // and pass it to setTaskPlan as the last parameter
            Set<Partition> confirmedPartns = GenMapRedUtils.getConfirmedPartitionsForScan(tableScan);
            PrunedPartitionList partitions = null;
            if (confirmedPartns.size() > 0) {
                Table source = tableScan.getConf().getTableMetadata();
                List<String> partCols = GenMapRedUtils.getPartitionColumns(tableScan);
                partitions = new PrunedPartitionList(source, confirmedPartns, partCols, false);
            }
            MapWork w = utils.createMapWork(context, tableScan, sparkWork, partitions);
            w.setGatheringStats(true);
            return true;
        }
    }
    return null;
}

Also used : Partition(org.apache.hadoop.hive.ql.metadata.Partition) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) Table(org.apache.hadoop.hive.ql.metadata.Table) SparkWork(org.apache.hadoop.hive.ql.plan.SparkWork) PrunedPartitionList(org.apache.hadoop.hive.ql.parse.PrunedPartitionList) StatsWork(org.apache.hadoop.hive.ql.plan.StatsWork) MapWork(org.apache.hadoop.hive.ql.plan.MapWork) OrcInputFormat(org.apache.hadoop.hive.ql.io.orc.OrcInputFormat) ParseContext(org.apache.hadoop.hive.ql.parse.ParseContext) StatsNoJobWork(org.apache.hadoop.hive.ql.plan.StatsNoJobWork)

Example 4 with StatsWork

use of org.apache.hadoop.hive.ql.plan.StatsWork in project hive by apache.

the class GenMRTableScan1 method process.

/**
   * Table Sink encountered.
   * @param nd
   *          the table sink operator encountered
   * @param opProcCtx
   *          context
   */
@Override
public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx opProcCtx, Object... nodeOutputs) throws SemanticException {
    TableScanOperator op = (TableScanOperator) nd;
    GenMRProcContext ctx = (GenMRProcContext) opProcCtx;
    ParseContext parseCtx = ctx.getParseCtx();
    Class<? extends InputFormat> inputFormat = op.getConf().getTableMetadata().getInputFormatClass();
    Map<Operator<? extends OperatorDesc>, GenMapRedCtx> mapCurrCtx = ctx.getMapCurrCtx();
    // create a dummy MapReduce task
    MapredWork currWork = GenMapRedUtils.getMapRedWork(parseCtx);
    MapRedTask currTask = (MapRedTask) TaskFactory.get(currWork, parseCtx.getConf());
    ctx.setCurrTask(currTask);
    ctx.setCurrTopOp(op);
    for (String alias : parseCtx.getTopOps().keySet()) {
        Operator<? extends OperatorDesc> currOp = parseCtx.getTopOps().get(alias);
        if (currOp == op) {
            String currAliasId = alias;
            ctx.setCurrAliasId(currAliasId);
            mapCurrCtx.put(op, new GenMapRedCtx(currTask, currAliasId));
            if (parseCtx.getQueryProperties().isAnalyzeCommand()) {
                boolean partialScan = parseCtx.getQueryProperties().isPartialScanAnalyzeCommand();
                boolean noScan = parseCtx.getQueryProperties().isNoScanAnalyzeCommand();
                if (OrcInputFormat.class.isAssignableFrom(inputFormat) || MapredParquetInputFormat.class.isAssignableFrom(inputFormat)) {
                    // For ORC and Parquet, all the following statements are the same
                    // ANALYZE TABLE T [PARTITION (...)] COMPUTE STATISTICS
                    // ANALYZE TABLE T [PARTITION (...)] COMPUTE STATISTICS partialscan;
                    // ANALYZE TABLE T [PARTITION (...)] COMPUTE STATISTICS noscan;
                    // There will not be any MR or Tez job above this task
                    StatsNoJobWork snjWork = new StatsNoJobWork(op.getConf().getTableMetadata().getTableSpec());
                    snjWork.setStatsReliable(parseCtx.getConf().getBoolVar(HiveConf.ConfVars.HIVE_STATS_RELIABLE));
                    // If partition is specified, get pruned partition list
                    Set<Partition> confirmedParts = GenMapRedUtils.getConfirmedPartitionsForScan(op);
                    if (confirmedParts.size() > 0) {
                        Table source = op.getConf().getTableMetadata();
                        List<String> partCols = GenMapRedUtils.getPartitionColumns(op);
                        PrunedPartitionList partList = new PrunedPartitionList(source, confirmedParts, partCols, false);
                        snjWork.setPrunedPartitionList(partList);
                    }
                    Task<StatsNoJobWork> snjTask = TaskFactory.get(snjWork, parseCtx.getConf());
                    ctx.setCurrTask(snjTask);
                    ctx.setCurrTopOp(null);
                    ctx.getRootTasks().clear();
                    ctx.getRootTasks().add(snjTask);
                } else {
                    // ANALYZE TABLE T [PARTITION (...)] COMPUTE STATISTICS;
                    // The plan consists of a simple MapRedTask followed by a StatsTask.
                    // The MR task is just a simple TableScanOperator
                    StatsWork statsWork = new StatsWork(op.getConf().getTableMetadata().getTableSpec());
                    statsWork.setAggKey(op.getConf().getStatsAggPrefix());
                    statsWork.setStatsTmpDir(op.getConf().getTmpStatsDir());
                    statsWork.setSourceTask(currTask);
                    statsWork.setStatsReliable(parseCtx.getConf().getBoolVar(HiveConf.ConfVars.HIVE_STATS_RELIABLE));
                    Task<StatsWork> statsTask = TaskFactory.get(statsWork, parseCtx.getConf());
                    currTask.addDependentTask(statsTask);
                    if (!ctx.getRootTasks().contains(currTask)) {
                        ctx.getRootTasks().add(currTask);
                    }
                    // The plan consists of a StatsTask only.
                    if (noScan) {
                        statsTask.setParentTasks(null);
                        statsWork.setNoScanAnalyzeCommand(true);
                        ctx.getRootTasks().remove(currTask);
                        ctx.getRootTasks().add(statsTask);
                    }
                    // ANALYZE TABLE T [PARTITION (...)] COMPUTE STATISTICS partialscan;
                    if (partialScan) {
                        handlePartialScanCommand(op, ctx, parseCtx, currTask, statsWork, statsTask);
                    }
                    currWork.getMapWork().setGatheringStats(true);
                    if (currWork.getReduceWork() != null) {
                        currWork.getReduceWork().setGatheringStats(true);
                    }
                    // NOTE: here we should use the new partition predicate pushdown API to get a list of
                    // pruned list,
                    // and pass it to setTaskPlan as the last parameter
                    Set<Partition> confirmedPartns = GenMapRedUtils.getConfirmedPartitionsForScan(op);
                    if (confirmedPartns.size() > 0) {
                        Table source = op.getConf().getTableMetadata();
                        List<String> partCols = GenMapRedUtils.getPartitionColumns(op);
                        PrunedPartitionList partList = new PrunedPartitionList(source, confirmedPartns, partCols, false);
                        GenMapRedUtils.setTaskPlan(currAliasId, op, currTask, false, ctx, partList);
                    } else {
                        // non-partitioned table
                        GenMapRedUtils.setTaskPlan(currAliasId, op, currTask, false, ctx);
                    }
                }
            }
            return true;
        }
    }
    assert false;
    return null;
}

Also used : TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) Operator(org.apache.hadoop.hive.ql.exec.Operator) Partition(org.apache.hadoop.hive.ql.metadata.Partition) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) Table(org.apache.hadoop.hive.ql.metadata.Table) MapRedTask(org.apache.hadoop.hive.ql.exec.mr.MapRedTask) MapredParquetInputFormat(org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat) PrunedPartitionList(org.apache.hadoop.hive.ql.parse.PrunedPartitionList) MapredWork(org.apache.hadoop.hive.ql.plan.MapredWork) StatsWork(org.apache.hadoop.hive.ql.plan.StatsWork) OrcInputFormat(org.apache.hadoop.hive.ql.io.orc.OrcInputFormat) ParseContext(org.apache.hadoop.hive.ql.parse.ParseContext) GenMapRedCtx(org.apache.hadoop.hive.ql.optimizer.GenMRProcContext.GenMapRedCtx) StatsNoJobWork(org.apache.hadoop.hive.ql.plan.StatsNoJobWork) OperatorDesc(org.apache.hadoop.hive.ql.plan.OperatorDesc)

Example 5 with StatsWork

use of org.apache.hadoop.hive.ql.plan.StatsWork in project hive by apache.

the class DDLSemanticAnalyzer method analyzeTruncateTable.

private void analyzeTruncateTable(ASTNode ast) throws SemanticException {
    // TOK_TABLE_PARTITION
    ASTNode root = (ASTNode) ast.getChild(0);
    String tableName = getUnescapedName((ASTNode) root.getChild(0));
    Table table = getTable(tableName, true);
    if (table.getTableType() != TableType.MANAGED_TABLE) {
        throw new SemanticException(ErrorMsg.TRUNCATE_FOR_NON_MANAGED_TABLE.format(tableName));
    }
    if (table.isNonNative()) {
        //TODO
        throw new SemanticException(ErrorMsg.TRUNCATE_FOR_NON_NATIVE_TABLE.format(tableName));
    }
    if (!table.isPartitioned() && root.getChildCount() > 1) {
        throw new SemanticException(ErrorMsg.PARTSPEC_FOR_NON_PARTITIONED_TABLE.format(tableName));
    }
    Map<String, String> partSpec = getPartSpec((ASTNode) root.getChild(1));
    if (partSpec == null) {
        if (!table.isPartitioned()) {
            outputs.add(new WriteEntity(table, WriteEntity.WriteType.DDL_EXCLUSIVE));
        } else {
            for (Partition partition : getPartitions(table, null, false)) {
                outputs.add(new WriteEntity(partition, WriteEntity.WriteType.DDL_EXCLUSIVE));
            }
        }
    } else {
        if (isFullSpec(table, partSpec)) {
            validatePartSpec(table, partSpec, (ASTNode) root.getChild(1), conf, true);
            Partition partition = getPartition(table, partSpec, true);
            outputs.add(new WriteEntity(partition, WriteEntity.WriteType.DDL_EXCLUSIVE));
        } else {
            validatePartSpec(table, partSpec, (ASTNode) root.getChild(1), conf, false);
            for (Partition partition : getPartitions(table, partSpec, false)) {
                outputs.add(new WriteEntity(partition, WriteEntity.WriteType.DDL_EXCLUSIVE));
            }
        }
    }
    TruncateTableDesc truncateTblDesc = new TruncateTableDesc(tableName, partSpec);
    DDLWork ddlWork = new DDLWork(getInputs(), getOutputs(), truncateTblDesc);
    Task<? extends Serializable> truncateTask = TaskFactory.get(ddlWork, conf);
    // Is this a truncate column command
    List<String> columnNames = null;
    if (ast.getChildCount() == 2) {
        try {
            columnNames = getColumnNames((ASTNode) ast.getChild(1));
            // Throw an error if the table is indexed
            List<Index> indexes = db.getIndexes(table.getDbName(), tableName, (short) 1);
            if (indexes != null && indexes.size() > 0) {
                throw new SemanticException(ErrorMsg.TRUNCATE_COLUMN_INDEXED_TABLE.getMsg());
            }
            List<String> bucketCols = null;
            Class<? extends InputFormat> inputFormatClass = null;
            boolean isArchived = false;
            Path newTblPartLoc = null;
            Path oldTblPartLoc = null;
            List<FieldSchema> cols = null;
            ListBucketingCtx lbCtx = null;
            boolean isListBucketed = false;
            List<String> listBucketColNames = null;
            if (table.isPartitioned()) {
                Partition part = db.getPartition(table, partSpec, false);
                Path tabPath = table.getPath();
                Path partPath = part.getDataLocation();
                // if the table is in a different dfs than the partition,
                // replace the partition's dfs with the table's dfs.
                newTblPartLoc = new Path(tabPath.toUri().getScheme(), tabPath.toUri().getAuthority(), partPath.toUri().getPath());
                oldTblPartLoc = partPath;
                cols = part.getCols();
                bucketCols = part.getBucketCols();
                inputFormatClass = part.getInputFormatClass();
                isArchived = ArchiveUtils.isArchived(part);
                lbCtx = constructListBucketingCtx(part.getSkewedColNames(), part.getSkewedColValues(), part.getSkewedColValueLocationMaps(), part.isStoredAsSubDirectories(), conf);
                isListBucketed = part.isStoredAsSubDirectories();
                listBucketColNames = part.getSkewedColNames();
            } else {
                // input and output are the same
                oldTblPartLoc = table.getPath();
                newTblPartLoc = table.getPath();
                cols = table.getCols();
                bucketCols = table.getBucketCols();
                inputFormatClass = table.getInputFormatClass();
                lbCtx = constructListBucketingCtx(table.getSkewedColNames(), table.getSkewedColValues(), table.getSkewedColValueLocationMaps(), table.isStoredAsSubDirectories(), conf);
                isListBucketed = table.isStoredAsSubDirectories();
                listBucketColNames = table.getSkewedColNames();
            }
            // throw a HiveException for non-rcfile.
            if (!inputFormatClass.equals(RCFileInputFormat.class)) {
                throw new SemanticException(ErrorMsg.TRUNCATE_COLUMN_NOT_RC.getMsg());
            }
            // throw a HiveException if the table/partition is archived
            if (isArchived) {
                throw new SemanticException(ErrorMsg.TRUNCATE_COLUMN_ARCHIVED.getMsg());
            }
            Set<Integer> columnIndexes = new HashSet<Integer>();
            for (String columnName : columnNames) {
                boolean found = false;
                for (int columnIndex = 0; columnIndex < cols.size(); columnIndex++) {
                    if (columnName.equalsIgnoreCase(cols.get(columnIndex).getName())) {
                        columnIndexes.add(columnIndex);
                        found = true;
                        break;
                    }
                }
                // Throw an exception if the user is trying to truncate a column which doesn't exist
                if (!found) {
                    throw new SemanticException(ErrorMsg.INVALID_COLUMN.getMsg(columnName));
                }
                // Throw an exception if the table/partition is bucketed on one of the columns
                for (String bucketCol : bucketCols) {
                    if (bucketCol.equalsIgnoreCase(columnName)) {
                        throw new SemanticException(ErrorMsg.TRUNCATE_BUCKETED_COLUMN.getMsg(columnName));
                    }
                }
                if (isListBucketed) {
                    for (String listBucketCol : listBucketColNames) {
                        if (listBucketCol.equalsIgnoreCase(columnName)) {
                            throw new SemanticException(ErrorMsg.TRUNCATE_LIST_BUCKETED_COLUMN.getMsg(columnName));
                        }
                    }
                }
            }
            truncateTblDesc.setColumnIndexes(new ArrayList<Integer>(columnIndexes));
            truncateTblDesc.setInputDir(oldTblPartLoc);
            truncateTblDesc.setLbCtx(lbCtx);
            addInputsOutputsAlterTable(tableName, partSpec, AlterTableTypes.TRUNCATE);
            ddlWork.setNeedLock(true);
            TableDesc tblDesc = Utilities.getTableDesc(table);
            // Write the output to temporary directory and move it to the final location at the end
            // so the operation is atomic.
            Path queryTmpdir = ctx.getExternalTmpPath(newTblPartLoc);
            truncateTblDesc.setOutputDir(queryTmpdir);
            LoadTableDesc ltd = new LoadTableDesc(queryTmpdir, tblDesc, partSpec == null ? new HashMap<String, String>() : partSpec);
            ltd.setLbCtx(lbCtx);
            Task<MoveWork> moveTsk = TaskFactory.get(new MoveWork(null, null, ltd, null, false), conf);
            truncateTask.addDependentTask(moveTsk);
            // Recalculate the HDFS stats if auto gather stats is set
            if (conf.getBoolVar(HiveConf.ConfVars.HIVESTATSAUTOGATHER)) {
                StatsWork statDesc;
                if (oldTblPartLoc.equals(newTblPartLoc)) {
                    // If we're merging to the same location, we can avoid some metastore calls
                    TableSpec tablepart = new TableSpec(this.db, conf, root);
                    statDesc = new StatsWork(tablepart);
                } else {
                    statDesc = new StatsWork(ltd);
                }
                statDesc.setNoStatsAggregator(true);
                statDesc.setClearAggregatorStats(true);
                statDesc.setStatsReliable(conf.getBoolVar(HiveConf.ConfVars.HIVE_STATS_RELIABLE));
                Task<? extends Serializable> statTask = TaskFactory.get(statDesc, conf);
                moveTsk.addDependentTask(statTask);
            }
        } catch (HiveException e) {
            throw new SemanticException(e);
        }
    }
    rootTasks.add(truncateTask);
}

Also used : MoveWork(org.apache.hadoop.hive.ql.plan.MoveWork) HiveException(org.apache.hadoop.hive.ql.metadata.HiveException) LinkedHashMap(java.util.LinkedHashMap) HashMap(java.util.HashMap) FieldSchema(org.apache.hadoop.hive.metastore.api.FieldSchema) HiveIndex(org.apache.hadoop.hive.ql.index.HiveIndex) Index(org.apache.hadoop.hive.metastore.api.Index) StatsWork(org.apache.hadoop.hive.ql.plan.StatsWork) ListBucketingCtx(org.apache.hadoop.hive.ql.plan.ListBucketingCtx) WriteEntity(org.apache.hadoop.hive.ql.hooks.WriteEntity) HashSet(java.util.HashSet) Path(org.apache.hadoop.fs.Path) Partition(org.apache.hadoop.hive.ql.metadata.Partition) AlterTableExchangePartition(org.apache.hadoop.hive.ql.plan.AlterTableExchangePartition) Table(org.apache.hadoop.hive.ql.metadata.Table) TruncateTableDesc(org.apache.hadoop.hive.ql.plan.TruncateTableDesc) LoadTableDesc(org.apache.hadoop.hive.ql.plan.LoadTableDesc) DDLWork(org.apache.hadoop.hive.ql.plan.DDLWork) RCFileInputFormat(org.apache.hadoop.hive.ql.io.RCFileInputFormat) DescTableDesc(org.apache.hadoop.hive.ql.plan.DescTableDesc) LoadTableDesc(org.apache.hadoop.hive.ql.plan.LoadTableDesc) AlterTableDesc(org.apache.hadoop.hive.ql.plan.AlterTableDesc) UnlockTableDesc(org.apache.hadoop.hive.ql.plan.UnlockTableDesc) DropTableDesc(org.apache.hadoop.hive.ql.plan.DropTableDesc) ShowCreateTableDesc(org.apache.hadoop.hive.ql.plan.ShowCreateTableDesc) TableDesc(org.apache.hadoop.hive.ql.plan.TableDesc) LockTableDesc(org.apache.hadoop.hive.ql.plan.LockTableDesc) TruncateTableDesc(org.apache.hadoop.hive.ql.plan.TruncateTableDesc)

Aggregations

StatsWork (org.apache.hadoop.hive.ql.plan.StatsWork)8 Partition (org.apache.hadoop.hive.ql.metadata.Partition)7 Table (org.apache.hadoop.hive.ql.metadata.Table)6 HiveException (org.apache.hadoop.hive.ql.metadata.HiveException)4 MoveWork (org.apache.hadoop.hive.ql.plan.MoveWork)4 StatsNoJobWork (org.apache.hadoop.hive.ql.plan.StatsNoJobWork)4 Path (org.apache.hadoop.fs.Path)3 TableScanOperator (org.apache.hadoop.hive.ql.exec.TableScanOperator)3 WriteEntity (org.apache.hadoop.hive.ql.hooks.WriteEntity)3 OrcInputFormat (org.apache.hadoop.hive.ql.io.orc.OrcInputFormat)3 LoadTableDesc (org.apache.hadoop.hive.ql.plan.LoadTableDesc)3 URISyntaxException (java.net.URISyntaxException)2 ArrayList (java.util.ArrayList)2 HashMap (java.util.HashMap)2 LinkedHashMap (java.util.LinkedHashMap)2 FieldSchema (org.apache.hadoop.hive.metastore.api.FieldSchema)2 Index (org.apache.hadoop.hive.metastore.api.Index)2 HiveIndex (org.apache.hadoop.hive.ql.index.HiveIndex)2 ParseContext (org.apache.hadoop.hive.ql.parse.ParseContext)2 PrunedPartitionList (org.apache.hadoop.hive.ql.parse.PrunedPartitionList)2