
Example 1 with BasicStatsWork

use of org.apache.hadoop.hive.ql.plan.BasicStatsWork in project hive by apache.

From class DDLSemanticAnalyzer, the method analyzeAlterTablePartMergeFiles:

private void analyzeAlterTablePartMergeFiles(ASTNode ast, String tableName, HashMap<String, String> partSpec) throws SemanticException {
    AlterTablePartMergeFilesDesc mergeDesc = new AlterTablePartMergeFilesDesc(tableName, partSpec);
    List<Path> inputDir = new ArrayList<Path>();
    Path oldTblPartLoc = null;
    Path newTblPartLoc = null;
    Table tblObj = null;
    ListBucketingCtx lbCtx = null;
    try {
        tblObj = getTable(tableName);
        // TODO: we should probably block all ACID tables here.
        if (AcidUtils.isInsertOnlyTable(tblObj.getParameters())) {
            throw new SemanticException("Merge is not supported for MM tables");
        }
        mergeDesc.setTableDesc(Utilities.getTableDesc(tblObj));
        List<String> bucketCols = null;
        Class<? extends InputFormat> inputFormatClass = null;
        boolean isArchived = false;
        if (tblObj.isPartitioned()) {
            if (partSpec == null) {
                throw new SemanticException("source table " + tableName + " is partitioned but no partition desc found.");
            } else {
                Partition part = getPartition(tblObj, partSpec, false);
                if (part == null) {
                    throw new SemanticException("source table " + tableName + " is partitioned but partition not found.");
                }
                bucketCols = part.getBucketCols();
                inputFormatClass = part.getInputFormatClass();
                isArchived = ArchiveUtils.isArchived(part);
                Path tabPath = tblObj.getPath();
                Path partPath = part.getDataLocation();
                // if the table is in a different dfs than the partition,
                // replace the partition's dfs with the table's dfs.
                newTblPartLoc = new Path(tabPath.toUri().getScheme(), tabPath.toUri().getAuthority(), partPath.toUri().getPath());
                oldTblPartLoc = partPath;
                lbCtx = constructListBucketingCtx(part.getSkewedColNames(), part.getSkewedColValues(), part.getSkewedColValueLocationMaps(), part.isStoredAsSubDirectories(), conf);
            }
        } else {
            inputFormatClass = tblObj.getInputFormatClass();
            bucketCols = tblObj.getBucketCols();
            // input and output are the same
            oldTblPartLoc = tblObj.getPath();
            newTblPartLoc = tblObj.getPath();
            lbCtx = constructListBucketingCtx(tblObj.getSkewedColNames(), tblObj.getSkewedColValues(), tblObj.getSkewedColValueLocationMaps(), tblObj.isStoredAsSubDirectories(), conf);
        }
        // throw a SemanticException for input formats other than RCFile and ORC.
        if (!((inputFormatClass.equals(RCFileInputFormat.class) || (inputFormatClass.equals(OrcInputFormat.class))))) {
            throw new SemanticException(ErrorMsg.CONCATENATE_UNSUPPORTED_FILE_FORMAT.getMsg());
        }
        mergeDesc.setInputFormatClass(inputFormatClass);
        // throw a SemanticException if the table/partition is bucketed
        if (bucketCols != null && bucketCols.size() > 0) {
            throw new SemanticException(ErrorMsg.CONCATENATE_UNSUPPORTED_TABLE_BUCKETED.getMsg());
        }
        // throw a SemanticException if the table/partition is archived
        if (isArchived) {
            throw new SemanticException(ErrorMsg.CONCATENATE_UNSUPPORTED_PARTITION_ARCHIVED.getMsg());
        }
        // non-native and non-managed tables are not supported: MoveTask requires file names in a specific format, violating which can cause data loss
        if (tblObj.isNonNative()) {
            throw new SemanticException(ErrorMsg.CONCATENATE_UNSUPPORTED_TABLE_NON_NATIVE.getMsg());
        }
        if (tblObj.getTableType() != TableType.MANAGED_TABLE) {
            throw new SemanticException(ErrorMsg.CONCATENATE_UNSUPPORTED_TABLE_NOT_MANAGED.getMsg());
        }
        // transactional tables are compacted and no longer need to be bucketed, so merge/concatenation is not safe for them
        boolean isAcid = AcidUtils.isTransactionalTable(tblObj);
        if (isAcid) {
            throw new SemanticException(ErrorMsg.CONCATENATE_UNSUPPORTED_TABLE_TRANSACTIONAL.getMsg());
        }
        inputDir.add(oldTblPartLoc);
        mergeDesc.setInputDir(inputDir);
        mergeDesc.setLbCtx(lbCtx);
        addInputsOutputsAlterTable(tableName, partSpec, AlterTableTypes.MERGEFILES);
        DDLWork ddlWork = new DDLWork(getInputs(), getOutputs(), mergeDesc);
        ddlWork.setNeedLock(true);
        Task<? extends Serializable> mergeTask = TaskFactory.get(ddlWork);
        TableDesc tblDesc = Utilities.getTableDesc(tblObj);
        Path queryTmpdir = ctx.getExternalTmpPath(newTblPartLoc);
        mergeDesc.setOutputDir(queryTmpdir);
        // No need to handle MM tables - unsupported path.
        LoadTableDesc ltd = new LoadTableDesc(queryTmpdir, tblDesc, partSpec == null ? new HashMap<>() : partSpec);
        ltd.setLbCtx(lbCtx);
        Task<MoveWork> moveTsk = TaskFactory.get(new MoveWork(null, null, ltd, null, false));
        mergeTask.addDependentTask(moveTsk);
        if (conf.getBoolVar(HiveConf.ConfVars.HIVESTATSAUTOGATHER)) {
            BasicStatsWork basicStatsWork;
            if (oldTblPartLoc.equals(newTblPartLoc)) {
                // If we're merging to the same location, we can avoid some metastore calls
                TableSpec tableSpec = new TableSpec(db, tableName, partSpec);
                basicStatsWork = new BasicStatsWork(tableSpec);
            } else {
                basicStatsWork = new BasicStatsWork(ltd);
            }
            basicStatsWork.setNoStatsAggregator(true);
            basicStatsWork.setClearAggregatorStats(true);
            StatsWork columnStatsWork = new StatsWork(tblObj, basicStatsWork, conf);
            Task<? extends Serializable> statTask = TaskFactory.get(columnStatsWork);
            moveTsk.addDependentTask(statTask);
        }
        rootTasks.add(mergeTask);
    } catch (Exception e) {
        throw new SemanticException(e);
    }
}
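
The stats wiring at the end of this method can be read as the short sketch below. This is a distilled, hedged reading of the code above rather than a drop-in replacement; every identifier (oldTblPartLoc, newTblPartLoc, db, tableName, partSpec, tblObj, ltd, moveTsk, conf) is assumed to be in scope exactly as in the method.

// Choose the BasicStatsWork source: a TableSpec when merging in place,
// the LoadTableDesc when the merge writes to a new location.
BasicStatsWork basicStatsWork = oldTblPartLoc.equals(newTblPartLoc)
        ? new BasicStatsWork(new TableSpec(db, tableName, partSpec))
        : new BasicStatsWork(ltd);
// Concatenation produces no runtime stats, so skip the aggregator and
// clear any stats a previous run may have left behind.
basicStatsWork.setNoStatsAggregator(true);
basicStatsWork.setClearAggregatorStats(true);
// Wrap it in a StatsWork and schedule it after the MoveTask finishes.
StatsWork statsWork = new StatsWork(tblObj, basicStatsWork, conf);
moveTsk.addDependentTask(TaskFactory.get(statsWork));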

Example 2 with BasicStatsWork

use of org.apache.hadoop.hive.ql.plan.BasicStatsWork in project hive by apache.

From class SparkProcessAnalyzeTable, the method process:

@SuppressWarnings("unchecked")
@Override
public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procContext, Object... nodeOutputs) throws SemanticException {
    GenSparkProcContext context = (GenSparkProcContext) procContext;
    TableScanOperator tableScan = (TableScanOperator) nd;
    ParseContext parseContext = context.parseContext;
    Table table = tableScan.getConf().getTableMetadata();
    @SuppressWarnings("rawtypes") Class<? extends InputFormat> inputFormat = table.getInputFormatClass();
    if (parseContext.getQueryProperties().isAnalyzeCommand()) {
        Preconditions.checkArgument(tableScan.getChildOperators() == null || tableScan.getChildOperators().size() == 0, "AssertionError: expected tableScan.getChildOperators() to be null, " + "or tableScan.getChildOperators().size() to be 0");
        String alias = null;
        for (String a : parseContext.getTopOps().keySet()) {
            if (tableScan == parseContext.getTopOps().get(a)) {
                alias = a;
            }
        }
        Preconditions.checkArgument(alias != null, "AssertionError: expected alias to be not null");
        SparkWork sparkWork = context.currentTask.getWork();
        if (OrcInputFormat.class.isAssignableFrom(inputFormat) || MapredParquetInputFormat.class.isAssignableFrom(inputFormat)) {
            // For ORC & Parquet, all the following statements are the same
            // ANALYZE TABLE T [PARTITION (...)] COMPUTE STATISTICS
            // ANALYZE TABLE T [PARTITION (...)] COMPUTE STATISTICS noscan;
            // There will not be any Spark job above this task
            StatsWork statWork = new StatsWork(table, parseContext.getConf());
            statWork.setFooterScan();
            // If partition is specified, get pruned partition list
            Set<Partition> confirmedParts = GenMapRedUtils.getConfirmedPartitionsForScan(tableScan);
            if (confirmedParts.size() > 0) {
                List<String> partCols = GenMapRedUtils.getPartitionColumns(tableScan);
                PrunedPartitionList partList = new PrunedPartitionList(table, confirmedParts, partCols, false);
                statWork.addInputPartitions(partList.getPartitions());
            }
            Task<StatsWork> snjTask = TaskFactory.get(statWork);
            snjTask.setParentTasks(null);
            context.rootTasks.remove(context.currentTask);
            context.rootTasks.add(snjTask);
            return true;
        } else {
            // ANALYZE TABLE T [PARTITION (...)] COMPUTE STATISTICS;
            // The plan consists of a simple SparkTask followed by a StatsTask.
            // The Spark task is just a simple TableScanOperator
            BasicStatsWork basicStatsWork = new BasicStatsWork(table.getTableSpec());
            basicStatsWork.setNoScanAnalyzeCommand(parseContext.getQueryProperties().isNoScanAnalyzeCommand());
            StatsWork columnStatsWork = new StatsWork(table, basicStatsWork, parseContext.getConf());
            columnStatsWork.collectStatsFromAggregator(tableScan.getConf());
            columnStatsWork.setSourceTask(context.currentTask);
            Task<StatsWork> statsTask = TaskFactory.get(columnStatsWork);
            context.currentTask.addDependentTask(statsTask);
            // The plan consists of a StatsTask only.
            if (parseContext.getQueryProperties().isNoScanAnalyzeCommand()) {
                statsTask.setParentTasks(null);
                context.rootTasks.remove(context.currentTask);
                context.rootTasks.add(statsTask);
            }
            // NOTE: here we should use the new partition predicate pushdown API to get the pruned
            // partition list and pass it to setTaskPlan as the last parameter
            Set<Partition> confirmedPartns = GenMapRedUtils.getConfirmedPartitionsForScan(tableScan);
            PrunedPartitionList partitions = null;
            if (confirmedPartns.size() > 0) {
                List<String> partCols = GenMapRedUtils.getPartitionColumns(tableScan);
                partitions = new PrunedPartitionList(table, confirmedPartns, partCols, false);
            }
            MapWork w = utils.createMapWork(context, tableScan, sparkWork, partitions);
            w.setGatheringStats(true);
            return true;
        }
    }
    return null;
}
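
Stripped of the Spark-specific plumbing, the decision in this method reduces to the hedged sketch below. Identifiers are the ones from the example; conf is shorthand for parseContext.getConf().

if (OrcInputFormat.class.isAssignableFrom(inputFormat)
        || MapredParquetInputFormat.class.isAssignableFrom(inputFormat)) {
    // ORC and Parquet keep basic stats in file footers, so a footer scan
    // is enough and no Spark job is generated above the StatsTask.
    StatsWork footerScanWork = new StatsWork(table, conf);
    footerScanWork.setFooterScan();
    Task<StatsWork> statsTask = TaskFactory.get(footerScanWork);
    statsTask.setParentTasks(null);  // becomes a root task on its own
} else {
    // Other formats need a real scan: the TableScanOperator gathers the
    // stats and a dependent StatsTask persists them afterwards.
    BasicStatsWork basicStatsWork = new BasicStatsWork(table.getTableSpec());
    StatsWork statsWork = new StatsWork(table, basicStatsWork, conf);
    statsWork.collectStatsFromAggregator(tableScan.getConf());
    statsWork.setSourceTask(context.currentTask);
    context.currentTask.addDependentTask(TaskFactory.get(statsWork));
}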

Example 3 with BasicStatsWork

use of org.apache.hadoop.hive.ql.plan.BasicStatsWork in project hive by apache.

From class GenMapRedUtils, the method addStatsTask:

/**
 * Add the StatsTask as a dependent task of the MoveTask
 * because the StatsTask will change the Table/Partition metadata. For atomicity, the
 * metadata should not be changed before the data has actually been moved into place by the MoveTask.
 *
 * @param nd
 *          the FileSinkOperator whose results are taken care of by the MoveTask.
 * @param mvTask
 *          The MoveTask that moves the FileSinkOperator's results.
 * @param currTask
 *          The MapRedTask that the FileSinkOperator belongs to.
 * @param hconf
 *          HiveConf
 */
public static void addStatsTask(FileSinkOperator nd, MoveTask mvTask, Task<? extends Serializable> currTask, HiveConf hconf) {
    MoveWork mvWork = mvTask.getWork();
    BasicStatsWork statsWork = null;
    Table table = null;
    boolean truncate = false;
    if (mvWork.getLoadTableWork() != null) {
        statsWork = new BasicStatsWork(mvWork.getLoadTableWork());
        String tableName = mvWork.getLoadTableWork().getTable().getTableName();
        truncate = mvWork.getLoadTableWork().getReplace();
        try {
            table = Hive.get().getTable(SessionState.get().getCurrentDatabase(), tableName);
        } catch (HiveException e) {
            throw new RuntimeException("unexpected; table should be present already..: " + tableName, e);
        }
    } else if (mvWork.getLoadFileWork() != null) {
        statsWork = new BasicStatsWork(mvWork.getLoadFileWork());
        truncate = true;
        if (mvWork.getLoadFileWork().getCtasCreateTableDesc() != null) {
            try {
                table = mvWork.getLoadFileWork().getCtasCreateTableDesc().toTable(hconf);
            } catch (HiveException e) {
                LOG.debug("can't pre-create table for CTAS", e);
                table = null;
            }
        } else if (mvWork.getLoadFileWork().getCreateViewDesc() != null) {
            // CREATE MATERIALIZED VIEW ...
            try {
                table = mvWork.getLoadFileWork().getCreateViewDesc().toTable(hconf);
            } catch (HiveException e) {
                LOG.debug("can't pre-create table for MV", e);
                table = null;
            }
        } else {
            throw new RuntimeException("unexpected; this should be a CTAS or a CREATE/REBUILD MV - however no desc present");
        }
    }
    assert statsWork != null : "Error when generating StatsTask";
    if (currTask.getWork() instanceof MapredWork) {
        MapredWork mrWork = (MapredWork) currTask.getWork();
        mrWork.getMapWork().setGatheringStats(true);
        if (mrWork.getReduceWork() != null) {
            mrWork.getReduceWork().setGatheringStats(true);
        }
    } else if (currTask.getWork() instanceof SparkWork) {
        SparkWork work = (SparkWork) currTask.getWork();
        for (BaseWork w : work.getAllWork()) {
            w.setGatheringStats(true);
        }
    } else {
        // must be TezWork
        TezWork work = (TezWork) currTask.getWork();
        for (BaseWork w : work.getAllWork()) {
            w.setGatheringStats(true);
        }
    }
    StatsWork columnStatsWork = new StatsWork(table, statsWork, hconf);
    columnStatsWork.collectStatsFromAggregator(nd.getConf());
    columnStatsWork.truncateExisting(truncate);
    columnStatsWork.setSourceTask(currTask);
    Task<? extends Serializable> statsTask = TaskFactory.get(columnStatsWork);
    // subscribe feeds from the MoveTask so that MoveTask can forward the list
    // of dynamic partition list to the StatsTask
    mvTask.addDependentTask(statsTask);
    statsTask.subscribeFeed(mvTask);
}
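
The essential wiring here is a hedged two-step sketch: pick the BasicStatsWork source from whatever the MoveWork loads, then chain the StatsTask behind the MoveTask. The identifiers (mvWork, table, nd, currTask, mvTask, hconf) are assumed as in the method above.

// The constructor depends on what the MoveTask loads: a table load for
// plain inserts, a file load for CTAS / CREATE MATERIALIZED VIEW.
BasicStatsWork statsWork = (mvWork.getLoadTableWork() != null)
        ? new BasicStatsWork(mvWork.getLoadTableWork())
        : new BasicStatsWork(mvWork.getLoadFileWork());
StatsWork columnStatsWork = new StatsWork(table, statsWork, hconf);
columnStatsWork.collectStatsFromAggregator(nd.getConf());
columnStatsWork.setSourceTask(currTask);
Task<? extends Serializable> statsTask = TaskFactory.get(columnStatsWork);
// Stats are published only after the data is in place, and the feed lets
// the MoveTask forward its dynamic partition list to the StatsTask.
mvTask.addDependentTask(statsTask);
statsTask.subscribeFeed(mvTask);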

Example 4 with BasicStatsWork

use of org.apache.hadoop.hive.ql.plan.BasicStatsWork in project hive by apache.

From class GenMRTableScan1, the method process:

/**
 * Table scan encountered.
 * @param nd
 *          the table scan operator encountered
 * @param opProcCtx
 *          context
 */
@Override
public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx opProcCtx, Object... nodeOutputs) throws SemanticException {
    TableScanOperator op = (TableScanOperator) nd;
    GenMRProcContext ctx = (GenMRProcContext) opProcCtx;
    ParseContext parseCtx = ctx.getParseCtx();
    Table table = op.getConf().getTableMetadata();
    Class<? extends InputFormat> inputFormat = table.getInputFormatClass();
    Map<Operator<? extends OperatorDesc>, GenMapRedCtx> mapCurrCtx = ctx.getMapCurrCtx();
    // create a dummy MapReduce task
    MapredWork currWork = GenMapRedUtils.getMapRedWork(parseCtx);
    MapRedTask currTask = (MapRedTask) TaskFactory.get(currWork);
    ctx.setCurrTask(currTask);
    ctx.setCurrTopOp(op);
    for (String alias : parseCtx.getTopOps().keySet()) {
        Operator<? extends OperatorDesc> currOp = parseCtx.getTopOps().get(alias);
        if (currOp == op) {
            String currAliasId = alias;
            ctx.setCurrAliasId(currAliasId);
            mapCurrCtx.put(op, new GenMapRedCtx(currTask, currAliasId));
            if (parseCtx.getQueryProperties().isAnalyzeCommand()) {
                boolean noScan = parseCtx.getQueryProperties().isNoScanAnalyzeCommand();
                if (OrcInputFormat.class.isAssignableFrom(inputFormat) || MapredParquetInputFormat.class.isAssignableFrom(inputFormat)) {
                    // For ORC and Parquet, all the following statements are the same
                    // ANALYZE TABLE T [PARTITION (...)] COMPUTE STATISTICS
                    // ANALYZE TABLE T [PARTITION (...)] COMPUTE STATISTICS noscan;
                    // There will not be any MR or Tez job above this task
                    StatsWork statWork = new StatsWork(table, parseCtx.getConf());
                    statWork.setFooterScan();
                    // If partition is specified, get pruned partition list
                    Set<Partition> confirmedParts = GenMapRedUtils.getConfirmedPartitionsForScan(op);
                    if (confirmedParts.size() > 0) {
                        List<String> partCols = GenMapRedUtils.getPartitionColumns(op);
                        PrunedPartitionList partList = new PrunedPartitionList(table, confirmedParts, partCols, false);
                        statWork.addInputPartitions(partList.getPartitions());
                    }
                    Task<StatsWork> snjTask = TaskFactory.get(statWork);
                    ctx.setCurrTask(snjTask);
                    ctx.setCurrTopOp(null);
                    ctx.getRootTasks().clear();
                    ctx.getRootTasks().add(snjTask);
                } else {
                    // ANALYZE TABLE T [PARTITION (...)] COMPUTE STATISTICS;
                    // The plan consists of a simple MapRedTask followed by a StatsTask.
                    // The MR task is just a simple TableScanOperator
                    BasicStatsWork statsWork = new BasicStatsWork(table.getTableSpec());
                    statsWork.setNoScanAnalyzeCommand(noScan);
                    StatsWork columnStatsWork = new StatsWork(table, statsWork, parseCtx.getConf());
                    columnStatsWork.collectStatsFromAggregator(op.getConf());
                    columnStatsWork.setSourceTask(currTask);
                    Task<StatsWork> columnStatsTask = TaskFactory.get(columnStatsWork);
                    currTask.addDependentTask(columnStatsTask);
                    if (!ctx.getRootTasks().contains(currTask)) {
                        ctx.getRootTasks().add(currTask);
                    }
                    // The plan consists of a StatsTask only.
                    if (noScan) {
                        columnStatsTask.setParentTasks(null);
                        ctx.getRootTasks().remove(currTask);
                        ctx.getRootTasks().add(columnStatsTask);
                    }
                    currWork.getMapWork().setGatheringStats(true);
                    if (currWork.getReduceWork() != null) {
                        currWork.getReduceWork().setGatheringStats(true);
                    }
                    // NOTE: here we should use the new partition predicate pushdown API to get the
                    // pruned partition list and pass it to setTaskPlan as the last parameter
                    Set<Partition> confirmedPartns = GenMapRedUtils.getConfirmedPartitionsForScan(op);
                    if (confirmedPartns.size() > 0) {
                        List<String> partCols = GenMapRedUtils.getPartitionColumns(op);
                        PrunedPartitionList partList = new PrunedPartitionList(table, confirmedPartns, partCols, false);
                        GenMapRedUtils.setTaskPlan(currAliasId, op, currTask, false, ctx, partList);
                    } else {
                        // non-partitioned table
                        GenMapRedUtils.setTaskPlan(currAliasId, op, currTask, false, ctx);
                    }
                }
            }
            return true;
        }
    }
    assert false;
    return null;
}
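
The noscan branch rearranges the plan rather than adding to it. A hedged sketch of just that step, with identifiers as in the method:

// ANALYZE ... COMPUTE STATISTICS noscan: basic stats are taken from file-level
// metadata, so the dummy MapRedTask is dropped from the root tasks and the
// StatsTask itself becomes the root of the plan.
statsWork.setNoScanAnalyzeCommand(true);
columnStatsTask.setParentTasks(null);
ctx.getRootTasks().remove(currTask);
ctx.getRootTasks().add(columnStatsTask);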

Example 5 with BasicStatsWork

use of org.apache.hadoop.hive.ql.plan.BasicStatsWork in project hive by apache.

From class TaskCompiler, the method genTableStats:

private Task<?> genTableStats(ParseContext parseContext, TableScanOperator tableScan, Task currentTask, final HashSet<WriteEntity> outputs) throws HiveException {
    Class<? extends InputFormat> inputFormat = tableScan.getConf().getTableMetadata().getInputFormatClass();
    Table table = tableScan.getConf().getTableMetadata();
    List<Partition> partitions = new ArrayList<>();
    if (table.isPartitioned()) {
        partitions.addAll(parseContext.getPrunedPartitions(tableScan).getPartitions());
        for (Partition partn : partitions) {
            LOG.trace("adding part: " + partn);
            outputs.add(new WriteEntity(partn, WriteEntity.WriteType.DDL_NO_LOCK));
        }
    }
    TableSpec tableSpec = new TableSpec(table, partitions);
    tableScan.getConf().getTableMetadata().setTableSpec(tableSpec);
    if (inputFormat.equals(OrcInputFormat.class)) {
        // For ORC, there is no Tez Job for table stats.
        StatsWork columnStatsWork = new StatsWork(table, parseContext.getConf());
        columnStatsWork.setFooterScan();
        // If partition is specified, get pruned partition list
        if (partitions.size() > 0) {
            columnStatsWork.addInputPartitions(parseContext.getPrunedPartitions(tableScan).getPartitions());
        }
        return TaskFactory.get(columnStatsWork);
    } else {
        BasicStatsWork statsWork = new BasicStatsWork(tableScan.getConf().getTableMetadata().getTableSpec());
        StatsWork columnStatsWork = new StatsWork(table, statsWork, parseContext.getConf());
        columnStatsWork.collectStatsFromAggregator(tableScan.getConf());
        columnStatsWork.setSourceTask(currentTask);
        return TaskFactory.get(columnStatsWork);
    }
}

Aggregations

BasicStatsWork (org.apache.hadoop.hive.ql.plan.BasicStatsWork) 8
StatsWork (org.apache.hadoop.hive.ql.plan.StatsWork) 8
Partition (org.apache.hadoop.hive.ql.metadata.Partition) 7
Table (org.apache.hadoop.hive.ql.metadata.Table) 7
OrcInputFormat (org.apache.hadoop.hive.ql.io.orc.OrcInputFormat) 4
HiveException (org.apache.hadoop.hive.ql.metadata.HiveException) 4
MoveWork (org.apache.hadoop.hive.ql.plan.MoveWork) 4
Path (org.apache.hadoop.fs.Path) 3
TableScanOperator (org.apache.hadoop.hive.ql.exec.TableScanOperator) 3
WriteEntity (org.apache.hadoop.hive.ql.hooks.WriteEntity) 3
MapredParquetInputFormat (org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat) 3
LoadTableDesc (org.apache.hadoop.hive.ql.plan.LoadTableDesc) 3
URISyntaxException (java.net.URISyntaxException) 2
ArrayList (java.util.ArrayList) 2
HashMap (java.util.HashMap) 2
LinkedHashMap (java.util.LinkedHashMap) 2
FieldSchema (org.apache.hadoop.hive.metastore.api.FieldSchema) 2
LockException (org.apache.hadoop.hive.ql.lockmgr.LockException) 2
ParseContext (org.apache.hadoop.hive.ql.parse.ParseContext) 2
PrunedPartitionList (org.apache.hadoop.hive.ql.parse.PrunedPartitionList) 2