
Example 1 with StatsNoJobWork

Use of org.apache.hadoop.hive.ql.plan.StatsNoJobWork in project hive by apache.

The process method of the class SparkProcessAnalyzeTable.

@SuppressWarnings("unchecked")
@Override
public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procContext, Object... nodeOutputs) throws SemanticException {
    GenSparkProcContext context = (GenSparkProcContext) procContext;
    TableScanOperator tableScan = (TableScanOperator) nd;
    ParseContext parseContext = context.parseContext;
    @SuppressWarnings("rawtypes") Class<? extends InputFormat> inputFormat = tableScan.getConf().getTableMetadata().getInputFormatClass();
    if (parseContext.getQueryProperties().isAnalyzeCommand()) {
        Preconditions.checkArgument(tableScan.getChildOperators() == null || tableScan.getChildOperators().size() == 0, "AssertionError: expected tableScan.getChildOperators() to be null, " + "or tableScan.getChildOperators().size() to be 0");
        String alias = null;
        for (String a : parseContext.getTopOps().keySet()) {
            if (tableScan == parseContext.getTopOps().get(a)) {
                alias = a;
            }
        }
        Preconditions.checkArgument(alias != null, "AssertionError: expected alias to be not null");
        SparkWork sparkWork = context.currentTask.getWork();
        boolean partialScan = parseContext.getQueryProperties().isPartialScanAnalyzeCommand();
        boolean noScan = parseContext.getQueryProperties().isNoScanAnalyzeCommand();
        if (inputFormat.equals(OrcInputFormat.class) && (noScan || partialScan)) {
            // ANALYZE TABLE T [PARTITION (...)] COMPUTE STATISTICS partialscan;
            // ANALYZE TABLE T [PARTITION (...)] COMPUTE STATISTICS noscan;
            // There will not be any Spark job above this task
            StatsNoJobWork snjWork = new StatsNoJobWork(tableScan.getConf().getTableMetadata().getTableSpec());
            snjWork.setStatsReliable(parseContext.getConf().getBoolVar(HiveConf.ConfVars.HIVE_STATS_RELIABLE));
            Task<StatsNoJobWork> snjTask = TaskFactory.get(snjWork, parseContext.getConf());
            snjTask.setParentTasks(null);
            context.rootTasks.remove(context.currentTask);
            context.rootTasks.add(snjTask);
            return true;
        } else {
            // ANALYZE TABLE T [PARTITION (...)] COMPUTE STATISTICS;
            // The plan consists of a simple SparkTask followed by a StatsTask.
            // The Spark task is just a simple TableScanOperator
            StatsWork statsWork = new StatsWork(tableScan.getConf().getTableMetadata().getTableSpec());
            statsWork.setAggKey(tableScan.getConf().getStatsAggPrefix());
            statsWork.setStatsTmpDir(tableScan.getConf().getTmpStatsDir());
            statsWork.setSourceTask(context.currentTask);
            statsWork.setStatsReliable(parseContext.getConf().getBoolVar(HiveConf.ConfVars.HIVE_STATS_RELIABLE));
            Task<StatsWork> statsTask = TaskFactory.get(statsWork, parseContext.getConf());
            context.currentTask.addDependentTask(statsTask);
            // The plan consists of a StatsTask only.
            if (parseContext.getQueryProperties().isNoScanAnalyzeCommand()) {
                statsTask.setParentTasks(null);
                statsWork.setNoScanAnalyzeCommand(true);
                context.rootTasks.remove(context.currentTask);
                context.rootTasks.add(statsTask);
            }
            // ANALYZE TABLE T [PARTITION (...)] COMPUTE STATISTICS partialscan;
            if (parseContext.getQueryProperties().isPartialScanAnalyzeCommand()) {
                handlePartialScanCommand(tableScan, parseContext, statsWork, context, statsTask);
            }
            // NOTE: here we should use the new partition predicate pushdown API to get a list of pruned list,
            // and pass it to setTaskPlan as the last parameter
            Set<Partition> confirmedPartns = GenMapRedUtils.getConfirmedPartitionsForScan(tableScan);
            PrunedPartitionList partitions = null;
            if (confirmedPartns.size() > 0) {
                Table source = tableScan.getConf().getTableMetadata();
                List<String> partCols = GenMapRedUtils.getPartitionColumns(tableScan);
                partitions = new PrunedPartitionList(source, confirmedPartns, partCols, false);
            }
            MapWork w = utils.createMapWork(context, tableScan, sparkWork, partitions);
            w.setGatheringStats(true);
            return true;
        }
    }
    return null;
}
Also used: Partition(org.apache.hadoop.hive.ql.metadata.Partition) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) Table(org.apache.hadoop.hive.ql.metadata.Table) SparkWork(org.apache.hadoop.hive.ql.plan.SparkWork) PrunedPartitionList(org.apache.hadoop.hive.ql.parse.PrunedPartitionList) StatsWork(org.apache.hadoop.hive.ql.plan.StatsWork) MapWork(org.apache.hadoop.hive.ql.plan.MapWork) OrcInputFormat(org.apache.hadoop.hive.ql.io.orc.OrcInputFormat) ParseContext(org.apache.hadoop.hive.ql.parse.ParseContext) StatsNoJobWork(org.apache.hadoop.hive.ql.plan.StatsNoJobWork)
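
Examples 1 and 3 finish their no-job branch with the same three steps: the freshly built stats task is detached from any parents and replaces the engine task at the root of the plan (Example 2 clears the root-task list instead). Pulled out as a purely illustrative helper, a minimal sketch might look like the following; the method name and the simplified rootTasks signature are mine, not Hive's, and the three calls inside are exactly the ones used in the examples.

// Illustrative sketch only (types as in the examples above, signature simplified):
// detach the metadata-only stats task from any parents and make it the new root,
// dropping the engine task that is no longer needed.
static void replaceRootWithStatsTask(List<Task<?>> rootTasks,
                                     Task<?> engineTask,
                                     Task<StatsNoJobWork> snjTask) {
    // No job runs above the stats task, so it must not have parents.
    snjTask.setParentTasks(null);
    rootTasks.remove(engineTask);
    rootTasks.add(snjTask);
}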

Example 2 with StatsNoJobWork

Use of org.apache.hadoop.hive.ql.plan.StatsNoJobWork in project hive by apache.

The process method of the class GenMRTableScan1.

/**
   * Table scan encountered.
   * @param nd
   *          the table scan operator encountered
   * @param opProcCtx
   *          context
   */
@Override
public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx opProcCtx, Object... nodeOutputs) throws SemanticException {
    TableScanOperator op = (TableScanOperator) nd;
    GenMRProcContext ctx = (GenMRProcContext) opProcCtx;
    ParseContext parseCtx = ctx.getParseCtx();
    Class<? extends InputFormat> inputFormat = op.getConf().getTableMetadata().getInputFormatClass();
    Map<Operator<? extends OperatorDesc>, GenMapRedCtx> mapCurrCtx = ctx.getMapCurrCtx();
    // create a dummy MapReduce task
    MapredWork currWork = GenMapRedUtils.getMapRedWork(parseCtx);
    MapRedTask currTask = (MapRedTask) TaskFactory.get(currWork, parseCtx.getConf());
    ctx.setCurrTask(currTask);
    ctx.setCurrTopOp(op);
    for (String alias : parseCtx.getTopOps().keySet()) {
        Operator<? extends OperatorDesc> currOp = parseCtx.getTopOps().get(alias);
        if (currOp == op) {
            String currAliasId = alias;
            ctx.setCurrAliasId(currAliasId);
            mapCurrCtx.put(op, new GenMapRedCtx(currTask, currAliasId));
            if (parseCtx.getQueryProperties().isAnalyzeCommand()) {
                boolean partialScan = parseCtx.getQueryProperties().isPartialScanAnalyzeCommand();
                boolean noScan = parseCtx.getQueryProperties().isNoScanAnalyzeCommand();
                if (OrcInputFormat.class.isAssignableFrom(inputFormat) || MapredParquetInputFormat.class.isAssignableFrom(inputFormat)) {
                    // For ORC and Parquet, all the following statements are the same
                    // ANALYZE TABLE T [PARTITION (...)] COMPUTE STATISTICS
                    // ANALYZE TABLE T [PARTITION (...)] COMPUTE STATISTICS partialscan;
                    // ANALYZE TABLE T [PARTITION (...)] COMPUTE STATISTICS noscan;
                    // There will not be any MR or Tez job above this task
                    StatsNoJobWork snjWork = new StatsNoJobWork(op.getConf().getTableMetadata().getTableSpec());
                    snjWork.setStatsReliable(parseCtx.getConf().getBoolVar(HiveConf.ConfVars.HIVE_STATS_RELIABLE));
                    // If partition is specified, get pruned partition list
                    Set<Partition> confirmedParts = GenMapRedUtils.getConfirmedPartitionsForScan(op);
                    if (confirmedParts.size() > 0) {
                        Table source = op.getConf().getTableMetadata();
                        List<String> partCols = GenMapRedUtils.getPartitionColumns(op);
                        PrunedPartitionList partList = new PrunedPartitionList(source, confirmedParts, partCols, false);
                        snjWork.setPrunedPartitionList(partList);
                    }
                    Task<StatsNoJobWork> snjTask = TaskFactory.get(snjWork, parseCtx.getConf());
                    ctx.setCurrTask(snjTask);
                    ctx.setCurrTopOp(null);
                    ctx.getRootTasks().clear();
                    ctx.getRootTasks().add(snjTask);
                } else {
                    // ANALYZE TABLE T [PARTITION (...)] COMPUTE STATISTICS;
                    // The plan consists of a simple MapRedTask followed by a StatsTask.
                    // The MR task is just a simple TableScanOperator
                    StatsWork statsWork = new StatsWork(op.getConf().getTableMetadata().getTableSpec());
                    statsWork.setAggKey(op.getConf().getStatsAggPrefix());
                    statsWork.setStatsTmpDir(op.getConf().getTmpStatsDir());
                    statsWork.setSourceTask(currTask);
                    statsWork.setStatsReliable(parseCtx.getConf().getBoolVar(HiveConf.ConfVars.HIVE_STATS_RELIABLE));
                    Task<StatsWork> statsTask = TaskFactory.get(statsWork, parseCtx.getConf());
                    currTask.addDependentTask(statsTask);
                    if (!ctx.getRootTasks().contains(currTask)) {
                        ctx.getRootTasks().add(currTask);
                    }
                    // The plan consists of a StatsTask only.
                    if (noScan) {
                        statsTask.setParentTasks(null);
                        statsWork.setNoScanAnalyzeCommand(true);
                        ctx.getRootTasks().remove(currTask);
                        ctx.getRootTasks().add(statsTask);
                    }
                    // ANALYZE TABLE T [PARTITION (...)] COMPUTE STATISTICS partialscan;
                    if (partialScan) {
                        handlePartialScanCommand(op, ctx, parseCtx, currTask, statsWork, statsTask);
                    }
                    currWork.getMapWork().setGatheringStats(true);
                    if (currWork.getReduceWork() != null) {
                        currWork.getReduceWork().setGatheringStats(true);
                    }
                    // NOTE: here we should use the new partition predicate pushdown API to get a list of
                    // pruned list,
                    // and pass it to setTaskPlan as the last parameter
                    Set<Partition> confirmedPartns = GenMapRedUtils.getConfirmedPartitionsForScan(op);
                    if (confirmedPartns.size() > 0) {
                        Table source = op.getConf().getTableMetadata();
                        List<String> partCols = GenMapRedUtils.getPartitionColumns(op);
                        PrunedPartitionList partList = new PrunedPartitionList(source, confirmedPartns, partCols, false);
                        GenMapRedUtils.setTaskPlan(currAliasId, op, currTask, false, ctx, partList);
                    } else {
                        // non-partitioned table
                        GenMapRedUtils.setTaskPlan(currAliasId, op, currTask, false, ctx);
                    }
                }
            }
            return true;
        }
    }
    assert false;
    return null;
}
Also used: TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) Operator(org.apache.hadoop.hive.ql.exec.Operator) Partition(org.apache.hadoop.hive.ql.metadata.Partition) Table(org.apache.hadoop.hive.ql.metadata.Table) MapRedTask(org.apache.hadoop.hive.ql.exec.mr.MapRedTask) MapredParquetInputFormat(org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat) PrunedPartitionList(org.apache.hadoop.hive.ql.parse.PrunedPartitionList) MapredWork(org.apache.hadoop.hive.ql.plan.MapredWork) StatsWork(org.apache.hadoop.hive.ql.plan.StatsWork) OrcInputFormat(org.apache.hadoop.hive.ql.io.orc.OrcInputFormat) ParseContext(org.apache.hadoop.hive.ql.parse.ParseContext) GenMapRedCtx(org.apache.hadoop.hive.ql.optimizer.GenMRProcContext.GenMapRedCtx) StatsNoJobWork(org.apache.hadoop.hive.ql.plan.StatsNoJobWork) OperatorDesc(org.apache.hadoop.hive.ql.plan.OperatorDesc)
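
The pruned-partition handling seen here recurs almost verbatim in Examples 1 and 3: the partitions explicitly named in ANALYZE TABLE T PARTITION (...) are collected and wrapped into a PrunedPartitionList. As a hedged extraction of that repeated pattern (buildPrunedPartitionList is a hypothetical name; the calls are exactly those used in the examples), a sketch is:

// Illustrative sketch: turn the partitions named in ANALYZE TABLE T PARTITION (...)
// into a PrunedPartitionList, or return null when no partitions were confirmed
// (unpartitioned table, or no explicit partition spec).
static PrunedPartitionList buildPrunedPartitionList(TableScanOperator op) {
    Set<Partition> confirmedParts = GenMapRedUtils.getConfirmedPartitionsForScan(op);
    if (confirmedParts.isEmpty()) {
        return null;
    }
    Table source = op.getConf().getTableMetadata();
    List<String> partCols = GenMapRedUtils.getPartitionColumns(op);
    return new PrunedPartitionList(source, confirmedParts, partCols, false);
}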

Example 3 with StatsNoJobWork

Use of org.apache.hadoop.hive.ql.plan.StatsNoJobWork in project hive by apache.

The process method of the class ProcessAnalyzeTable.

@SuppressWarnings("unchecked")
@Override
public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procContext, Object... nodeOutputs) throws SemanticException {
    GenTezProcContext context = (GenTezProcContext) procContext;
    TableScanOperator tableScan = (TableScanOperator) nd;
    ParseContext parseContext = context.parseContext;
    Class<? extends InputFormat> inputFormat = tableScan.getConf().getTableMetadata().getInputFormatClass();
    if (parseContext.getQueryProperties().isAnalyzeCommand()) {
        assert tableScan.getChildOperators() == null || tableScan.getChildOperators().size() == 0;
        String alias = null;
        for (String a : parseContext.getTopOps().keySet()) {
            if (tableScan == parseContext.getTopOps().get(a)) {
                alias = a;
            }
        }
        assert alias != null;
        TezWork tezWork = context.currentTask.getWork();
        if (inputFormat.equals(OrcInputFormat.class)) {
            // For ORC, all the following statements are the same
            // ANALYZE TABLE T [PARTITION (...)] COMPUTE STATISTICS
            // ANALYZE TABLE T [PARTITION (...)] COMPUTE STATISTICS partialscan;
            // ANALYZE TABLE T [PARTITION (...)] COMPUTE STATISTICS noscan;
            // There will not be any Tez job above this task
            StatsNoJobWork snjWork = new StatsNoJobWork(tableScan.getConf().getTableMetadata().getTableSpec());
            snjWork.setStatsReliable(parseContext.getConf().getBoolVar(HiveConf.ConfVars.HIVE_STATS_RELIABLE));
            // If partition is specified, get pruned partition list
            Set<Partition> confirmedParts = GenMapRedUtils.getConfirmedPartitionsForScan(tableScan);
            if (confirmedParts.size() > 0) {
                Table source = tableScan.getConf().getTableMetadata();
                List<String> partCols = GenMapRedUtils.getPartitionColumns(tableScan);
                PrunedPartitionList partList = new PrunedPartitionList(source, confirmedParts, partCols, false);
                snjWork.setPrunedPartitionList(partList);
            }
            Task<StatsNoJobWork> snjTask = TaskFactory.get(snjWork, parseContext.getConf());
            snjTask.setParentTasks(null);
            context.rootTasks.remove(context.currentTask);
            context.rootTasks.add(snjTask);
            return true;
        } else {
            // ANALYZE TABLE T [PARTITION (...)] COMPUTE STATISTICS;
            // The plan consists of a simple TezTask followed by a StatsTask.
            // The Tez task is just a simple TableScanOperator
            StatsWork statsWork = new StatsWork(tableScan.getConf().getTableMetadata().getTableSpec());
            statsWork.setAggKey(tableScan.getConf().getStatsAggPrefix());
            statsWork.setStatsTmpDir(tableScan.getConf().getTmpStatsDir());
            statsWork.setSourceTask(context.currentTask);
            statsWork.setStatsReliable(parseContext.getConf().getBoolVar(HiveConf.ConfVars.HIVE_STATS_RELIABLE));
            Task<StatsWork> statsTask = TaskFactory.get(statsWork, parseContext.getConf());
            context.currentTask.addDependentTask(statsTask);
            // The plan consists of a StatsTask only.
            if (parseContext.getQueryProperties().isNoScanAnalyzeCommand()) {
                statsTask.setParentTasks(null);
                statsWork.setNoScanAnalyzeCommand(true);
                context.rootTasks.remove(context.currentTask);
                context.rootTasks.add(statsTask);
            }
            // ANALYZE TABLE T [PARTITION (...)] COMPUTE STATISTICS partialscan;
            if (parseContext.getQueryProperties().isPartialScanAnalyzeCommand()) {
                handlePartialScanCommand(tableScan, parseContext, statsWork, context, statsTask);
            }
            // NOTE: here we should use the new partition predicate pushdown API to
            // get a list of pruned list,
            // and pass it to setTaskPlan as the last parameter
            Set<Partition> confirmedPartns = GenMapRedUtils.getConfirmedPartitionsForScan(tableScan);
            PrunedPartitionList partitions = null;
            if (confirmedPartns.size() > 0) {
                Table source = tableScan.getConf().getTableMetadata();
                List<String> partCols = GenMapRedUtils.getPartitionColumns(tableScan);
                partitions = new PrunedPartitionList(source, confirmedPartns, partCols, false);
            }
            MapWork w = utils.createMapWork(context, tableScan, tezWork, partitions);
            w.setGatheringStats(true);
            return true;
        }
    } else if (parseContext.getAnalyzeRewrite() != null) {
        // we need to collect table stats while collecting column stats.
        try {
            context.currentTask.addDependentTask(genTableStats(context, tableScan));
        } catch (HiveException e) {
            throw new SemanticException(e);
        }
    }
    return null;
}
Also used: Partition(org.apache.hadoop.hive.ql.metadata.Partition) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) Table(org.apache.hadoop.hive.ql.metadata.Table) HiveException(org.apache.hadoop.hive.ql.metadata.HiveException) StatsWork(org.apache.hadoop.hive.ql.plan.StatsWork) MapWork(org.apache.hadoop.hive.ql.plan.MapWork) StatsNoJobWork(org.apache.hadoop.hive.ql.plan.StatsNoJobWork) TezWork(org.apache.hadoop.hive.ql.plan.TezWork)
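
Comparing the three processors, the condition for skipping the execution-engine job differs per engine: Spark (Example 1) takes the StatsNoJobWork path only for ORC with the noscan or partialscan flavor, MapReduce (Example 2) for ORC or Parquet regardless of flavor, and Tez (Example 3) for ORC regardless of flavor. A purely illustrative restatement of those conditions, as read off the examples (the Engine enum and takesNoJobPath are mine, not Hive's):

// Illustrative only: per-engine conditions for taking the StatsNoJobWork path,
// as read off Examples 1-3.
enum Engine { SPARK, MR, TEZ }

static boolean takesNoJobPath(Engine engine, Class<? extends InputFormat> fmt,
                              boolean noScan, boolean partialScan) {
    switch (engine) {
        case SPARK:
            return fmt.equals(OrcInputFormat.class) && (noScan || partialScan);
        case MR:
            return OrcInputFormat.class.isAssignableFrom(fmt)
                || MapredParquetInputFormat.class.isAssignableFrom(fmt);
        case TEZ:
            return fmt.equals(OrcInputFormat.class);
        default:
            return false;
    }
}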

Example 4 with StatsNoJobWork

Use of org.apache.hadoop.hive.ql.plan.StatsNoJobWork in project hive by apache.

The genTableStats method of the class ProcessAnalyzeTable.

private Task<?> genTableStats(GenTezProcContext context, TableScanOperator tableScan) throws HiveException {
    Class<? extends InputFormat> inputFormat = tableScan.getConf().getTableMetadata().getInputFormatClass();
    ParseContext parseContext = context.parseContext;
    Table table = tableScan.getConf().getTableMetadata();
    List<Partition> partitions = new ArrayList<>();
    if (table.isPartitioned()) {
        partitions.addAll(parseContext.getPrunedPartitions(tableScan).getPartitions());
        for (Partition partn : partitions) {
            LOG.debug("XXX: adding part: " + partn);
            context.outputs.add(new WriteEntity(partn, WriteEntity.WriteType.DDL_NO_LOCK));
        }
    }
    TableSpec tableSpec = new TableSpec(table, partitions);
    tableScan.getConf().getTableMetadata().setTableSpec(tableSpec);
    if (inputFormat.equals(OrcInputFormat.class)) {
        // For ORC, there is no Tez Job for table stats.
        StatsNoJobWork snjWork = new StatsNoJobWork(tableScan.getConf().getTableMetadata().getTableSpec());
        snjWork.setStatsReliable(parseContext.getConf().getBoolVar(HiveConf.ConfVars.HIVE_STATS_RELIABLE));
        // If partition is specified, get pruned partition list
        if (partitions.size() > 0) {
            snjWork.setPrunedPartitionList(parseContext.getPrunedPartitions(tableScan));
        }
        return TaskFactory.get(snjWork, parseContext.getConf());
    } else {
        StatsWork statsWork = new StatsWork(tableScan.getConf().getTableMetadata().getTableSpec());
        statsWork.setAggKey(tableScan.getConf().getStatsAggPrefix());
        statsWork.setStatsTmpDir(tableScan.getConf().getTmpStatsDir());
        statsWork.setSourceTask(context.currentTask);
        statsWork.setStatsReliable(parseContext.getConf().getBoolVar(HiveConf.ConfVars.HIVE_STATS_RELIABLE));
        return TaskFactory.get(statsWork, parseContext.getConf());
    }
}
Also used: Partition(org.apache.hadoop.hive.ql.metadata.Partition) TableSpec(org.apache.hadoop.hive.ql.parse.BaseSemanticAnalyzer.TableSpec) Table(org.apache.hadoop.hive.ql.metadata.Table) StatsWork(org.apache.hadoop.hive.ql.plan.StatsWork) ArrayList(java.util.ArrayList) StatsNoJobWork(org.apache.hadoop.hive.ql.plan.StatsNoJobWork) WriteEntity(org.apache.hadoop.hive.ql.hooks.WriteEntity)
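
For a call site of this helper, see the analyze-rewrite branch at the end of Example 3: when column statistics are being computed (parseContext.getAnalyzeRewrite() != null), the returned table-stats task is simply chained behind the current Tez task:

// Usage as in Example 3: gather table stats alongside column stats by adding the
// generated task as a dependent of the current task.
try {
    context.currentTask.addDependentTask(genTableStats(context, tableScan));
} catch (HiveException e) {
    throw new SemanticException(e);
}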

Aggregations

Partition (org.apache.hadoop.hive.ql.metadata.Partition): 4
Table (org.apache.hadoop.hive.ql.metadata.Table): 4
StatsNoJobWork (org.apache.hadoop.hive.ql.plan.StatsNoJobWork): 4
StatsWork (org.apache.hadoop.hive.ql.plan.StatsWork): 4
TableScanOperator (org.apache.hadoop.hive.ql.exec.TableScanOperator): 3
OrcInputFormat (org.apache.hadoop.hive.ql.io.orc.OrcInputFormat): 2
ParseContext (org.apache.hadoop.hive.ql.parse.ParseContext): 2
PrunedPartitionList (org.apache.hadoop.hive.ql.parse.PrunedPartitionList): 2
MapWork (org.apache.hadoop.hive.ql.plan.MapWork): 2
ArrayList (java.util.ArrayList): 1
Operator (org.apache.hadoop.hive.ql.exec.Operator): 1
MapRedTask (org.apache.hadoop.hive.ql.exec.mr.MapRedTask): 1
WriteEntity (org.apache.hadoop.hive.ql.hooks.WriteEntity): 1
MapredParquetInputFormat (org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat): 1
HiveException (org.apache.hadoop.hive.ql.metadata.HiveException): 1
GenMapRedCtx (org.apache.hadoop.hive.ql.optimizer.GenMRProcContext.GenMapRedCtx): 1
TableSpec (org.apache.hadoop.hive.ql.parse.BaseSemanticAnalyzer.TableSpec): 1
MapredWork (org.apache.hadoop.hive.ql.plan.MapredWork): 1
OperatorDesc (org.apache.hadoop.hive.ql.plan.OperatorDesc): 1
SparkWork (org.apache.hadoop.hive.ql.plan.SparkWork): 1