Use of org.apache.hadoop.hive.ql.plan.MapWork in project hive by apache.
The class ProcessAnalyzeTable, method process.
@SuppressWarnings("unchecked")
@Override
public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procContext, Object... nodeOutputs) throws SemanticException {
  GenTezProcContext context = (GenTezProcContext) procContext;
  TableScanOperator tableScan = (TableScanOperator) nd;
  ParseContext parseContext = context.parseContext;
  Table table = tableScan.getConf().getTableMetadata();
  Class<? extends InputFormat> inputFormat = table.getInputFormatClass();
  if (parseContext.getQueryProperties().isAnalyzeCommand()) {
    assert tableScan.getChildOperators() == null || tableScan.getChildOperators().size() == 0;
    String alias = null;
    for (String a : parseContext.getTopOps().keySet()) {
      if (tableScan == parseContext.getTopOps().get(a)) {
        alias = a;
      }
    }
    assert alias != null;
    TezWork tezWork = context.currentTask.getWork();
    if (OrcInputFormat.class.isAssignableFrom(inputFormat) || MapredParquetInputFormat.class.isAssignableFrom(inputFormat)) {
      // For ORC & Parquet, all the following statements are the same
      // ANALYZE TABLE T [PARTITION (...)] COMPUTE STATISTICS
      // ANALYZE TABLE T [PARTITION (...)] COMPUTE STATISTICS noscan;
      // There will not be any Tez job above this task
      StatsWork statWork = new StatsWork(table, parseContext.getConf());
      statWork.setFooterScan();
      // If partition is specified, get pruned partition list
      Set<Partition> confirmedParts = GenMapRedUtils.getConfirmedPartitionsForScan(tableScan);
      if (confirmedParts.size() > 0) {
        List<String> partCols = GenMapRedUtils.getPartitionColumns(tableScan);
        PrunedPartitionList partList = new PrunedPartitionList(table, confirmedParts, partCols, false);
        statWork.addInputPartitions(partList.getPartitions());
      }
      Task<StatsWork> snjTask = TaskFactory.get(statWork);
      snjTask.setParentTasks(null);
      context.rootTasks.remove(context.currentTask);
      context.rootTasks.add(snjTask);
      return true;
    } else {
      // ANALYZE TABLE T [PARTITION (...)] COMPUTE STATISTICS;
      // The plan consists of a simple TezTask followed by a StatsTask.
      // The Tez task is just a simple TableScanOperator
      BasicStatsWork basicStatsWork = new BasicStatsWork(table.getTableSpec());
      basicStatsWork.setNoScanAnalyzeCommand(parseContext.getQueryProperties().isNoScanAnalyzeCommand());
      StatsWork columnStatsWork = new StatsWork(table, basicStatsWork, parseContext.getConf());
      columnStatsWork.collectStatsFromAggregator(tableScan.getConf());
      columnStatsWork.setSourceTask(context.currentTask);
      Task<StatsWork> statsTask = TaskFactory.get(columnStatsWork);
      context.currentTask.addDependentTask(statsTask);
      // For a noscan command the plan consists of a StatsTask only.
      if (parseContext.getQueryProperties().isNoScanAnalyzeCommand()) {
        statsTask.setParentTasks(null);
        context.rootTasks.remove(context.currentTask);
        context.rootTasks.add(statsTask);
      }
      // NOTE: here we should use the new partition predicate pushdown API to
      // get the list of pruned partitions,
      // and pass it to setTaskPlan as the last parameter
      Set<Partition> confirmedPartns = GenMapRedUtils.getConfirmedPartitionsForScan(tableScan);
      PrunedPartitionList partitions = null;
      if (confirmedPartns.size() > 0) {
        List<String> partCols = GenMapRedUtils.getPartitionColumns(tableScan);
        partitions = new PrunedPartitionList(table, confirmedPartns, partCols, false);
      }
      MapWork w = utils.createMapWork(context, tableScan, tezWork, partitions);
      w.setGatheringStats(true);
      return true;
    }
  }
  return null;
}
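To summarize the two branches above, here is a rough sketch of the task graphs this processor leaves behind (added here for orientation; it is not taken from the Hive source):

// ORC/Parquet (footer scan), with or without noscan:
//   rootTasks: [ StatsTask(statWork with footer scan) ]   // the original TezTask is removed
//
// Any other input format:
//   rootTasks: [ TezTask(MapWork w: a single TableScanOperator, gathering stats) ]
//                  +-- StatsTask(columnStatsWork wrapping basicStatsWork)
//   noscan variant: the StatsTask is promoted to a root task and the TezTask is dropped.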
Use of org.apache.hadoop.hive.ql.plan.MapWork in project hive by apache.
The class GenSparkUtils, method processPartitionPruningSink.
/**
 * Populate partition pruning information from the pruning sink operator to the
 * target MapWork (the MapWork for the big table side). The information includes the source table
 * name, column name, and partition key expression. It also sets up the temporary path used to
 * communicate between the target MapWork and source BaseWork.
 *
 * Here "source" refers to the small table side, while "target" refers to the big
 * table side.
 *
 * @param context the Spark context.
 * @param pruningSink the pruning sink operator being processed.
 */
public void processPartitionPruningSink(GenSparkProcContext context, SparkPartitionPruningSinkOperator pruningSink) {
  SparkPartitionPruningSinkDesc desc = pruningSink.getConf();
  final Path outputBase = getDPPOutputPath(context.parseContext.getContext());
  final String sourceId = pruningSink.getUniqueId();
  desc.setPath(new Path(outputBase, sourceId));
  for (SparkPartitionPruningSinkDesc.DPPTargetInfo targetInfo : desc.getTargetInfos()) {
    TableScanOperator ts = targetInfo.tableScan;
    MapWork targetWork = (MapWork) context.rootToWorkMap.get(ts);
    Preconditions.checkNotNull(targetWork, "No targetWork found for tablescan " + ts);
    // set up temporary path to communicate between the small/big table
    if (targetWork.getTmpPathForPartitionPruning() == null) {
      targetWork.setTmpPathForPartitionPruning(outputBase);
      LOG.info("Setting tmp path between source work and target work:\n" + outputBase);
    }
    targetInfo.work = targetWork;
    targetInfo.columnName = SparkUtilities.getWorkId(targetWork) + ":" + targetInfo.columnName;
    pruningSink.addAsSourceEvent(targetWork, targetInfo.partKey, targetInfo.columnName, targetInfo.columnType);
  }
}
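The path wiring above is easier to follow as a sketch of the resulting layout (illustrative only; the concrete directory names come from getDPPOutputPath and getUniqueId and are not spelled out in the snippet):

// outputBase/                      <- shared tmp path, set on every target MapWork
//   <pruningSink uniqueId>/        <- desc.getPath(), where the small-table side writes
//                                     the pruning values for its targets
// Each target column is registered as "<workId>:<columnName>", so several target
// MapWorks (and several partition columns per MapWork) can share one base directory.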
Use of org.apache.hadoop.hive.ql.plan.MapWork in project hive by apache.
The class GenSparkUtils, method createMapWork.
public MapWork createMapWork(GenSparkProcContext context, Operator<?> root, SparkWork sparkWork, PrunedPartitionList partitions, boolean deferSetup) throws SemanticException {
  Preconditions.checkArgument(root.getParentOperators().isEmpty(), "AssertionError: expected root.getParentOperators() to be empty");
  MapWork mapWork = new MapWork("Map " + (++sequenceNumber));
  LOG.debug("Adding map work (" + mapWork.getName() + ") for " + root);
  // map work starts with table scan operators
  Preconditions.checkArgument(root instanceof TableScanOperator, "AssertionError: expected root to be an instance of TableScanOperator, but was " + root.getClass().getName());
  String alias_id = null;
  if (context.parseContext != null && context.parseContext.getTopOps() != null) {
    for (String currentAliasID : context.parseContext.getTopOps().keySet()) {
      Operator<? extends OperatorDesc> currOp = context.parseContext.getTopOps().get(currentAliasID);
      if (currOp == root) {
        alias_id = currentAliasID;
        break;
      }
    }
  }
  if (alias_id == null)
    alias_id = ((TableScanOperator) root).getConf().getAlias();
  if (!deferSetup) {
    setupMapWork(mapWork, context, partitions, (TableScanOperator) root, alias_id);
  }
  // add new item to the Spark work
  sparkWork.add(mapWork);
  return mapWork;
}
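A hedged usage sketch (the variables below are assumed to be in scope inside a GenSpark processor; they are not part of the snippet): most callers create the MapWork and let the table-scan setup run immediately by passing deferSetup = false.

// `context` is the GenSparkProcContext, `sparkWork` the work graph being built,
// `tableScan` the root TableScanOperator, and `partitions` a possibly-null
// PrunedPartitionList; deferSetup = false triggers setupMapWork right away.
MapWork mapWork = createMapWork(context, tableScan, sparkWork, partitions, false);

The Tez analyze path in the first snippet does the analogous thing through its own utils.createMapWork call and then flags the resulting MapWork for stats gathering.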
Use of org.apache.hadoop.hive.ql.plan.MapWork in project hive by apache.
The class TestInputOutputFormat, method createMockExecutionEnvironment.
/**
 * Create a mock execution environment that has enough detail that
 * ORC, vectorization, HiveInputFormat, and CombineHiveInputFormat don't
 * explode.
 * @param workDir a local filesystem work directory
 * @param warehouseDir a mock filesystem warehouse directory
 * @param tableName the table name
 * @param objectInspector object inspector for the row
 * @param isVectorized should run vectorized
 * @param partitions the number of partitions to create
 * @return a JobConf that contains the necessary information
 * @throws IOException
 * @throws HiveException
 */
JobConf createMockExecutionEnvironment(Path workDir, Path warehouseDir, String tableName, ObjectInspector objectInspector, boolean isVectorized, int partitions) throws IOException, HiveException {
  JobConf conf = new JobConf();
  Utilities.clearWorkMap(conf);
  conf.set("hive.exec.plan", workDir.toString());
  conf.set("mapred.job.tracker", "local");
  String isVectorizedString = Boolean.toString(isVectorized);
  conf.set("hive.vectorized.execution.enabled", isVectorizedString);
  conf.set(Utilities.VECTOR_MODE, isVectorizedString);
  conf.set(Utilities.USE_VECTORIZED_INPUT_FILE_FORMAT, isVectorizedString);
  conf.set("fs.mock.impl", MockFileSystem.class.getName());
  conf.set("mapred.mapper.class", ExecMapper.class.getName());
  Path root = new Path(warehouseDir, tableName);
  // clean out previous contents
  ((MockFileSystem) root.getFileSystem(conf)).clear();
  // build partition strings
  String[] partPath = new String[partitions];
  StringBuilder buffer = new StringBuilder();
  for (int p = 0; p < partitions; ++p) {
    partPath[p] = new Path(root, "p=" + p).toString();
    if (p != 0) {
      buffer.append(',');
    }
    buffer.append(partPath[p]);
  }
  conf.set("mapred.input.dir", buffer.toString());
  StringBuilder columnIds = new StringBuilder();
  StringBuilder columnNames = new StringBuilder();
  StringBuilder columnTypes = new StringBuilder();
  StructObjectInspector structOI = (StructObjectInspector) objectInspector;
  List<? extends StructField> fields = structOI.getAllStructFieldRefs();
  int numCols = fields.size();
  for (int i = 0; i < numCols; ++i) {
    if (i != 0) {
      columnIds.append(',');
      columnNames.append(',');
      columnTypes.append(',');
    }
    columnIds.append(i);
    columnNames.append(fields.get(i).getFieldName());
    columnTypes.append(fields.get(i).getFieldObjectInspector().getTypeName());
  }
  conf.set("hive.io.file.readcolumn.ids", columnIds.toString());
  conf.set("partition_columns", "p");
  conf.set(serdeConstants.LIST_COLUMNS, columnNames.toString());
  conf.set(serdeConstants.LIST_COLUMN_TYPES, columnTypes.toString());
  MockFileSystem fs = (MockFileSystem) warehouseDir.getFileSystem(conf);
  fs.clear();
  Properties tblProps = new Properties();
  tblProps.put("name", tableName);
  tblProps.put("serialization.lib", OrcSerde.class.getName());
  tblProps.put("columns", columnNames.toString());
  tblProps.put("columns.types", columnTypes.toString());
  TableDesc tbl = new TableDesc(OrcInputFormat.class, OrcOutputFormat.class, tblProps);
  MapWork mapWork = new MapWork();
  mapWork.setVectorMode(isVectorized);
  if (isVectorized) {
    VectorizedRowBatchCtx vectorizedRowBatchCtx = new VectorizedRowBatchCtx();
    vectorizedRowBatchCtx.init(structOI, new String[0]);
    mapWork.setVectorizedRowBatchCtx(vectorizedRowBatchCtx);
  }
  mapWork.setUseBucketizedHiveInputFormat(false);
  LinkedHashMap<Path, ArrayList<String>> aliasMap = new LinkedHashMap<>();
  ArrayList<String> aliases = new ArrayList<String>();
  aliases.add(tableName);
  LinkedHashMap<Path, PartitionDesc> partMap = new LinkedHashMap<>();
  for (int p = 0; p < partitions; ++p) {
    Path path = new Path(partPath[p]);
    aliasMap.put(path, aliases);
    LinkedHashMap<String, String> partSpec = new LinkedHashMap<String, String>();
    PartitionDesc part = new PartitionDesc(tbl, partSpec);
    if (isVectorized) {
      part.setVectorPartitionDesc(VectorPartitionDesc.createVectorizedInputFileFormat("MockInputFileFormatClassName", false));
    }
    partMap.put(path, part);
  }
  mapWork.setPathToAliases(aliasMap);
  mapWork.setPathToPartitionInfo(partMap);
  // write the plan out
  FileSystem localFs = FileSystem.getLocal(conf).getRaw();
  Path mapXml = new Path(workDir, "map.xml");
  localFs.delete(mapXml, true);
  FSDataOutputStream planStream = localFs.create(mapXml);
  SerializationUtilities.serializePlan(mapWork, planStream);
  conf.setBoolean(Utilities.HAS_MAP_WORK, true);
  planStream.close();
  return conf;
}
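A hedged sketch of how a test might consume the returned JobConf (the call below is illustrative, not part of the original test class; `workDir` and an ObjectInspector `inspector` are assumed to be prepared elsewhere):

JobConf conf = createMockExecutionEnvironment(workDir, new Path("mock:///warehouse"),
    "mocktable", inspector, false, 2);
// a real test would first write ORC data under the two mock partition directories,
// e.g. via OrcFile.createWriter, before asking the input format for splits
OrcInputFormat orcInputFormat = new OrcInputFormat();
InputSplit[] splits = orcInputFormat.getSplits(conf, 10);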