
Example 26 with MapredWork

use of org.apache.hadoop.hive.ql.plan.MapredWork in project hive by apache.

the class GenMapRedUtils method setUnionPlan.

private static void setUnionPlan(GenMRProcContext opProcCtx, boolean local, Task<? extends Serializable> currTask, GenMRUnionCtx uCtx, boolean mergeTask) throws SemanticException {
    TableScanOperator currTopOp = opProcCtx.getCurrTopOp();
    if (currTopOp != null) {
        String currAliasId = opProcCtx.getCurrAliasId();
        if (mergeTask || !opProcCtx.isSeenOp(currTask, currTopOp)) {
            setTaskPlan(currAliasId, currTopOp, currTask, local, opProcCtx);
        }
        currTopOp = null;
        opProcCtx.setCurrTopOp(currTopOp);
    } else {
        List<String> taskTmpDirLst = uCtx.getTaskTmpDir();
        if ((taskTmpDirLst != null) && !(taskTmpDirLst.isEmpty())) {
            List<TableDesc> tt_descLst = uCtx.getTTDesc();
            assert !taskTmpDirLst.isEmpty() && !tt_descLst.isEmpty();
            assert taskTmpDirLst.size() == tt_descLst.size();
            int size = taskTmpDirLst.size();
            assert !local;
            List<TableScanOperator> topOperators = uCtx.getListTopOperators();
            MapredWork plan = (MapredWork) currTask.getWork();
            for (int pos = 0; pos < size; pos++) {
                String taskTmpDir = taskTmpDirLst.get(pos);
                TableDesc tt_desc = tt_descLst.get(pos);
                MapWork mWork = plan.getMapWork();
                if (mWork.getPathToAliases().get(taskTmpDir) == null) {
                    taskTmpDir = taskTmpDir.intern();
                    Path taskTmpDirPath = StringInternUtils.internUriStringsInPath(new Path(taskTmpDir));
                    mWork.removePathToAlias(taskTmpDirPath);
                    mWork.addPathToAlias(taskTmpDirPath, taskTmpDir);
                    mWork.addPathToPartitionInfo(taskTmpDirPath, new PartitionDesc(tt_desc, null));
                    mWork.getAliasToWork().put(taskTmpDir, topOperators.get(pos));
                }
            }
        }
    }
}
Also used : Path(org.apache.hadoop.fs.Path) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) MapredWork(org.apache.hadoop.hive.ql.plan.MapredWork) MapWork(org.apache.hadoop.hive.ql.plan.MapWork) PartitionDesc(org.apache.hadoop.hive.ql.plan.PartitionDesc) LoadTableDesc(org.apache.hadoop.hive.ql.plan.LoadTableDesc) TableDesc(org.apache.hadoop.hive.ql.plan.TableDesc)
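
The pos-loop above does the real work: each union branch's temporary directory is registered as a map-side input of the current task, with the directory string doubling as the alias. A minimal sketch of that registration, not Hive source; the parameter names are hypothetical stand-ins for taskTmpDir, tt_desc, and topOperators.get(pos):

// Sketch only: wire one union branch's temp directory into a MapWork,
// mirroring the body of the pos-loop in setUnionPlan above.
static void registerUnionBranch(MapWork mWork, String tmpDir, TableDesc ttDesc, TableScanOperator topOp) {
    Path tmpPath = new Path(tmpDir);
    // Drop any stale mapping before re-adding the path.
    mWork.removePathToAlias(tmpPath);
    // The temp directory string serves as the alias itself.
    mWork.addPathToAlias(tmpPath, tmpDir);
    mWork.addPathToPartitionInfo(tmpPath, new PartitionDesc(ttDesc, null));
    // Point the alias at the root of this branch's operator tree.
    mWork.getAliasToWork().put(tmpDir, topOp);
}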

Example 27 with MapredWork

use of org.apache.hadoop.hive.ql.plan.MapredWork in project hive by apache.

the class GenMRTableScan1 method process.

/**
   * Table scan encountered.
   * @param nd
   *          the table scan operator encountered
   * @param opProcCtx
   *          context
   */
@Override
public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx opProcCtx, Object... nodeOutputs) throws SemanticException {
    TableScanOperator op = (TableScanOperator) nd;
    GenMRProcContext ctx = (GenMRProcContext) opProcCtx;
    ParseContext parseCtx = ctx.getParseCtx();
    Class<? extends InputFormat> inputFormat = op.getConf().getTableMetadata().getInputFormatClass();
    Map<Operator<? extends OperatorDesc>, GenMapRedCtx> mapCurrCtx = ctx.getMapCurrCtx();
    // create a dummy MapReduce task
    MapredWork currWork = GenMapRedUtils.getMapRedWork(parseCtx);
    MapRedTask currTask = (MapRedTask) TaskFactory.get(currWork, parseCtx.getConf());
    ctx.setCurrTask(currTask);
    ctx.setCurrTopOp(op);
    for (String alias : parseCtx.getTopOps().keySet()) {
        Operator<? extends OperatorDesc> currOp = parseCtx.getTopOps().get(alias);
        if (currOp == op) {
            String currAliasId = alias;
            ctx.setCurrAliasId(currAliasId);
            mapCurrCtx.put(op, new GenMapRedCtx(currTask, currAliasId));
            if (parseCtx.getQueryProperties().isAnalyzeCommand()) {
                boolean partialScan = parseCtx.getQueryProperties().isPartialScanAnalyzeCommand();
                boolean noScan = parseCtx.getQueryProperties().isNoScanAnalyzeCommand();
                if (OrcInputFormat.class.isAssignableFrom(inputFormat) || MapredParquetInputFormat.class.isAssignableFrom(inputFormat)) {
                    // For ORC and Parquet, all the following statements are the same
                    // ANALYZE TABLE T [PARTITION (...)] COMPUTE STATISTICS
                    // ANALYZE TABLE T [PARTITION (...)] COMPUTE STATISTICS partialscan;
                    // ANALYZE TABLE T [PARTITION (...)] COMPUTE STATISTICS noscan;
                    // There will not be any MR or Tez job above this task
                    StatsNoJobWork snjWork = new StatsNoJobWork(op.getConf().getTableMetadata().getTableSpec());
                    snjWork.setStatsReliable(parseCtx.getConf().getBoolVar(HiveConf.ConfVars.HIVE_STATS_RELIABLE));
                    // If partition is specified, get pruned partition list
                    Set<Partition> confirmedParts = GenMapRedUtils.getConfirmedPartitionsForScan(op);
                    if (confirmedParts.size() > 0) {
                        Table source = op.getConf().getTableMetadata();
                        List<String> partCols = GenMapRedUtils.getPartitionColumns(op);
                        PrunedPartitionList partList = new PrunedPartitionList(source, confirmedParts, partCols, false);
                        snjWork.setPrunedPartitionList(partList);
                    }
                    Task<StatsNoJobWork> snjTask = TaskFactory.get(snjWork, parseCtx.getConf());
                    ctx.setCurrTask(snjTask);
                    ctx.setCurrTopOp(null);
                    ctx.getRootTasks().clear();
                    ctx.getRootTasks().add(snjTask);
                } else {
                    // ANALYZE TABLE T [PARTITION (...)] COMPUTE STATISTICS;
                    // The plan consists of a simple MapRedTask followed by a StatsTask.
                    // The MR task is just a simple TableScanOperator
                    StatsWork statsWork = new StatsWork(op.getConf().getTableMetadata().getTableSpec());
                    statsWork.setAggKey(op.getConf().getStatsAggPrefix());
                    statsWork.setStatsTmpDir(op.getConf().getTmpStatsDir());
                    statsWork.setSourceTask(currTask);
                    statsWork.setStatsReliable(parseCtx.getConf().getBoolVar(HiveConf.ConfVars.HIVE_STATS_RELIABLE));
                    Task<StatsWork> statsTask = TaskFactory.get(statsWork, parseCtx.getConf());
                    currTask.addDependentTask(statsTask);
                    if (!ctx.getRootTasks().contains(currTask)) {
                        ctx.getRootTasks().add(currTask);
                    }
                    // The plan consists of a StatsTask only.
                    if (noScan) {
                        statsTask.setParentTasks(null);
                        statsWork.setNoScanAnalyzeCommand(true);
                        ctx.getRootTasks().remove(currTask);
                        ctx.getRootTasks().add(statsTask);
                    }
                    // ANALYZE TABLE T [PARTITION (...)] COMPUTE STATISTICS partialscan;
                    if (partialScan) {
                        handlePartialScanCommand(op, ctx, parseCtx, currTask, statsWork, statsTask);
                    }
                    currWork.getMapWork().setGatheringStats(true);
                    if (currWork.getReduceWork() != null) {
                        currWork.getReduceWork().setGatheringStats(true);
                    }
                    // NOTE: here we should use the new partition predicate pushdown API to get
                    // the pruned partition list, and pass it to setTaskPlan as the last parameter
                    Set<Partition> confirmedPartns = GenMapRedUtils.getConfirmedPartitionsForScan(op);
                    if (confirmedPartns.size() > 0) {
                        Table source = op.getConf().getTableMetadata();
                        List<String> partCols = GenMapRedUtils.getPartitionColumns(op);
                        PrunedPartitionList partList = new PrunedPartitionList(source, confirmedPartns, partCols, false);
                        GenMapRedUtils.setTaskPlan(currAliasId, op, currTask, false, ctx, partList);
                    } else {
                        // non-partitioned table
                        GenMapRedUtils.setTaskPlan(currAliasId, op, currTask, false, ctx);
                    }
                }
            }
            return true;
        }
    }
    assert false;
    return null;
}
Also used : TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) Operator(org.apache.hadoop.hive.ql.exec.Operator) Partition(org.apache.hadoop.hive.ql.metadata.Partition) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) Table(org.apache.hadoop.hive.ql.metadata.Table) MapRedTask(org.apache.hadoop.hive.ql.exec.mr.MapRedTask) MapredParquetInputFormat(org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat) PrunedPartitionList(org.apache.hadoop.hive.ql.parse.PrunedPartitionList) MapredWork(org.apache.hadoop.hive.ql.plan.MapredWork) StatsWork(org.apache.hadoop.hive.ql.plan.StatsWork) OrcInputFormat(org.apache.hadoop.hive.ql.io.orc.OrcInputFormat) ParseContext(org.apache.hadoop.hive.ql.parse.ParseContext) GenMapRedCtx(org.apache.hadoop.hive.ql.optimizer.GenMRProcContext.GenMapRedCtx) StatsNoJobWork(org.apache.hadoop.hive.ql.plan.StatsNoJobWork) OperatorDesc(org.apache.hadoop.hive.ql.plan.OperatorDesc)
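
The branch structure is the interesting part: for ORC and Parquet, ANALYZE ... COMPUTE STATISTICS is answered from file metadata alone via a StatsNoJobWork, so no MR job is planned at all; every other format gets a scan task followed by a dependent stats task. A minimal sketch of that second wiring, not Hive source, reusing the names op and parseCtx from the example above:

// Sketch only: the scan-then-aggregate wiring from the non-ORC branch.
MapredWork work = GenMapRedUtils.getMapRedWork(parseCtx);
MapRedTask scanTask = (MapRedTask) TaskFactory.get(work, parseCtx.getConf());

StatsWork statsWork = new StatsWork(op.getConf().getTableMetadata().getTableSpec());
// The stats task aggregates what the scan task gathered, so it must run second.
statsWork.setSourceTask(scanTask);
Task<StatsWork> statsTask = TaskFactory.get(statsWork, parseCtx.getConf());
scanTask.addDependentTask(statsTask);

// For the NOSCAN variant the scan is skipped entirely and statsTask becomes a root:
// statsTask.setParentTasks(null); statsWork.setNoScanAnalyzeCommand(true);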

Example 28 with MapredWork

use of org.apache.hadoop.hive.ql.plan.MapredWork in project hive by apache.

the class TestSymlinkTextInputFormat method setUp.

@Override
protected void setUp() throws IOException {
    conf = new Configuration();
    job = new JobConf(conf);
    TableDesc tblDesc = Utilities.defaultTd;
    PartitionDesc partDesc = new PartitionDesc(tblDesc, null);
    LinkedHashMap<Path, PartitionDesc> pt = new LinkedHashMap<>();
    pt.put(new Path("/tmp/testfolder"), partDesc);
    MapredWork mrwork = new MapredWork();
    mrwork.getMapWork().setPathToPartitionInfo(pt);
    Utilities.setMapRedWork(job, mrwork, new Path("/tmp/" + System.getProperty("user.name"), "hive"));
    fileSystem = FileSystem.getLocal(conf);
    testDir = new Path(System.getProperty("test.tmp.dir", System.getProperty("user.dir", new File(".").getAbsolutePath())) + "/TestSymlinkTextInputFormat");
    reporter = Reporter.NULL;
    fileSystem.delete(testDir, true);
    dataDir1 = new Path(testDir, "datadir1");
    dataDir2 = new Path(testDir, "datadir2");
    symlinkDir = new Path(testDir, "symlinkdir");
}
Also used : Path(org.apache.hadoop.fs.Path) Configuration(org.apache.hadoop.conf.Configuration) MapredWork(org.apache.hadoop.hive.ql.plan.MapredWork) PartitionDesc(org.apache.hadoop.hive.ql.plan.PartitionDesc) TableDesc(org.apache.hadoop.hive.ql.plan.TableDesc) JobConf(org.apache.hadoop.mapred.JobConf) File(java.io.File) LinkedHashMap(java.util.LinkedHashMap)
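
Examples 28 through 30 all repeat the same setup dance: build a one-entry pathToPartitionInfo, drop it into a fresh MapredWork, and serialize the plan where the job can read it back. A hedged consolidation of that pattern, not taken from the Hive tests themselves:

import java.util.LinkedHashMap;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.exec.Utilities;
import org.apache.hadoop.hive.ql.plan.MapredWork;
import org.apache.hadoop.hive.ql.plan.PartitionDesc;
import org.apache.hadoop.hive.ql.plan.TableDesc;

// Sketch only: register `dir` as the sole input partition of a new MapredWork
// and publish the plan so Utilities.getMapWork(conf) can find it later.
static MapredWork setUpMapredWork(Configuration conf, String dir) {
    TableDesc tblDesc = Utilities.defaultTd;
    LinkedHashMap<Path, PartitionDesc> pt = new LinkedHashMap<>();
    pt.put(new Path(dir), new PartitionDesc(tblDesc, null));
    MapredWork mrwork = new MapredWork();
    mrwork.getMapWork().setPathToPartitionInfo(pt);
    // Serialize the plan under the conventional scratch location.
    Utilities.setMapRedWork(conf, mrwork, new Path("/tmp/" + System.getProperty("user.name"), "hive"));
    return mrwork;
}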

Example 29 with MapredWork

use of org.apache.hadoop.hive.ql.plan.MapredWork in project hive by apache.

the class TestCombineHiveInputFormat method testAvoidSplitCombination.

public void testAvoidSplitCombination() throws Exception {
    Configuration conf = new Configuration();
    JobConf job = new JobConf(conf);
    TableDesc tblDesc = Utilities.defaultTd;
    tblDesc.setInputFileFormatClass(TestSkipCombineInputFormat.class);
    PartitionDesc partDesc = new PartitionDesc(tblDesc, null);
    LinkedHashMap<Path, PartitionDesc> pt = new LinkedHashMap<>();
    pt.put(new Path("/tmp/testfolder1"), partDesc);
    pt.put(new Path("/tmp/testfolder2"), partDesc);
    MapredWork mrwork = new MapredWork();
    mrwork.getMapWork().setPathToPartitionInfo(pt);
    Path mapWorkPath = new Path("/tmp/" + System.getProperty("user.name"), "hive");
    Utilities.setMapRedWork(conf, mrwork, mapWorkPath);
    try {
        Path[] paths = new Path[2];
        paths[0] = new Path("/tmp/testfolder1");
        paths[1] = new Path("/tmp/testfolder2");
        CombineHiveInputFormat combineInputFormat = ReflectionUtils.newInstance(CombineHiveInputFormat.class, conf);
        combineInputFormat.pathToPartitionInfo = Utilities.getMapWork(conf).getPathToPartitionInfo();
        Set results = combineInputFormat.getNonCombinablePathIndices(job, paths, 2);
        assertEquals("Should have both path indices in the results set", 2, results.size());
    } finally {
        // Cleanup the mapwork path
        FileSystem.get(conf).delete(mapWorkPath, true);
    }
}
Also used : Path(org.apache.hadoop.fs.Path) Set(java.util.Set) Configuration(org.apache.hadoop.conf.Configuration) MapredWork(org.apache.hadoop.hive.ql.plan.MapredWork) PartitionDesc(org.apache.hadoop.hive.ql.plan.PartitionDesc) TableDesc(org.apache.hadoop.hive.ql.plan.TableDesc) JobConf(org.apache.hadoop.mapred.JobConf) LinkedHashMap(java.util.LinkedHashMap)
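
The test hinges on the table's input format opting out of combination: getNonCombinablePathIndices returns the indices of paths whose input format refuses to be combined. A minimal sketch of such a format, assuming TestSkipCombineInputFormat implements CombineHiveInputFormat's AvoidSplitCombination contract (the same hook OrcInputFormat uses); this is illustrative, not the actual test class:

import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.io.CombineHiveInputFormat;
import org.apache.hadoop.mapred.TextInputFormat;

// Sketch only: an input format that asks CombineHiveInputFormat to never
// combine splits under any of its paths.
public class SkipCombineTextInputFormat extends TextInputFormat
        implements CombineHiveInputFormat.AvoidSplitCombination {
    @Override
    public boolean shouldSkipCombine(Path path, Configuration conf) throws IOException {
        return true; // every path is reported as non-combinable
    }
}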

Example 30 with MapredWork

use of org.apache.hadoop.hive.ql.plan.MapredWork in project hive by apache.

the class TestHiveBinarySearchRecordReader method init.

private void init() throws IOException {
    conf = new JobConf();
    resetIOContext();
    rcfReader = mock(RCFileRecordReader.class);
    when(rcfReader.next((LongWritable) anyObject(), (BytesRefArrayWritable) anyObject())).thenReturn(true);
    // The split starts at 0 with length 100, so the reader's first binary-search
    // sync should probe the midpoint, 50; return that from getPos().
    when(rcfReader.getPos()).thenReturn(50L);
    conf.setBoolean("hive.input.format.sorted", true);
    TableDesc tblDesc = Utilities.defaultTd;
    PartitionDesc partDesc = new PartitionDesc(tblDesc, null);
    LinkedHashMap<Path, PartitionDesc> pt = new LinkedHashMap<>();
    pt.put(new Path("/tmp/testfolder"), partDesc);
    MapredWork mrwork = new MapredWork();
    mrwork.getMapWork().setPathToPartitionInfo(pt);
    Utilities.setMapRedWork(conf, mrwork, new Path("/tmp/" + System.getProperty("user.name"), "hive"));
    hiveSplit = new TestHiveInputSplit();
    hbsReader = new TestHiveRecordReader(rcfReader, conf);
    hbsReader.initIOContext(hiveSplit, conf, Class.class, rcfReader);
}
Also used : Path(org.apache.hadoop.fs.Path) MapredWork(org.apache.hadoop.hive.ql.plan.MapredWork) PartitionDesc(org.apache.hadoop.hive.ql.plan.PartitionDesc) TableDesc(org.apache.hadoop.hive.ql.plan.TableDesc) JobConf(org.apache.hadoop.mapred.JobConf) LinkedHashMap(java.util.LinkedHashMap)
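
The stubbing pattern here is worth isolating: the mocked RCFileRecordReader always reports a record available and a fixed position, so the binary-search logic in the wrapper can be driven without real files. A generic, self-contained sketch of the same Mockito pattern; the key/value types are illustrative stand-ins, not the classes the test uses:

import static org.mockito.Mockito.*;

import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.mapred.RecordReader;

// Sketch only: a mocked RecordReader for a 0..100 split. It yields one record,
// then signals end-of-input, and reports the split midpoint from getPos().
@SuppressWarnings("unchecked")
static RecordReader<LongWritable, LongWritable> stubMidpointReader() throws Exception {
    RecordReader<LongWritable, LongWritable> reader = mock(RecordReader.class);
    when(reader.next(any(LongWritable.class), any(LongWritable.class))).thenReturn(true, false);
    when(reader.getPos()).thenReturn(50L); // first sync over [0, 100) probes the midpoint
    return reader;
}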

Aggregations

MapredWork (org.apache.hadoop.hive.ql.plan.MapredWork): 43
Path (org.apache.hadoop.fs.Path): 17
OperatorDesc (org.apache.hadoop.hive.ql.plan.OperatorDesc): 13
Serializable (java.io.Serializable): 12
Operator (org.apache.hadoop.hive.ql.exec.Operator): 12
TableScanOperator (org.apache.hadoop.hive.ql.exec.TableScanOperator): 12
Task (org.apache.hadoop.hive.ql.exec.Task): 12
MapRedTask (org.apache.hadoop.hive.ql.exec.mr.MapRedTask): 12
MapWork (org.apache.hadoop.hive.ql.plan.MapWork): 12
ArrayList (java.util.ArrayList): 10
PartitionDesc (org.apache.hadoop.hive.ql.plan.PartitionDesc): 10
TableDesc (org.apache.hadoop.hive.ql.plan.TableDesc): 10
LinkedHashMap (java.util.LinkedHashMap): 9
ConditionalTask (org.apache.hadoop.hive.ql.exec.ConditionalTask): 9
FileSinkOperator (org.apache.hadoop.hive.ql.exec.FileSinkOperator): 9
ReduceSinkOperator (org.apache.hadoop.hive.ql.exec.ReduceSinkOperator): 8
ParseContext (org.apache.hadoop.hive.ql.parse.ParseContext): 8
SMBMapJoinOperator (org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator): 7
UnionOperator (org.apache.hadoop.hive.ql.exec.UnionOperator): 7
ReduceWork (org.apache.hadoop.hive.ql.plan.ReduceWork): 7