Example 31 with MapredWork

use of org.apache.hadoop.hive.ql.plan.MapredWork in project hive by apache.

the class TestCombineHiveInputFormat method testAvoidSplitCombination.

public void testAvoidSplitCombination() throws Exception {
    Configuration conf = new Configuration();
    JobConf job = new JobConf(conf);
    TableDesc tblDesc = Utilities.defaultTd;
    tblDesc.setInputFileFormatClass(TestSkipCombineInputFormat.class);
    PartitionDesc partDesc = new PartitionDesc(tblDesc, null);
    LinkedHashMap<Path, PartitionDesc> pt = new LinkedHashMap<>();
    pt.put(new Path("/tmp/testfolder1"), partDesc);
    pt.put(new Path("/tmp/testfolder2"), partDesc);
    MapredWork mrwork = new MapredWork();
    mrwork.getMapWork().setPathToPartitionInfo(pt);
    Path mapWorkPath = new Path("/tmp/" + System.getProperty("user.name"), "hive");
    Utilities.setMapRedWork(conf, mrwork, mapWorkPath);
    try {
        Path[] paths = new Path[2];
        paths[0] = new Path("/tmp/testfolder1");
        paths[1] = new Path("/tmp/testfolder2");
        CombineHiveInputFormat combineInputFormat = ReflectionUtils.newInstance(CombineHiveInputFormat.class, conf);
        combineInputFormat.pathToPartitionInfo = Utilities.getMapWork(conf).getPathToPartitionInfo();
        Set results = combineInputFormat.getNonCombinablePathIndices(job, paths, 2);
        assertEquals("Should have both path indices in the results set", 2, results.size());
    } finally {
        // Clean up the serialized map work path
        FileSystem.get(conf).delete(mapWorkPath, true);
    }
}
Also used: Path (org.apache.hadoop.fs.Path), Set (java.util.Set), Configuration (org.apache.hadoop.conf.Configuration), MapredWork (org.apache.hadoop.hive.ql.plan.MapredWork), PartitionDesc (org.apache.hadoop.hive.ql.plan.PartitionDesc), TableDesc (org.apache.hadoop.hive.ql.plan.TableDesc), JobConf (org.apache.hadoop.mapred.JobConf), LinkedHashMap (java.util.LinkedHashMap)
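
The assertion only holds because the registered input format declines split combination. Below is a minimal sketch of such an opt-out format, assuming Hive's CombineHiveInputFormat.AvoidSplitCombination callback; the class name is illustrative and is not the TestSkipCombineInputFormat used in the test.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.io.CombineHiveInputFormat;
import org.apache.hadoop.mapred.TextInputFormat;

// Hypothetical input format whose paths CombineHiveInputFormat must not combine.
public class SkipCombineTextInputFormat extends TextInputFormat
        implements CombineHiveInputFormat.AvoidSplitCombination {

    @Override
    public boolean shouldSkipCombine(Path path, Configuration conf) throws IOException {
        // Returning true marks every path served by this format as non-combinable,
        // which is what getNonCombinablePathIndices() collects in the test above.
        return true;
    }
}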

Example 32 with MapredWork

use of org.apache.hadoop.hive.ql.plan.MapredWork in project hive by apache.

the class TestHiveBinarySearchRecordReader method init.

private void init() throws IOException {
    conf = new JobConf();
    resetIOContext();
    rcfReader = mock(RCFileRecordReader.class);
    when(rcfReader.next((LongWritable) anyObject(), (BytesRefArrayWritable) anyObject())).thenReturn(true);
    // Since the start is 0 and the length is 100, the first call to sync should be with the
    // value 50, so return that for getPos().
    when(rcfReader.getPos()).thenReturn(50L);
    conf.setBoolean("hive.input.format.sorted", true);
    TableDesc tblDesc = Utilities.defaultTd;
    PartitionDesc partDesc = new PartitionDesc(tblDesc, null);
    LinkedHashMap<Path, PartitionDesc> pt = new LinkedHashMap<>();
    pt.put(new Path("/tmp/testfolder"), partDesc);
    MapredWork mrwork = new MapredWork();
    mrwork.getMapWork().setPathToPartitionInfo(pt);
    Utilities.setMapRedWork(conf, mrwork, new Path("/tmp/" + System.getProperty("user.name"), "hive"));
    hiveSplit = new TestHiveInputSplit();
    hbsReader = new TestHiveRecordReader(rcfReader, conf);
    hbsReader.initIOContext(hiveSplit, conf, Class.class, rcfReader);
}
Also used: Path (org.apache.hadoop.fs.Path), MapredWork (org.apache.hadoop.hive.ql.plan.MapredWork), PartitionDesc (org.apache.hadoop.hive.ql.plan.PartitionDesc), TableDesc (org.apache.hadoop.hive.ql.plan.TableDesc), JobConf (org.apache.hadoop.mapred.JobConf), LinkedHashMap (java.util.LinkedHashMap)
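
Both test setups share the same handshake: build a MapredWork, point its MapWork at the input paths, and serialize it with Utilities.setMapRedWork so readers can recover it later through Utilities.getMapWork. Below is a minimal, self-contained sketch of that round trip, using only calls that already appear in the examples above; the demo input path is illustrative.

import java.util.LinkedHashMap;

import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.exec.Utilities;
import org.apache.hadoop.hive.ql.plan.MapWork;
import org.apache.hadoop.hive.ql.plan.MapredWork;
import org.apache.hadoop.hive.ql.plan.PartitionDesc;
import org.apache.hadoop.hive.ql.plan.TableDesc;
import org.apache.hadoop.mapred.JobConf;

public class MapredWorkRoundTrip {
    public static void main(String[] args) throws Exception {
        JobConf job = new JobConf();

        // Describe one input path with the default table descriptor.
        TableDesc tblDesc = Utilities.defaultTd;
        PartitionDesc partDesc = new PartitionDesc(tblDesc, null);
        LinkedHashMap<Path, PartitionDesc> pathToPartition = new LinkedHashMap<>();
        pathToPartition.put(new Path("/tmp/mapredwork-demo/input"), partDesc);

        // Wrap the map-side plan in a MapredWork and serialize it under a scratch path.
        MapredWork mrWork = new MapredWork();
        mrWork.getMapWork().setPathToPartitionInfo(pathToPartition);
        Path scratchDir = new Path("/tmp/" + System.getProperty("user.name"), "hive");
        Utilities.setMapRedWork(job, mrWork, scratchDir);

        // Record readers and input formats later recover the plan from the configuration.
        MapWork mapWork = Utilities.getMapWork(job);
        System.out.println("Registered input paths: " + mapWork.getPathToPartitionInfo().keySet());
    }
}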

Example 33 with MapredWork

use of org.apache.hadoop.hive.ql.plan.MapredWork in project hive by apache.

the class ColumnTruncateTask method execute.

/**
 * Start a new map-reduce job to do the truncation; almost the same as ExecDriver.
 */
@Override
public int execute(DriverContext driverContext) {
    HiveConf.setVar(job, HiveConf.ConfVars.HIVEINPUTFORMAT, BucketizedHiveInputFormat.class.getName());
    success = true;
    HiveFileFormatUtils.prepareJobOutput(job);
    job.setOutputFormat(HiveOutputFormatImpl.class);
    job.setMapperClass(work.getMapperClass());
    Context ctx = driverContext.getCtx();
    boolean ctxCreated = false;
    try {
        if (ctx == null) {
            ctx = new Context(job);
            ctxCreated = true;
        }
    } catch (IOException e) {
        e.printStackTrace();
        console.printError("Error launching map-reduce job", "\n" + org.apache.hadoop.util.StringUtils.stringifyException(e));
        return 5;
    }
    job.setMapOutputKeyClass(NullWritable.class);
    job.setMapOutputValueClass(NullWritable.class);
    if (work.getNumMapTasks() != null) {
        job.setNumMapTasks(work.getNumMapTasks());
    }
    // zero reducers
    job.setNumReduceTasks(0);
    if (work.getMinSplitSize() != null) {
        HiveConf.setLongVar(job, HiveConf.ConfVars.MAPREDMINSPLITSIZE, work.getMinSplitSize().longValue());
    }
    if (work.getInputformat() != null) {
        HiveConf.setVar(job, HiveConf.ConfVars.HIVEINPUTFORMAT, work.getInputformat());
    }
    String inpFormat = HiveConf.getVar(job, HiveConf.ConfVars.HIVEINPUTFORMAT);
    LOG.info("Using " + inpFormat);
    try {
        job.setInputFormat(JavaUtils.loadClass(inpFormat));
    } catch (ClassNotFoundException e) {
        throw new RuntimeException(e.getMessage(), e);
    }
    Path outputPath = this.work.getOutputDir();
    Path tempOutPath = Utilities.toTempPath(outputPath);
    try {
        FileSystem fs = tempOutPath.getFileSystem(job);
        if (!fs.exists(tempOutPath)) {
            fs.mkdirs(tempOutPath);
        }
    } catch (IOException e) {
        console.printError("Can't make path " + outputPath + " : " + e.getMessage());
        return 6;
    }
    job.setOutputKeyClass(NullWritable.class);
    job.setOutputValueClass(NullWritable.class);
    int returnVal = 0;
    RunningJob rj = null;
    boolean noName = StringUtils.isEmpty(job.get(MRJobConfig.JOB_NAME));
    String jobName = null;
    if (noName && this.getQueryPlan() != null) {
        int maxlen = conf.getIntVar(HiveConf.ConfVars.HIVEJOBNAMELENGTH);
        jobName = Utilities.abbreviate(this.getQueryPlan().getQueryStr(), maxlen - 6);
    }
    if (noName) {
        // This is for a special case to ensure unit tests pass
        job.set(MRJobConfig.JOB_NAME, jobName != null ? jobName : "JOB" + Utilities.randGen.nextInt());
    }
    try {
        addInputPaths(job, work);
        MapredWork mrWork = new MapredWork();
        mrWork.setMapWork(work);
        Utilities.setMapRedWork(job, mrWork, ctx.getMRTmpPath());
        // Remove the metastore password from the conf file so that the job tracker
        // does not show it in its logs.
        String pwd = HiveConf.getVar(job, HiveConf.ConfVars.METASTOREPWD);
        if (pwd != null) {
            HiveConf.setVar(job, HiveConf.ConfVars.METASTOREPWD, "HIVE");
        }
        JobClient jc = new JobClient(job);
        String addedJars = Utilities.getResourceFiles(job, SessionState.ResourceType.JAR);
        if (!addedJars.isEmpty()) {
            job.set("tmpjars", addedJars);
        }
        // Make this client wait if the job tracker is not behaving well.
        Throttle.checkJobTracker(job, LOG);
        // Finally SUBMIT the JOB!
        rj = jc.submitJob(job);
        this.jobID = rj.getJobID();
        returnVal = jobExecHelper.progress(rj, jc, ctx);
        success = (returnVal == 0);
    } catch (Exception e) {
        e.printStackTrace();
        setException(e);
        String mesg = " with exception '" + Utilities.getNameMessage(e) + "'";
        if (rj != null) {
            mesg = "Ended Job = " + rj.getJobID() + mesg;
        } else {
            mesg = "Job Submission failed" + mesg;
        }
        // Has to use full name to make sure it does not conflict with
        // org.apache.commons.lang.StringUtils
        console.printError(mesg, "\n" + org.apache.hadoop.util.StringUtils.stringifyException(e));
        success = false;
        returnVal = 1;
    } finally {
        try {
            if (ctxCreated) {
                ctx.clear();
            }
            if (rj != null) {
                if (returnVal != 0) {
                    rj.killJob();
                }
            }
            ColumnTruncateMapper.jobClose(outputPath, success, job, console, work.getDynPartCtx(), null);
        } catch (Exception e) {
            LOG.warn("Failed while cleaning up ", e);
        } finally {
            HadoopJobExecHelper.runningJobs.remove(rj);
        }
    }
    return (returnVal);
}
Also used: Context (org.apache.hadoop.hive.ql.Context), DriverContext (org.apache.hadoop.hive.ql.DriverContext), CompilationOpContext (org.apache.hadoop.hive.ql.CompilationOpContext), Path (org.apache.hadoop.fs.Path), BucketizedHiveInputFormat (org.apache.hadoop.hive.ql.io.BucketizedHiveInputFormat), IOException (java.io.IOException), JobClient (org.apache.hadoop.mapred.JobClient), MapredWork (org.apache.hadoop.hive.ql.plan.MapredWork), FileSystem (org.apache.hadoop.fs.FileSystem), RunningJob (org.apache.hadoop.mapred.RunningJob)
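
The input-format selection in the middle of execute() follows a simple precedence: default to BucketizedHiveInputFormat, then let an explicit format on the work override it. Below is a small, self-contained sketch of just that precedence, using the same HiveConf calls as above; the command-line argument stands in for work.getInputformat().

import org.apache.hadoop.hive.conf.HiveConf;
import org.apache.hadoop.hive.ql.io.BucketizedHiveInputFormat;
import org.apache.hadoop.mapred.JobConf;

public class InputFormatPrecedenceSketch {
    public static void main(String[] args) {
        JobConf job = new JobConf();

        // Default, as set at the top of ColumnTruncateTask.execute().
        HiveConf.setVar(job, HiveConf.ConfVars.HIVEINPUTFORMAT,
                BucketizedHiveInputFormat.class.getName());

        // Stand-in for work.getInputformat(); absent means "keep the default".
        String workLevelFormat = args.length > 0 ? args[0] : null;
        if (workLevelFormat != null) {
            HiveConf.setVar(job, HiveConf.ConfVars.HIVEINPUTFORMAT, workLevelFormat);
        }

        System.out.println("Using " + HiveConf.getVar(job, HiveConf.ConfVars.HIVEINPUTFORMAT));
    }
}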

Example 34 with MapredWork

use of org.apache.hadoop.hive.ql.plan.MapredWork in project hive by apache.

the class GenMRTableScan1 method process.

/**
 * Table scan encountered.
 * @param nd
 *          the table scan operator encountered
 * @param opProcCtx
 *          context
 */
@Override
public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx opProcCtx, Object... nodeOutputs) throws SemanticException {
    TableScanOperator op = (TableScanOperator) nd;
    GenMRProcContext ctx = (GenMRProcContext) opProcCtx;
    ParseContext parseCtx = ctx.getParseCtx();
    Table table = op.getConf().getTableMetadata();
    Class<? extends InputFormat> inputFormat = table.getInputFormatClass();
    Map<Operator<? extends OperatorDesc>, GenMapRedCtx> mapCurrCtx = ctx.getMapCurrCtx();
    // create a dummy MapReduce task
    MapredWork currWork = GenMapRedUtils.getMapRedWork(parseCtx);
    MapRedTask currTask = (MapRedTask) TaskFactory.get(currWork);
    ctx.setCurrTask(currTask);
    ctx.setCurrTopOp(op);
    for (String alias : parseCtx.getTopOps().keySet()) {
        Operator<? extends OperatorDesc> currOp = parseCtx.getTopOps().get(alias);
        if (currOp == op) {
            String currAliasId = alias;
            ctx.setCurrAliasId(currAliasId);
            mapCurrCtx.put(op, new GenMapRedCtx(currTask, currAliasId));
            if (parseCtx.getQueryProperties().isAnalyzeCommand()) {
                boolean noScan = parseCtx.getQueryProperties().isNoScanAnalyzeCommand();
                if (OrcInputFormat.class.isAssignableFrom(inputFormat) || MapredParquetInputFormat.class.isAssignableFrom(inputFormat)) {
                    // For ORC and Parquet, all the following statements are the same
                    // ANALYZE TABLE T [PARTITION (...)] COMPUTE STATISTICS
                    // ANALYZE TABLE T [PARTITION (...)] COMPUTE STATISTICS noscan;
                    // There will not be any MR or Tez job above this task
                    StatsWork statWork = new StatsWork(table, parseCtx.getConf());
                    statWork.setFooterScan();
                    // If partition is specified, get pruned partition list
                    Set<Partition> confirmedParts = GenMapRedUtils.getConfirmedPartitionsForScan(op);
                    if (confirmedParts.size() > 0) {
                        List<String> partCols = GenMapRedUtils.getPartitionColumns(op);
                        PrunedPartitionList partList = new PrunedPartitionList(table, confirmedParts, partCols, false);
                        statWork.addInputPartitions(partList.getPartitions());
                    }
                    Task<StatsWork> snjTask = TaskFactory.get(statWork);
                    ctx.setCurrTask(snjTask);
                    ctx.setCurrTopOp(null);
                    ctx.getRootTasks().clear();
                    ctx.getRootTasks().add(snjTask);
                } else {
                    // ANALYZE TABLE T [PARTITION (...)] COMPUTE STATISTICS;
                    // The plan consists of a simple MapRedTask followed by a StatsTask.
                    // The MR task is just a simple TableScanOperator
                    BasicStatsWork statsWork = new BasicStatsWork(table.getTableSpec());
                    statsWork.setNoScanAnalyzeCommand(noScan);
                    StatsWork columnStatsWork = new StatsWork(table, statsWork, parseCtx.getConf());
                    columnStatsWork.collectStatsFromAggregator(op.getConf());
                    columnStatsWork.setSourceTask(currTask);
                    Task<StatsWork> columnStatsTask = TaskFactory.get(columnStatsWork);
                    currTask.addDependentTask(columnStatsTask);
                    if (!ctx.getRootTasks().contains(currTask)) {
                        ctx.getRootTasks().add(currTask);
                    }
                    // The plan consists of a StatsTask only.
                    if (noScan) {
                        columnStatsTask.setParentTasks(null);
                        ctx.getRootTasks().remove(currTask);
                        ctx.getRootTasks().add(columnStatsTask);
                    }
                    currWork.getMapWork().setGatheringStats(true);
                    if (currWork.getReduceWork() != null) {
                        currWork.getReduceWork().setGatheringStats(true);
                    }
                    // NOTE: here we should use the new partition predicate pushdown API to get
                    // the pruned partition list and pass it to setTaskPlan as the last parameter.
                    Set<Partition> confirmedPartns = GenMapRedUtils.getConfirmedPartitionsForScan(op);
                    if (confirmedPartns.size() > 0) {
                        List<String> partCols = GenMapRedUtils.getPartitionColumns(op);
                        PrunedPartitionList partList = new PrunedPartitionList(table, confirmedPartns, partCols, false);
                        GenMapRedUtils.setTaskPlan(currAliasId, op, currTask, false, ctx, partList);
                    } else {
                        // non-partitioned table
                        GenMapRedUtils.setTaskPlan(currAliasId, op, currTask, false, ctx);
                    }
                }
            }
            return true;
        }
    }
    assert false;
    return null;
}
Also used: TableScanOperator (org.apache.hadoop.hive.ql.exec.TableScanOperator), Operator (org.apache.hadoop.hive.ql.exec.Operator), Partition (org.apache.hadoop.hive.ql.metadata.Partition), Table (org.apache.hadoop.hive.ql.metadata.Table), MapRedTask (org.apache.hadoop.hive.ql.exec.mr.MapRedTask), MapredParquetInputFormat (org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat), PrunedPartitionList (org.apache.hadoop.hive.ql.parse.PrunedPartitionList), MapredWork (org.apache.hadoop.hive.ql.plan.MapredWork), StatsWork (org.apache.hadoop.hive.ql.plan.StatsWork), BasicStatsWork (org.apache.hadoop.hive.ql.plan.BasicStatsWork), OrcInputFormat (org.apache.hadoop.hive.ql.io.orc.OrcInputFormat), ParseContext (org.apache.hadoop.hive.ql.parse.ParseContext), GenMapRedCtx (org.apache.hadoop.hive.ql.optimizer.GenMRProcContext.GenMapRedCtx), OperatorDesc (org.apache.hadoop.hive.ql.plan.OperatorDesc)
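
The branch on the input format is the heart of this rule: ORC and Parquet carry statistics in their file footers, so an ANALYZE over them becomes a footer-scan StatsWork with no MR job, while other formats fall back to a MapRedTask plus a stats task. Below is a self-contained sketch of that check, mirroring the condition above; the helper and class names are illustrative.

import org.apache.hadoop.hive.ql.io.orc.OrcInputFormat;
import org.apache.hadoop.hive.ql.io.parquet.MapredParquetInputFormat;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.TextInputFormat;

public class FooterStatsCheck {

    // Mirrors the condition in GenMRTableScan1.process(): ORC and Parquet can answer
    // ANALYZE ... COMPUTE STATISTICS from file footers, without an MR or Tez job.
    static boolean supportsFooterStats(Class<? extends InputFormat> inputFormat) {
        return OrcInputFormat.class.isAssignableFrom(inputFormat)
                || MapredParquetInputFormat.class.isAssignableFrom(inputFormat);
    }

    public static void main(String[] args) {
        System.out.println(supportsFooterStats(OrcInputFormat.class));           // true
        System.out.println(supportsFooterStats(MapredParquetInputFormat.class)); // true
        System.out.println(supportsFooterStats(TextInputFormat.class));          // false
    }
}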

Example 35 with MapredWork

use of org.apache.hadoop.hive.ql.plan.MapredWork in project hive by apache.

the class GenMRUnion1 method process.

/**
 * Union operator encountered. Currently, the algorithm is pretty simple: if
 * all the sub-queries are map-only, don't do anything. Otherwise, insert a
 * FileSink on top of all the sub-queries.
 *
 * This can be optimized later on.
 *
 * @param nd
 *          the union operator encountered
 * @param opProcCtx
 *          context
 */
@Override
public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx opProcCtx, Object... nodeOutputs) throws SemanticException {
    UnionOperator union = (UnionOperator) nd;
    GenMRProcContext ctx = (GenMRProcContext) opProcCtx;
    ParseContext parseCtx = ctx.getParseCtx();
    UnionProcContext uCtx = parseCtx.getUCtx();
    // Map-only subqueries could be optimized in the future to avoid writing to a file.
    Map<Operator<? extends OperatorDesc>, GenMapRedCtx> mapCurrCtx = ctx.getMapCurrCtx();
    if (union.getConf().isAllInputsInSameReducer()) {
        // All inputs of this UnionOperator are in the same Reducer.
        // We do not need to break the operator tree.
        mapCurrCtx.put((Operator<? extends OperatorDesc>) nd, new GenMapRedCtx(ctx.getCurrTask(), ctx.getCurrAliasId()));
        return null;
    }
    UnionParseContext uPrsCtx = uCtx.getUnionParseContext(union);
    ctx.setCurrUnionOp(union);
    // The plan needs to be broken only if one of the sub-queries involves a map-reduce job.
    if (uPrsCtx.allMapOnlySubQ()) {
        return processMapOnlyUnion(union, stack, ctx, uCtx);
    }
    assert uPrsCtx != null;
    Task<? extends Serializable> currTask = ctx.getCurrTask();
    int pos = UnionProcFactory.getPositionParent(union, stack);
    Task<? extends Serializable> uTask = null;
    MapredWork uPlan = null;
    // union is encountered for the first time
    GenMRUnionCtx uCtxTask = ctx.getUnionTask(union);
    if (uCtxTask == null) {
        uPlan = GenMapRedUtils.getMapRedWork(parseCtx);
        uTask = TaskFactory.get(uPlan);
        uCtxTask = new GenMRUnionCtx(uTask);
        ctx.setUnionTask(union, uCtxTask);
    } else {
        uTask = uCtxTask.getUTask();
    }
    // Copy into the current union task plan if this sub-query is map-only and a root task.
    if (uPrsCtx.getMapOnlySubq(pos) && uPrsCtx.getRootTask(pos)) {
        processSubQueryUnionMerge(ctx, uCtxTask, union, stack);
        if (ctx.getRootTasks().contains(currTask)) {
            ctx.getRootTasks().remove(currTask);
        }
    } else // If it is a map-reduce job, create a temporary file
    {
        // is the current task a root task
        if (shouldBeRootTask(currTask) && !ctx.getRootTasks().contains(currTask) && (currTask.getParentTasks() == null || currTask.getParentTasks().isEmpty())) {
            ctx.getRootTasks().add(currTask);
        }
        processSubQueryUnionCreateIntermediate(union.getParentOperators().get(pos), union, uTask, ctx, uCtxTask);
        // the currAliasId and currTopOp are not valid any more
        ctx.setCurrAliasId(null);
        ctx.setCurrTopOp(null);
        ctx.getOpTaskMap().put(null, uTask);
    }
    ctx.setCurrTask(uTask);
    mapCurrCtx.put((Operator<? extends OperatorDesc>) nd, new GenMapRedCtx(ctx.getCurrTask(), null));
    return true;
}
Also used: TableScanOperator (org.apache.hadoop.hive.ql.exec.TableScanOperator), UnionOperator (org.apache.hadoop.hive.ql.exec.UnionOperator), Operator (org.apache.hadoop.hive.ql.exec.Operator), UnionProcContext (org.apache.hadoop.hive.ql.optimizer.unionproc.UnionProcContext), UnionParseContext (org.apache.hadoop.hive.ql.optimizer.unionproc.UnionProcContext.UnionParseContext), GenMRUnionCtx (org.apache.hadoop.hive.ql.optimizer.GenMRProcContext.GenMRUnionCtx), MapredWork (org.apache.hadoop.hive.ql.plan.MapredWork), ParseContext (org.apache.hadoop.hive.ql.parse.ParseContext), GenMapRedCtx (org.apache.hadoop.hive.ql.optimizer.GenMRProcContext.GenMapRedCtx), OperatorDesc (org.apache.hadoop.hive.ql.plan.OperatorDesc)

Aggregations

MapredWork (org.apache.hadoop.hive.ql.plan.MapredWork): 44 usages
Path (org.apache.hadoop.fs.Path): 17 usages
TableScanOperator (org.apache.hadoop.hive.ql.exec.TableScanOperator): 14 usages
OperatorDesc (org.apache.hadoop.hive.ql.plan.OperatorDesc): 14 usages
Serializable (java.io.Serializable): 13 usages
Operator (org.apache.hadoop.hive.ql.exec.Operator): 13 usages
MapWork (org.apache.hadoop.hive.ql.plan.MapWork): 13 usages
Task (org.apache.hadoop.hive.ql.exec.Task): 12 usages
MapRedTask (org.apache.hadoop.hive.ql.exec.mr.MapRedTask): 12 usages
TableDesc (org.apache.hadoop.hive.ql.plan.TableDesc): 11 usages
ArrayList (java.util.ArrayList): 10 usages
ConditionalTask (org.apache.hadoop.hive.ql.exec.ConditionalTask): 10 usages
PartitionDesc (org.apache.hadoop.hive.ql.plan.PartitionDesc): 10 usages
FileSinkOperator (org.apache.hadoop.hive.ql.exec.FileSinkOperator): 9 usages
LinkedHashMap (java.util.LinkedHashMap): 8 usages
ReduceSinkOperator (org.apache.hadoop.hive.ql.exec.ReduceSinkOperator): 8 usages
UnionOperator (org.apache.hadoop.hive.ql.exec.UnionOperator): 8 usages
SMBMapJoinOperator (org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator): 7 usages
GenMapRedCtx (org.apache.hadoop.hive.ql.optimizer.GenMRProcContext.GenMapRedCtx): 7 usages
ReduceWork (org.apache.hadoop.hive.ql.plan.ReduceWork): 7 usages