Example 6 with ReduceWork

use of org.apache.hadoop.hive.ql.plan.ReduceWork in project hive by apache.

the class TestGenTezWork method testCreateReduce.

@Test
public void testCreateReduce() throws SemanticException {
    // create map
    proc.process(rs, null, ctx, (Object[]) null);
    // create reduce
    proc.process(fs, null, ctx, (Object[]) null);
    TezWork work = ctx.currentTask.getWork();
    assertEquals(work.getAllWork().size(), 2);
    BaseWork w = work.getAllWork().get(1);
    assertTrue(w instanceof ReduceWork);
    assertTrue(work.getParents(w).contains(work.getAllWork().get(0)));
    ReduceWork rw = (ReduceWork) w;
    // need to make sure names are set for tez to connect things right
    assertNotNull(w.getName());
    // reduce work should have our fs op as the reducer
    assertSame(rw.getReducer(), fs);
    // should have severed the ties
    assertEquals(fs.getParentOperators().size(), 0);
}
Also used : ReduceWork(org.apache.hadoop.hive.ql.plan.ReduceWork) BaseWork(org.apache.hadoop.hive.ql.plan.BaseWork) TezWork(org.apache.hadoop.hive.ql.plan.TezWork) Test(org.junit.Test)
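
The traversal used by these assertions applies to any generated plan. Below is a minimal, hypothetical helper (not part of Hive) that walks a TezWork and reports each ReduceWork vertex, using only the accessors exercised in the test above:

private static void listReduceVertices(TezWork work) {
    // Visit every vertex in the generated DAG.
    for (BaseWork w : work.getAllWork()) {
        if (w instanceof ReduceWork) {
            ReduceWork rw = (ReduceWork) w;
            // Vertex names must be set for Tez to connect things right (see the assertion above).
            System.out.println("reduce vertex " + rw.getName()
                + ", parents: " + work.getParents(rw).size()
                + ", reducer operator: " + rw.getReducer());
        }
    }
}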

Example 7 with ReduceWork

use of org.apache.hadoop.hive.ql.plan.ReduceWork in project hive by apache.

the class ExecReducer method configure.

@Override
public void configure(JobConf job) {
    rowObjectInspector = new ObjectInspector[Byte.MAX_VALUE];
    ObjectInspector[] valueObjectInspector = new ObjectInspector[Byte.MAX_VALUE];
    ObjectInspector keyObjectInspector;
    if (isInfoEnabled) {
        try {
            LOG.info("conf classpath = " + Arrays.asList(((URLClassLoader) job.getClassLoader()).getURLs()));
            LOG.info("thread classpath = " + Arrays.asList(((URLClassLoader) Thread.currentThread().getContextClassLoader()).getURLs()));
        } catch (Exception e) {
            LOG.info("cannot get classpath: " + e.getMessage());
        }
    }
    jc = job;
    ReduceWork gWork = Utilities.getReduceWork(job);
    reducer = gWork.getReducer();
    // clear out any parents as reducer is the root
    reducer.setParentOperators(null);
    isTagged = gWork.getNeedsTagging();
    try {
        keyTableDesc = gWork.getKeyDesc();
        inputKeyDeserializer = ReflectionUtils.newInstance(keyTableDesc.getDeserializerClass(), null);
        SerDeUtils.initializeSerDe(inputKeyDeserializer, null, keyTableDesc.getProperties(), null);
        keyObjectInspector = inputKeyDeserializer.getObjectInspector();
        valueTableDesc = new TableDesc[gWork.getTagToValueDesc().size()];
        for (int tag = 0; tag < gWork.getTagToValueDesc().size(); tag++) {
            // We should initialize the SerDe with the TypeInfo when available.
            valueTableDesc[tag] = gWork.getTagToValueDesc().get(tag);
            inputValueDeserializer[tag] = ReflectionUtils.newInstance(valueTableDesc[tag].getDeserializerClass(), null);
            SerDeUtils.initializeSerDe(inputValueDeserializer[tag], null, valueTableDesc[tag].getProperties(), null);
            valueObjectInspector[tag] = inputValueDeserializer[tag].getObjectInspector();
            ArrayList<ObjectInspector> ois = new ArrayList<ObjectInspector>();
            ois.add(keyObjectInspector);
            ois.add(valueObjectInspector[tag]);
            rowObjectInspector[tag] = ObjectInspectorFactory.getStandardStructObjectInspector(Utilities.reduceFieldNameList, ois);
        }
    } catch (Exception e) {
        throw new RuntimeException(e);
    }
    MapredContext.init(false, new JobConf(jc));
    // initialize reduce operator tree
    try {
        LOG.info(reducer.dump(0));
        reducer.initialize(jc, rowObjectInspector);
    } catch (Throwable e) {
        abort = true;
        if (e instanceof OutOfMemoryError) {
            // Don't create a new object if we are already out of memory
            throw (OutOfMemoryError) e;
        } else {
            throw new RuntimeException("Reduce operator initialization failed", e);
        }
    }
}
Also used : ObjectInspector(org.apache.hadoop.hive.serde2.objectinspector.ObjectInspector) ArrayList(java.util.ArrayList) ReduceWork(org.apache.hadoop.hive.ql.plan.ReduceWork) JobConf(org.apache.hadoop.mapred.JobConf) IOException(java.io.IOException) SerDeException(org.apache.hadoop.hive.serde2.SerDeException) HiveException(org.apache.hadoop.hive.ql.metadata.HiveException)
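
The ReduceWork retrieved here via Utilities.getReduceWork(job) is the plan that the client side serialized into the JobConf; Example 9 below shows the producing side via Utilities.setReduceWork. A minimal sketch of that round trip, assuming jobConf, reduceWork, and scratchDir are already in scope:

// Client side: publish the reduce plan into the job configuration.
// scratchDir is assumed to be the query's scratch directory (a Path), as in Example 9.
Utilities.setReduceWork(jobConf, reduceWork, scratchDir, false);

// Task side (as in ExecReducer.configure above): recover the plan and its root operator.
ReduceWork recovered = Utilities.getReduceWork(jobConf);
Operator<?> rootReducer = recovered.getReducer();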

Example 8 with ReduceWork

use of org.apache.hadoop.hive.ql.plan.ReduceWork in project hive by apache.

the class SparkPlanGenerator method generate.

private SparkTran generate(BaseWork work, SparkWork sparkWork) throws Exception {
    initStatsPublisher(work);
    JobConf newJobConf = cloneJobConf(work);
    checkSpecs(work, newJobConf);
    byte[] confBytes = KryoSerializer.serializeJobConf(newJobConf);
    boolean caching = isCachingWork(work, sparkWork);
    if (work instanceof MapWork) {
        // Create tmp dir for MergeFileWork
        if (work instanceof MergeFileWork) {
            Path outputPath = ((MergeFileWork) work).getOutputDir();
            Path tempOutPath = Utilities.toTempPath(outputPath);
            FileSystem fs = outputPath.getFileSystem(jobConf);
            try {
                if (!fs.exists(tempOutPath)) {
                    fs.mkdirs(tempOutPath);
                }
            } catch (IOException e) {
                throw new RuntimeException("Can't make path " + outputPath + " : " + e.getMessage());
            }
        }
        MapTran mapTran = new MapTran(caching);
        HiveMapFunction mapFunc = new HiveMapFunction(confBytes, sparkReporter);
        mapTran.setMapFunction(mapFunc);
        return mapTran;
    } else if (work instanceof ReduceWork) {
        ReduceTran reduceTran = new ReduceTran(caching);
        HiveReduceFunction reduceFunc = new HiveReduceFunction(confBytes, sparkReporter);
        reduceTran.setReduceFunction(reduceFunc);
        return reduceTran;
    } else {
        throw new IllegalStateException("AssertionError: expected either MapWork or ReduceWork, " + "but found " + work.getClass().getName());
    }
}
Also used : Path(org.apache.hadoop.fs.Path) MergeFileWork(org.apache.hadoop.hive.ql.io.merge.MergeFileWork) IOException(java.io.IOException) ReduceWork(org.apache.hadoop.hive.ql.plan.ReduceWork) MapWork(org.apache.hadoop.hive.ql.plan.MapWork) FileSystem(org.apache.hadoop.fs.FileSystem) JobConf(org.apache.hadoop.mapred.JobConf)

Example 9 with ReduceWork

use of org.apache.hadoop.hive.ql.plan.ReduceWork in project hive by apache.

the class SparkPlanGenerator method cloneJobConf.

@SuppressWarnings({ "unchecked" })
private JobConf cloneJobConf(BaseWork work) throws Exception {
    if (workToJobConf.containsKey(work)) {
        return workToJobConf.get(work);
    }
    JobConf cloned = new JobConf(jobConf);
    // Make sure we'll use a different plan path from the original one
    HiveConf.setVar(cloned, HiveConf.ConfVars.PLAN, "");
    try {
        cloned.setPartitionerClass(JavaUtils.loadClass(HiveConf.getVar(cloned, HiveConf.ConfVars.HIVEPARTITIONER)));
    } catch (ClassNotFoundException e) {
        String msg = "Could not find partitioner class: " + e.getMessage() + " which is specified by: " + HiveConf.ConfVars.HIVEPARTITIONER.varname;
        throw new IllegalArgumentException(msg, e);
    }
    if (work instanceof MapWork) {
        cloned.setBoolean("mapred.task.is.map", true);
        List<Path> inputPaths = Utilities.getInputPaths(cloned, (MapWork) work, scratchDir, context, false);
        Utilities.setInputPaths(cloned, inputPaths);
        Utilities.setMapWork(cloned, (MapWork) work, scratchDir, false);
        Utilities.createTmpDirs(cloned, (MapWork) work);
        if (work instanceof MergeFileWork) {
            MergeFileWork mergeFileWork = (MergeFileWork) work;
            cloned.set(Utilities.MAPRED_MAPPER_CLASS, MergeFileMapper.class.getName());
            cloned.set("mapred.input.format.class", mergeFileWork.getInputformat());
            cloned.setClass("mapred.output.format.class", MergeFileOutputFormat.class, FileOutputFormat.class);
        } else {
            cloned.set(Utilities.MAPRED_MAPPER_CLASS, ExecMapper.class.getName());
        }
        if (((MapWork) work).getMinSplitSize() != null) {
            HiveConf.setLongVar(cloned, HiveConf.ConfVars.MAPREDMINSPLITSIZE, ((MapWork) work).getMinSplitSize());
        }
        // remember the JobConf cloned for each MapWork, so we won't clone for it again
        workToJobConf.put(work, cloned);
    } else if (work instanceof ReduceWork) {
        cloned.setBoolean("mapred.task.is.map", false);
        Utilities.setReduceWork(cloned, (ReduceWork) work, scratchDir, false);
        Utilities.createTmpDirs(cloned, (ReduceWork) work);
        cloned.set(Utilities.MAPRED_REDUCER_CLASS, ExecReducer.class.getName());
    }
    return cloned;
}
Also used : Path(org.apache.hadoop.fs.Path) MergeFileWork(org.apache.hadoop.hive.ql.io.merge.MergeFileWork) MergeFileMapper(org.apache.hadoop.hive.ql.io.merge.MergeFileMapper) MapWork(org.apache.hadoop.hive.ql.plan.MapWork) ReduceWork(org.apache.hadoop.hive.ql.plan.ReduceWork) JobConf(org.apache.hadoop.mapred.JobConf) ExecMapper(org.apache.hadoop.hive.ql.exec.mr.ExecMapper)

Example 10 with ReduceWork

use of org.apache.hadoop.hive.ql.plan.ReduceWork in project hive by apache.

the class GenMapRedUtils method initPlan.

/**
   * Initialize the current plan by adding it to root tasks.
   *
   * @param op
   *          the reduce sink operator encountered
   * @param opProcCtx
   *          processing context
   */
public static void initPlan(ReduceSinkOperator op, GenMRProcContext opProcCtx) throws SemanticException {
    Operator<? extends OperatorDesc> reducer = op.getChildOperators().get(0);
    Map<Operator<? extends OperatorDesc>, GenMapRedCtx> mapCurrCtx = opProcCtx.getMapCurrCtx();
    GenMapRedCtx mapredCtx = mapCurrCtx.get(op.getParentOperators().get(0));
    Task<? extends Serializable> currTask = mapredCtx.getCurrTask();
    MapredWork plan = (MapredWork) currTask.getWork();
    HashMap<Operator<? extends OperatorDesc>, Task<? extends Serializable>> opTaskMap = opProcCtx.getOpTaskMap();
    TableScanOperator currTopOp = opProcCtx.getCurrTopOp();
    opTaskMap.put(reducer, currTask);
    plan.setReduceWork(new ReduceWork());
    plan.getReduceWork().setReducer(reducer);
    ReduceSinkDesc desc = op.getConf();
    plan.getReduceWork().setNumReduceTasks(desc.getNumReducers());
    if (needsTagging(plan.getReduceWork())) {
        plan.getReduceWork().setNeedsTagging(true);
    }
    assert currTopOp != null;
    String currAliasId = opProcCtx.getCurrAliasId();
    if (!opProcCtx.isSeenOp(currTask, currTopOp)) {
        setTaskPlan(currAliasId, currTopOp, currTask, false, opProcCtx);
    }
    currTopOp = null;
    currAliasId = null;
    opProcCtx.setCurrTask(currTask);
    opProcCtx.setCurrTopOp(currTopOp);
    opProcCtx.setCurrAliasId(currAliasId);
}
Also used : ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) DemuxOperator(org.apache.hadoop.hive.ql.exec.DemuxOperator) JoinOperator(org.apache.hadoop.hive.ql.exec.JoinOperator) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) Operator(org.apache.hadoop.hive.ql.exec.Operator) MapJoinOperator(org.apache.hadoop.hive.ql.exec.MapJoinOperator) UnionOperator(org.apache.hadoop.hive.ql.exec.UnionOperator) FileSinkOperator(org.apache.hadoop.hive.ql.exec.FileSinkOperator) SMBMapJoinOperator(org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator) SparkTask(org.apache.hadoop.hive.ql.exec.spark.SparkTask) ConditionalTask(org.apache.hadoop.hive.ql.exec.ConditionalTask) Task(org.apache.hadoop.hive.ql.exec.Task) MoveTask(org.apache.hadoop.hive.ql.exec.MoveTask) MapRedTask(org.apache.hadoop.hive.ql.exec.mr.MapRedTask) DependencyCollectionTask(org.apache.hadoop.hive.ql.exec.DependencyCollectionTask) Serializable(java.io.Serializable) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) ReduceWork(org.apache.hadoop.hive.ql.plan.ReduceWork) MapredWork(org.apache.hadoop.hive.ql.plan.MapredWork) GenMapRedCtx(org.apache.hadoop.hive.ql.optimizer.GenMRProcContext.GenMapRedCtx) ReduceSinkDesc(org.apache.hadoop.hive.ql.plan.ReduceSinkDesc) OperatorDesc(org.apache.hadoop.hive.ql.plan.OperatorDesc)
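
Stripped of the MapReduce bookkeeping, the reduce-side setup in initPlan comes down to a few ReduceWork calls. A minimal sketch, assuming reducer, desc (the ReduceSinkDesc), and the MapredWork plan are available as in the method above:

ReduceWork reduceWork = new ReduceWork();
// The child of the ReduceSinkOperator becomes the root of the reduce-side operator tree.
reduceWork.setReducer(reducer);
// Reducer parallelism comes straight from the ReduceSinkDesc.
reduceWork.setNumReduceTasks(desc.getNumReducers());
// initPlan only enables tagging when needsTagging(...) says so; set unconditionally here for brevity.
reduceWork.setNeedsTagging(true);
plan.setReduceWork(reduceWork);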

Aggregations

ReduceWork (org.apache.hadoop.hive.ql.plan.ReduceWork): 34
ArrayList (java.util.ArrayList): 12
MapWork (org.apache.hadoop.hive.ql.plan.MapWork): 12
Path (org.apache.hadoop.fs.Path): 11
Operator (org.apache.hadoop.hive.ql.exec.Operator): 10
BaseWork (org.apache.hadoop.hive.ql.plan.BaseWork): 10
ReduceSinkDesc (org.apache.hadoop.hive.ql.plan.ReduceSinkDesc): 9
MapJoinOperator (org.apache.hadoop.hive.ql.exec.MapJoinOperator): 8
ReduceSinkOperator (org.apache.hadoop.hive.ql.exec.ReduceSinkOperator): 8
JoinOperator (org.apache.hadoop.hive.ql.exec.JoinOperator): 7
MapredWork (org.apache.hadoop.hive.ql.plan.MapredWork): 7
TableScanOperator (org.apache.hadoop.hive.ql.exec.TableScanOperator): 6
FileSinkDesc (org.apache.hadoop.hive.ql.plan.FileSinkDesc): 6
OperatorDesc (org.apache.hadoop.hive.ql.plan.OperatorDesc): 6
SelectDesc (org.apache.hadoop.hive.ql.plan.SelectDesc): 6
SMBMapJoinOperator (org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator): 5
MapRedTask (org.apache.hadoop.hive.ql.exec.mr.MapRedTask): 5
JobConf (org.apache.hadoop.mapred.JobConf): 5
IOException (java.io.IOException): 4
CommonMergeJoinOperator (org.apache.hadoop.hive.ql.exec.CommonMergeJoinOperator): 4