Example 16 with ReduceWork

use of org.apache.hadoop.hive.ql.plan.ReduceWork in project hive by apache.

the class GenMapRedUtils method initPlan.

/**
 * Initialize the current plan by adding it to root tasks.
 *
 * @param op
 *          the reduce sink operator encountered
 * @param opProcCtx
 *          processing context
 */
public static void initPlan(ReduceSinkOperator op, GenMRProcContext opProcCtx) throws SemanticException {
    Operator<? extends OperatorDesc> reducer = op.getChildOperators().get(0);
    Map<Operator<? extends OperatorDesc>, GenMapRedCtx> mapCurrCtx = opProcCtx.getMapCurrCtx();
    GenMapRedCtx mapredCtx = mapCurrCtx.get(op.getParentOperators().get(0));
    Task<? extends Serializable> currTask = mapredCtx.getCurrTask();
    MapredWork plan = (MapredWork) currTask.getWork();
    HashMap<Operator<? extends OperatorDesc>, Task<? extends Serializable>> opTaskMap = opProcCtx.getOpTaskMap();
    TableScanOperator currTopOp = opProcCtx.getCurrTopOp();
    opTaskMap.put(reducer, currTask);
    // Attach the reduce side: the reducer operator becomes the root of a new
    // ReduceWork on the current map-reduce plan.
    plan.setReduceWork(new ReduceWork());
    plan.getReduceWork().setReducer(reducer);
    ReduceSinkDesc desc = op.getConf();
    plan.getReduceWork().setNumReduceTasks(desc.getNumReducers());
    if (needsTagging(plan.getReduceWork())) {
        plan.getReduceWork().setNeedsTagging(true);
    }
    assert currTopOp != null;
    String currAliasId = opProcCtx.getCurrAliasId();
    if (!opProcCtx.isSeenOp(currTask, currTopOp)) {
        setTaskPlan(currAliasId, currTopOp, currTask, false, opProcCtx);
    }
    // Reset the per-branch context now that the top operator and alias are consumed.
    opProcCtx.setCurrTask(currTask);
    opProcCtx.setCurrTopOp(null);
    opProcCtx.setCurrAliasId(null);
}
Also used : ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) DemuxOperator(org.apache.hadoop.hive.ql.exec.DemuxOperator) JoinOperator(org.apache.hadoop.hive.ql.exec.JoinOperator) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) Operator(org.apache.hadoop.hive.ql.exec.Operator) MapJoinOperator(org.apache.hadoop.hive.ql.exec.MapJoinOperator) UnionOperator(org.apache.hadoop.hive.ql.exec.UnionOperator) FileSinkOperator(org.apache.hadoop.hive.ql.exec.FileSinkOperator) SMBMapJoinOperator(org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator) SparkTask(org.apache.hadoop.hive.ql.exec.spark.SparkTask) ConditionalTask(org.apache.hadoop.hive.ql.exec.ConditionalTask) Task(org.apache.hadoop.hive.ql.exec.Task) MoveTask(org.apache.hadoop.hive.ql.exec.MoveTask) MapRedTask(org.apache.hadoop.hive.ql.exec.mr.MapRedTask) DependencyCollectionTask(org.apache.hadoop.hive.ql.exec.DependencyCollectionTask) Serializable(java.io.Serializable) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) ReduceWork(org.apache.hadoop.hive.ql.plan.ReduceWork) MapredWork(org.apache.hadoop.hive.ql.plan.MapredWork) GenMapRedCtx(org.apache.hadoop.hive.ql.optimizer.GenMRProcContext.GenMapRedCtx) ReduceSinkDesc(org.apache.hadoop.hive.ql.plan.ReduceSinkDesc) OperatorDesc(org.apache.hadoop.hive.ql.plan.OperatorDesc)
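
The essential wiring initPlan performs, once the GenMR bookkeeping is stripped away, is small. A minimal sketch, assuming the reducer operator and reducer count are already known (attachReduceSide is a hypothetical helper, not part of Hive's API):

import org.apache.hadoop.hive.ql.exec.Operator;
import org.apache.hadoop.hive.ql.plan.MapredWork;
import org.apache.hadoop.hive.ql.plan.OperatorDesc;
import org.apache.hadoop.hive.ql.plan.ReduceWork;

// Hypothetical helper: attach a reduce side to an existing map-reduce plan.
static void attachReduceSide(MapredWork plan,
        Operator<? extends OperatorDesc> reducer, int numReducers) {
    ReduceWork rWork = new ReduceWork();
    // The reducer operator becomes the root of the reduce-side operator tree.
    rWork.setReducer(reducer);
    // A value of -1 leaves the reducer count to be decided at runtime.
    rWork.setNumReduceTasks(numReducers);
    plan.setReduceWork(rWork);
}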

Example 17 with ReduceWork

use of org.apache.hadoop.hive.ql.plan.ReduceWork in project hive by apache.

the class GenMapRedUtils method splitPlan.

/**
 * Handles a reduce sink cRS encountered in the pOP(parentTask with RS)-cRS-cOP(noTask) case:
 * creates a new child task for cRS-cOP and links the two tasks through a temporary file,
 * giving pOP-FS in the parent and TS-cRS-cOP in the child.
 *
 * @param cRS
 *          the reduce sink operator encountered
 * @param opProcCtx
 *          processing context
 */
static void splitPlan(ReduceSinkOperator cRS, GenMRProcContext opProcCtx) throws SemanticException {
    // Generate a new task
    ParseContext parseCtx = opProcCtx.getParseCtx();
    Task<? extends Serializable> parentTask = opProcCtx.getCurrTask();
    MapredWork childPlan = getMapRedWork(parseCtx);
    Task<? extends Serializable> childTask = TaskFactory.get(childPlan);
    Operator<? extends OperatorDesc> reducer = cRS.getChildOperators().get(0);
    // Add the reducer
    ReduceWork rWork = new ReduceWork();
    childPlan.setReduceWork(rWork);
    rWork.setReducer(reducer);
    ReduceSinkDesc desc = cRS.getConf();
    rWork.setNumReduceTasks(desc.getNumReducers());
    opProcCtx.getOpTaskMap().put(reducer, childTask);
    splitTasks(cRS, parentTask, childTask, opProcCtx);
}
Also used : MapredWork(org.apache.hadoop.hive.ql.plan.MapredWork) ParseContext(org.apache.hadoop.hive.ql.parse.ParseContext) ReduceWork(org.apache.hadoop.hive.ql.plan.ReduceWork) ReduceSinkDesc(org.apache.hadoop.hive.ql.plan.ReduceSinkDesc)
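
The temporary-file link described in the javadoc is easiest to see as a before/after picture; the shapes below come directly from the pOP-FS / TS-cRS-cOP notation used there:

// Before split:  pOP -> cRS -> cOP           (single tree; only pOP belongs to a task)
//
// After split:   parentTask: pOP -> FS       (FS writes the temporary file)
//                childTask:  TS -> cRS -> cOP  (TS reads the temporary file back)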

Example 18 with ReduceWork

use of org.apache.hadoop.hive.ql.plan.ReduceWork in project hive by apache.

the class GenMapRedUtils method isMergeRequiredForMr.

private static boolean isMergeRequiredForMr(HiveConf hconf, FileSinkOperator fsOp, Task<? extends Serializable> currTask) {
    if (fsOp.getConf().isLinkedFileSink()) {
        // Linked file sinks can inflate the number of output files, possibly
        // by a big margin, so merge aggressively.
        return (hconf.getBoolVar(ConfVars.HIVEMERGEMAPFILES) || hconf.getBoolVar(ConfVars.HIVEMERGEMAPREDFILES));
    }
    // Otherwise, separate flags control merging for a map-only job
    // or for a map-reduce job.
    if (currTask.getWork() instanceof MapredWork) {
        ReduceWork reduceWork = ((MapredWork) currTask.getWork()).getReduceWork();
        boolean mergeMapOnly = hconf.getBoolVar(ConfVars.HIVEMERGEMAPFILES) && reduceWork == null;
        boolean mergeMapRed = hconf.getBoolVar(ConfVars.HIVEMERGEMAPREDFILES) && reduceWork != null;
        if (mergeMapOnly || mergeMapRed) {
            return true;
        }
    }
    return false;
}
Also used : MapredWork(org.apache.hadoop.hive.ql.plan.MapredWork) ReduceWork(org.apache.hadoop.hive.ql.plan.ReduceWork)
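
Both flags consulted above are ordinary HiveConf booleans and can be toggled per session. A minimal sketch, assuming a fresh configuration (setBoolVar is HiveConf's typed setter):

import org.apache.hadoop.hive.conf.HiveConf;

// Enable merging for map-only output, disable it for map-reduce output.
static HiveConf mergeFlagsExample() {
    HiveConf hconf = new HiveConf();
    hconf.setBoolVar(HiveConf.ConfVars.HIVEMERGEMAPFILES, true);
    hconf.setBoolVar(HiveConf.ConfVars.HIVEMERGEMAPREDFILES, false);
    return hconf;
}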

Example 19 with ReduceWork

use of org.apache.hadoop.hive.ql.plan.ReduceWork in project hive by apache.

the class TestTezTask method setUp.

@SuppressWarnings("unchecked")
@Before
public void setUp() throws Exception {
    utils = mock(DagUtils.class);
    fs = mock(FileSystem.class);
    path = mock(Path.class);
    when(path.getFileSystem(any(Configuration.class))).thenReturn(fs);
    when(utils.getTezDir(any(Path.class))).thenReturn(path);
    when(utils.createVertex(any(JobConf.class), any(BaseWork.class), any(Path.class), any(FileSystem.class), any(Context.class), anyBoolean(), any(TezWork.class), any(VertexType.class), any(Map.class))).thenAnswer(new Answer<Vertex>() {

        @Override
        public Vertex answer(InvocationOnMock invocation) throws Throwable {
            Object[] args = invocation.getArguments();
            return Vertex.create(((BaseWork) args[1]).getName(), mock(ProcessorDescriptor.class), 0, mock(Resource.class));
        }
    });
    when(utils.createEdge(any(JobConf.class), any(Vertex.class), any(Vertex.class), any(TezEdgeProperty.class), any(BaseWork.class), any(TezWork.class))).thenAnswer(new Answer<Edge>() {

        @Override
        public Edge answer(InvocationOnMock invocation) throws Throwable {
            Object[] args = invocation.getArguments();
            return Edge.create((Vertex) args[1], (Vertex) args[2], mock(EdgeProperty.class));
        }
    });
    work = new TezWork("", null);
    mws = new MapWork[] { new MapWork(), new MapWork() };
    rws = new ReduceWork[] { new ReduceWork(), new ReduceWork() };
    work.addAll(mws);
    work.addAll(rws);
    int i = 0;
    for (BaseWork w : work.getAllWork()) {
        w.setName("Work " + (++i));
    }
    op = mock(Operator.class);
    LinkedHashMap<String, Operator<? extends OperatorDesc>> map = new LinkedHashMap<String, Operator<? extends OperatorDesc>>();
    map.put("foo", op);
    mws[0].setAliasToWork(map);
    mws[1].setAliasToWork(map);
    LinkedHashMap<Path, ArrayList<String>> pathMap = new LinkedHashMap<>();
    ArrayList<String> aliasList = new ArrayList<String>();
    aliasList.add("foo");
    pathMap.put(new Path("foo"), aliasList);
    mws[0].setPathToAliases(pathMap);
    mws[1].setPathToAliases(pathMap);
    rws[0].setReducer(op);
    rws[1].setReducer(op);
    TezEdgeProperty edgeProp = new TezEdgeProperty(EdgeType.SIMPLE_EDGE);
    work.connect(mws[0], rws[0], edgeProp);
    work.connect(mws[1], rws[0], edgeProp);
    work.connect(rws[0], rws[1], edgeProp);
    task = new TezTask(utils);
    task.setWork(work);
    task.setConsole(mock(LogHelper.class));
    QueryPlan mockQueryPlan = mock(QueryPlan.class);
    doReturn(UUID.randomUUID().toString()).when(mockQueryPlan).getQueryId();
    task.setQueryPlan(mockQueryPlan);
    conf = new JobConf();
    appLr = createResource("foo.jar");
    HiveConf hiveConf = new HiveConf();
    hiveConf.setVar(HiveConf.ConfVars.HIVE_AUTHORIZATION_MANAGER, "org.apache.hadoop.hive.ql.security.authorization.plugin.sqlstd.SQLStdHiveAuthorizerFactory");
    SessionState.start(hiveConf);
    session = mock(TezClient.class);
    sessionState = mock(TezSessionState.class);
    when(sessionState.getSession()).thenReturn(session);
    when(sessionState.reopen()).thenReturn(sessionState);
    // The first DAG submission fails with SessionNotRunning, forcing the task to
    // reopen the session; the retried submission then succeeds.
    when(session.submitDAG(any(DAG.class))).thenThrow(new SessionNotRunning("")).thenReturn(mock(DAGClient.class));
}
Also used : Operator(org.apache.hadoop.hive.ql.exec.Operator) Vertex(org.apache.tez.dag.api.Vertex) Configuration(org.apache.hadoop.conf.Configuration) LogHelper(org.apache.hadoop.hive.ql.session.SessionState.LogHelper) TezEdgeProperty(org.apache.hadoop.hive.ql.plan.TezEdgeProperty) ArrayList(java.util.ArrayList) QueryPlan(org.apache.hadoop.hive.ql.QueryPlan) LinkedHashMap(java.util.LinkedHashMap) TezClient(org.apache.tez.client.TezClient) SessionNotRunning(org.apache.tez.dag.api.SessionNotRunning) FileSystem(org.apache.hadoop.fs.FileSystem) HiveConf(org.apache.hadoop.hive.conf.HiveConf) JobConf(org.apache.hadoop.mapred.JobConf) BaseWork(org.apache.hadoop.hive.ql.plan.BaseWork) Path(org.apache.hadoop.fs.Path) Context(org.apache.hadoop.hive.ql.Context) ReduceWork(org.apache.hadoop.hive.ql.plan.ReduceWork) MapWork(org.apache.hadoop.hive.ql.plan.MapWork) InvocationOnMock(org.mockito.invocation.InvocationOnMock) DAGClient(org.apache.tez.dag.api.client.DAGClient) Map(java.util.Map) HashMap(java.util.HashMap) LinkedHashMap(java.util.LinkedHashMap) Edge(org.apache.tez.dag.api.Edge) OperatorDesc(org.apache.hadoop.hive.ql.plan.OperatorDesc) TezWork(org.apache.hadoop.hive.ql.plan.TezWork) VertexType(org.apache.hadoop.hive.ql.plan.TezWork.VertexType) Before(org.junit.Before)
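
The three connect() calls in setUp produce the following work graph, which the DAG assembled by the task under test mirrors vertex for vertex:

// mws[0] (MapWork) --\
//                     +--> rws[0] (ReduceWork) --> rws[1] (ReduceWork)
// mws[1] (MapWork) --/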

Example 20 with ReduceWork

use of org.apache.hadoop.hive.ql.plan.ReduceWork in project hive by apache.

the class SparkPlanGenerator method cloneJobConf.

@SuppressWarnings({ "unchecked" })
private JobConf cloneJobConf(BaseWork work) throws Exception {
    if (workToJobConf.containsKey(work)) {
        return workToJobConf.get(work);
    }
    JobConf cloned = new JobConf(jobConf);
    // Make sure we'll use a different plan path from the original one
    HiveConf.setVar(cloned, HiveConf.ConfVars.PLAN, "");
    try {
        cloned.setPartitionerClass(JavaUtils.loadClass(HiveConf.getVar(cloned, HiveConf.ConfVars.HIVEPARTITIONER)));
    } catch (ClassNotFoundException e) {
        String msg = "Could not find partitioner class: " + e.getMessage() + " which is specified by: " + HiveConf.ConfVars.HIVEPARTITIONER.varname;
        throw new IllegalArgumentException(msg, e);
    }
    if (work instanceof MapWork) {
        MapWork mapWork = (MapWork) work;
        cloned.setBoolean("mapred.task.is.map", true);
        List<Path> inputPaths = Utilities.getInputPaths(cloned, mapWork, scratchDir, context, false);
        Utilities.setInputPaths(cloned, inputPaths);
        Utilities.setMapWork(cloned, mapWork, scratchDir, false);
        Utilities.createTmpDirs(cloned, mapWork);
        if (work instanceof MergeFileWork) {
            MergeFileWork mergeFileWork = (MergeFileWork) work;
            cloned.set(Utilities.MAPRED_MAPPER_CLASS, MergeFileMapper.class.getName());
            cloned.set("mapred.input.format.class", mergeFileWork.getInputformat());
            cloned.setClass("mapred.output.format.class", MergeFileOutputFormat.class, FileOutputFormat.class);
        } else {
            cloned.set(Utilities.MAPRED_MAPPER_CLASS, ExecMapper.class.getName());
        }
        if (mapWork.getMaxSplitSize() != null) {
            HiveConf.setLongVar(cloned, HiveConf.ConfVars.MAPREDMAXSPLITSIZE, mapWork.getMaxSplitSize());
        }
        if (mapWork.getMinSplitSize() != null) {
            HiveConf.setLongVar(cloned, HiveConf.ConfVars.MAPREDMINSPLITSIZE, mapWork.getMinSplitSize());
        }
        if (mapWork.getMinSplitSizePerNode() != null) {
            HiveConf.setLongVar(cloned, HiveConf.ConfVars.MAPREDMINSPLITSIZEPERNODE, mapWork.getMinSplitSizePerNode());
        }
        if (mapWork.getMinSplitSizePerRack() != null) {
            HiveConf.setLongVar(cloned, HiveConf.ConfVars.MAPREDMINSPLITSIZEPERRACK, mapWork.getMinSplitSizePerRack());
        }
        // remember the JobConf cloned for each MapWork, so we won't clone for it again
        workToJobConf.put(work, cloned);
    } else if (work instanceof ReduceWork) {
        cloned.setBoolean("mapred.task.is.map", false);
        Utilities.setReduceWork(cloned, (ReduceWork) work, scratchDir, false);
        Utilities.createTmpDirs(cloned, (ReduceWork) work);
        cloned.set(Utilities.MAPRED_REDUCER_CLASS, ExecReducer.class.getName());
    }
    return cloned;
}
Also used : Path(org.apache.hadoop.fs.Path) MergeFileWork(org.apache.hadoop.hive.ql.io.merge.MergeFileWork) MergeFileMapper(org.apache.hadoop.hive.ql.io.merge.MergeFileMapper) MapWork(org.apache.hadoop.hive.ql.plan.MapWork) ReduceWork(org.apache.hadoop.hive.ql.plan.ReduceWork) JobConf(org.apache.hadoop.mapred.JobConf) ExecMapper(org.apache.hadoop.hive.ql.exec.mr.ExecMapper)
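
For the ReduceWork branch, the observable effect on the cloned JobConf is a handful of settings. A minimal sketch with the string keys inlined for illustration (an assumption on my part; the real code goes through the Utilities constants and also serializes the reduce plan into the conf):

import org.apache.hadoop.mapred.JobConf;

// Hypothetical condensed view of the reduce-side settings applied above.
static JobConf reduceSideConf(JobConf base) {
    JobConf cloned = new JobConf(base);
    cloned.setBoolean("mapred.task.is.map", false);
    cloned.set("mapred.reducer.class",
            "org.apache.hadoop.hive.ql.exec.mr.ExecReducer");
    return cloned;
}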

Aggregations

ReduceWork (org.apache.hadoop.hive.ql.plan.ReduceWork): 35
ArrayList (java.util.ArrayList): 13
MapWork (org.apache.hadoop.hive.ql.plan.MapWork): 13
Path (org.apache.hadoop.fs.Path): 11
BaseWork (org.apache.hadoop.hive.ql.plan.BaseWork): 11
Operator (org.apache.hadoop.hive.ql.exec.Operator): 10
ReduceSinkDesc (org.apache.hadoop.hive.ql.plan.ReduceSinkDesc): 9
MapJoinOperator (org.apache.hadoop.hive.ql.exec.MapJoinOperator): 8
ReduceSinkOperator (org.apache.hadoop.hive.ql.exec.ReduceSinkOperator): 8
JoinOperator (org.apache.hadoop.hive.ql.exec.JoinOperator): 7
MapredWork (org.apache.hadoop.hive.ql.plan.MapredWork): 7
TableScanOperator (org.apache.hadoop.hive.ql.exec.TableScanOperator): 6
FileSinkDesc (org.apache.hadoop.hive.ql.plan.FileSinkDesc): 6
OperatorDesc (org.apache.hadoop.hive.ql.plan.OperatorDesc): 6
SelectDesc (org.apache.hadoop.hive.ql.plan.SelectDesc): 6
SMBMapJoinOperator (org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator): 5
MapRedTask (org.apache.hadoop.hive.ql.exec.mr.MapRedTask): 5
HiveException (org.apache.hadoop.hive.ql.metadata.HiveException): 5
JobConf (org.apache.hadoop.mapred.JobConf): 5
IOException (java.io.IOException): 4