
Example 16 with MapRedTask

use of org.apache.hadoop.hive.ql.exec.mr.MapRedTask in project hive by apache.

the class CrossProductHandler method dispatch.

@Override
public Object dispatch(Node nd, Stack<Node> stack, Object... nodeOutputs) throws SemanticException {
    @SuppressWarnings("unchecked") Task<?> currTask = (Task<?>) nd;
    if (currTask instanceof MapRedTask) {
        MapRedTask mrTsk = (MapRedTask) currTask;
        MapredWork mrWrk = mrTsk.getWork();
        checkMapJoins(mrTsk);
        checkMRReducer(currTask.toString(), mrWrk);
    } else if (currTask instanceof ConditionalTask) {
        List<Task<?>> taskListInConditionalTask = ((ConditionalTask) currTask).getListTasks();
        for (Task<?> tsk : taskListInConditionalTask) {
            dispatch(tsk, stack, nodeOutputs);
        }
    } else if (currTask instanceof TezTask) {
        TezTask tezTask = (TezTask) currTask;
        TezWork tezWork = tezTask.getWork();
        checkMapJoins(tezWork);
        checkTezReducer(tezWork);
    }
    return null;
}
Also used: MapRedTask(org.apache.hadoop.hive.ql.exec.mr.MapRedTask) ConditionalTask(org.apache.hadoop.hive.ql.exec.ConditionalTask) TezTask(org.apache.hadoop.hive.ql.exec.tez.TezTask) Task(org.apache.hadoop.hive.ql.exec.Task) ArrayList(java.util.ArrayList) List(java.util.List)
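
The pattern above, unwrapping a ConditionalTask and recursing into its sub-tasks while handling concrete task types directly, recurs throughout Hive's physical-optimizer dispatchers. Below is a minimal, self-contained sketch of that recursion; SimpleTask, LeafTask, ContainerTask, and DispatchSketch are hypothetical stand-ins for illustration, not Hive classes.

import java.util.List;

// Hypothetical stand-ins for the task hierarchy, for illustration only.
abstract class SimpleTask {
    final String name;
    SimpleTask(String name) { this.name = name; }
}

class LeafTask extends SimpleTask {
    LeafTask(String name) { super(name); }
}

// Mirrors the role of ConditionalTask: a container whose children are visited recursively.
class ContainerTask extends SimpleTask {
    final List<SimpleTask> subTasks;
    ContainerTask(String name, List<SimpleTask> subTasks) {
        super(name);
        this.subTasks = subTasks;
    }
}

public class DispatchSketch {
    // Same shape as CrossProductHandler.dispatch: type-test the node, then
    // either handle it directly or recurse into the container's children.
    static void dispatch(SimpleTask task) {
        if (task instanceof ContainerTask) {
            for (SimpleTask sub : ((ContainerTask) task).subTasks) {
                dispatch(sub);
            }
        } else {
            System.out.println("checking task: " + task.name);
        }
    }

    public static void main(String[] args) {
        SimpleTask plan = new ContainerTask("cond",
                List.<SimpleTask>of(new LeafTask("mr-1"), new LeafTask("mr-2")));
        dispatch(plan);
    }
}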

Example 17 with MapRedTask

use of org.apache.hadoop.hive.ql.exec.mr.MapRedTask in project hive by apache.

the class CommonJoinTaskDispatcher method processCurrentTask.

@Override
public Task<?> processCurrentTask(MapRedTask currTask, ConditionalTask conditionalTask, Context context) throws SemanticException {
    // check whether this task contains a common join operator; if it does, getJoinOp returns it
    JoinOperator joinOp = getJoinOp(currTask);
    if (joinOp == null || joinOp.getConf().isFixedAsSorted()) {
        return null;
    }
    currTask.setTaskTag(Task.COMMON_JOIN);
    MapWork currWork = currTask.getWork().getMapWork();
    // create conditional work list and task list
    List<Serializable> listWorks = new ArrayList<Serializable>();
    List<Task<?>> listTasks = new ArrayList<Task<?>>();
    // create task to aliases mapping and alias to input file mapping for resolver
    // Must be deterministic order map for consistent q-test output across Java versions
    HashMap<Task<?>, Set<String>> taskToAliases = new LinkedHashMap<Task<?>, Set<String>>();
    Map<Path, List<String>> pathToAliases = currWork.getPathToAliases();
    Map<String, Operator<? extends OperatorDesc>> aliasToWork = currWork.getAliasToWork();
    // start to generate multiple map join tasks
    JoinDesc joinDesc = joinOp.getConf();
    if (aliasToSize == null) {
        aliasToSize = new HashMap<String, Long>();
    }
    try {
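        // total known size of all inputs; passed to cannotConvert below together
        // with the small-table threshold when sizing each big-table candidate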
        long aliasTotalKnownInputSize = getTotalKnownInputSize(context, currWork, pathToAliases, aliasToSize);
        Set<Integer> bigTableCandidates = MapJoinProcessor.getBigTableCandidates(joinDesc.getConds());
        // no table could be the big table; there is no need to convert
        if (bigTableCandidates.isEmpty()) {
            return null;
        }
        // if any of the bigTableCandidates is multi-sourced, bigTableCandidates should
        // contain only the multi-sourced ones, because a multi-sourced table can be neither hashed nor read directly
        bigTableCandidates = multiInsertBigTableCheck(joinOp, bigTableCandidates);
        Configuration conf = context.getConf();
        // If the sizes of at least n-1 tables in an n-way join are known, and their sum is smaller than
        // the threshold size, convert the join into a map-join and don't create a conditional task
        boolean convertJoinMapJoin = HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVECONVERTJOINNOCONDITIONALTASK);
        int bigTablePosition = -1;
        if (convertJoinMapJoin) {
            // The user-specified size threshold under which the small-table side must fit for a mapjoin
            long mapJoinSize = HiveConf.getLongVar(conf, HiveConf.ConfVars.HIVECONVERTJOINNOCONDITIONALTASKTHRESHOLD);
            Long bigTableSize = null;
            Set<String> aliases = aliasToWork.keySet();
            for (int tablePosition : bigTableCandidates) {
                Operator<?> parent = joinOp.getParentOperators().get(tablePosition);
                Set<String> participants = GenMapRedUtils.findAliases(currWork, parent);
                long sumOfOthers = Utilities.sumOfExcept(aliasToSize, aliases, participants);
                if (sumOfOthers < 0 || sumOfOthers > mapJoinSize) {
                    // some small alias is not known or too big
                    continue;
                }
                if (bigTableSize == null && bigTablePosition >= 0 && tablePosition < bigTablePosition) {
                    // prefer the right-most alias
                    continue;
                }
                long aliasSize = Utilities.sumOf(aliasToSize, participants);
                if (bigTableSize == null || bigTableSize < 0 || (aliasSize >= 0 && aliasSize >= bigTableSize)) {
                    bigTablePosition = tablePosition;
                    bigTableSize = aliasSize;
                }
            }
        }
        currWork.setLeftInputJoin(joinOp.getConf().isLeftInputJoin());
        currWork.setBaseSrc(joinOp.getConf().getBaseSrc());
        currWork.setMapAliases(joinOp.getConf().getMapAliases());
        if (bigTablePosition >= 0) {
            // create a map-join task, with the table at bigTablePosition as the big table
            MapRedTask newTask = convertTaskToMapJoinTask(currTask.getWork(), bigTablePosition);
            newTask.setTaskTag(Task.MAPJOIN_ONLY_NOBACKUP);
            newTask.setFetchSource(currTask.isFetchSource());
            replaceTask(currTask, newTask);
            // Can this task be merged with its child task? This can happen if a big table is being
            // joined with multiple small tables on different keys
            if ((newTask.getChildTasks() != null) && (newTask.getChildTasks().size() == 1)) {
                mergeMapJoinTaskIntoItsChildMapRedTask(newTask, conf);
            }
            return newTask;
        }
        long ThresholdOfSmallTblSizeSum = HiveConf.getLongVar(conf, HiveConf.ConfVars.HIVESMALLTABLESFILESIZE);
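        // Otherwise fall back to the conditional-task path: build one candidate
        // map-join task per feasible big-table position, and let the resolver pick
        // one at run time based on the actual input sizes.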
        for (int pos = 0; pos < joinOp.getNumParent(); pos++) {
            // this table cannot be the big table
            if (!bigTableCandidates.contains(pos)) {
                continue;
            }
            Operator<?> startOp = joinOp.getParentOperators().get(pos);
            Set<String> aliases = GenMapRedUtils.findAliases(currWork, startOp);
            long aliasKnownSize = Utilities.sumOf(aliasToSize, aliases);
            if (cannotConvert(aliasKnownSize, aliasTotalKnownInputSize, ThresholdOfSmallTblSizeSum)) {
                continue;
            }
            MapredWork newWork = SerializationUtilities.clonePlan(currTask.getWork());
            // create a map-join task, with the table at position pos as the big table
            MapRedTask newTask = convertTaskToMapJoinTask(newWork, pos);
            // add into conditional task
            listWorks.add(newTask.getWork());
            listTasks.add(newTask);
            newTask.setTaskTag(Task.CONVERTED_MAPJOIN);
            newTask.setFetchSource(currTask.isFetchSource());
            // set up backup task
            newTask.setBackupTask(currTask);
            newTask.setBackupChildrenTasks(currTask.getChildTasks());
            // put the mapping task to aliases
            taskToAliases.put(newTask, aliases);
        }
    } catch (Exception e) {
        throw new SemanticException("Generate Map Join Task Error: " + e.getMessage(), e);
    }
    if (listTasks.isEmpty()) {
        return currTask;
    }
    // insert current common join task to conditional task
    listWorks.add(currTask.getWork());
    listTasks.add(currTask);
    // clear JoinTree and OP Parse Context
    currWork.setLeftInputJoin(false);
    currWork.setBaseSrc(null);
    currWork.setMapAliases(null);
    // create conditional task and insert conditional task into task tree
    ConditionalWork cndWork = new ConditionalWork(listWorks);
    ConditionalTask cndTsk = (ConditionalTask) TaskFactory.get(cndWork);
    cndTsk.setListTasks(listTasks);
    // set resolver and resolver context
    cndTsk.setResolver(new ConditionalResolverCommonJoin());
    ConditionalResolverCommonJoinCtx resolverCtx = new ConditionalResolverCommonJoinCtx();
    resolverCtx.setPathToAliases(pathToAliases);
    resolverCtx.setAliasToKnownSize(aliasToSize);
    resolverCtx.setTaskToAliases(taskToAliases);
    resolverCtx.setCommonJoinTask(currTask);
    resolverCtx.setLocalTmpDir(context.getLocalScratchDir(false));
    resolverCtx.setHdfsTmpDir(context.getMRScratchDir());
    cndTsk.setResolverCtx(resolverCtx);
    // replace the current task with the new generated conditional task
    replaceTaskWithConditionalTask(currTask, cndTsk);
    return cndTsk;
}
Also used: JoinOperator(org.apache.hadoop.hive.ql.exec.JoinOperator) LateralViewForwardOperator(org.apache.hadoop.hive.ql.exec.LateralViewForwardOperator) FileSinkOperator(org.apache.hadoop.hive.ql.exec.FileSinkOperator) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) Operator(org.apache.hadoop.hive.ql.exec.Operator) Serializable(java.io.Serializable) ConditionalTask(org.apache.hadoop.hive.ql.exec.ConditionalTask) Task(org.apache.hadoop.hive.ql.exec.Task) MapRedTask(org.apache.hadoop.hive.ql.exec.mr.MapRedTask) HashSet(java.util.HashSet) Set(java.util.Set) Configuration(org.apache.hadoop.conf.Configuration) ArrayList(java.util.ArrayList) ConditionalWork(org.apache.hadoop.hive.ql.plan.ConditionalWork) LinkedHashMap(java.util.LinkedHashMap) ConditionalResolverCommonJoinCtx(org.apache.hadoop.hive.ql.plan.ConditionalResolverCommonJoin.ConditionalResolverCommonJoinCtx) MapredWork(org.apache.hadoop.hive.ql.plan.MapredWork) List(java.util.List) ConditionalResolverCommonJoin(org.apache.hadoop.hive.ql.plan.ConditionalResolverCommonJoin) SemanticException(org.apache.hadoop.hive.ql.parse.SemanticException) Path(org.apache.hadoop.fs.Path) MapWork(org.apache.hadoop.hive.ql.plan.MapWork) OperatorDesc(org.apache.hadoop.hive.ql.plan.OperatorDesc) JoinDesc(org.apache.hadoop.hive.ql.plan.JoinDesc)
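
The conversion decision in Example 17 is, at its core, size arithmetic: a position qualifies as the big table only if the summed sizes of every other alias are known and fit under the no-conditional-task threshold, and among qualifying positions the largest (right-most on ties) wins. The sketch below isolates that rule; pickBigTable and its parameters are hypothetical names, and it deliberately ignores Hive specifics such as alias-to-size maps and multi-sourced candidates.

public class BigTableSelectionSketch {
    // A position qualifies as the big table only when the sizes of all other
    // tables are known (>= 0) and their sum fits under the threshold,
    // mirroring the candidate loop in processCurrentTask.
    static int pickBigTable(long[] sizes, long threshold) {
        int bigPos = -1;
        long bigSize = -1;
        for (int pos = 0; pos < sizes.length; pos++) {
            long sumOfOthers = 0;
            boolean allKnown = true;
            for (int other = 0; other < sizes.length; other++) {
                if (other == pos) {
                    continue;
                }
                if (sizes[other] < 0) { // a negative entry means the size is unknown
                    allKnown = false;
                    break;
                }
                sumOfOthers += sizes[other];
            }
            if (!allKnown || sumOfOthers > threshold) {
                continue; // some small alias is unknown, or the small side is too big to hash
            }
            if (sizes[pos] >= bigSize) { // on ties, prefer the right-most position
                bigPos = pos;
                bigSize = sizes[pos];
            }
        }
        return bigPos;
    }

    public static void main(String[] args) {
        long[] sizes = {10_000_000L, 50_000L, 40_000L};
        System.out.println(pickBigTable(sizes, 100_000L)); // prints 0
    }
}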

Example 18 with MapRedTask

use of org.apache.hadoop.hive.ql.exec.mr.MapRedTask in project hive by apache.

the class CommonJoinTaskDispatcher method mergeMapJoinTaskIntoItsChildMapRedTask.

/*
   * A task and its child task have been converted from join to mapjoin.
   * See if the two tasks can be merged.
   */
private void mergeMapJoinTaskIntoItsChildMapRedTask(MapRedTask mapJoinTask, Configuration conf) throws SemanticException {
    // Step 1: Check whether mapJoinTask has a single child.
    // If so, check if we can merge mapJoinTask into that child.
    if (mapJoinTask.getChildTasks() == null || mapJoinTask.getChildTasks().size() > 1) {
        // There is no child task, or more than one child task, in which case we don't want to do anything.
        return;
    }
    Task<?> childTask = mapJoinTask.getChildTasks().get(0);
    if (!(childTask instanceof MapRedTask)) {
        // Nothing to do if it is not a MapReduce task.
        return;
    }
    MapRedTask childMapRedTask = (MapRedTask) childTask;
    MapWork mapJoinMapWork = mapJoinTask.getWork().getMapWork();
    MapWork childMapWork = childMapRedTask.getWork().getMapWork();
    Map<String, Operator<? extends OperatorDesc>> mapJoinAliasToWork = mapJoinMapWork.getAliasToWork();
    if (mapJoinAliasToWork.size() > 1) {
        // Do not merge if the MapredWork of MapJoin has multiple input aliases.
        return;
    }
    Entry<String, Operator<? extends OperatorDesc>> mapJoinAliasToWorkEntry = mapJoinAliasToWork.entrySet().iterator().next();
    String mapJoinAlias = mapJoinAliasToWorkEntry.getKey();
    TableScanOperator mapJoinTaskTableScanOperator = OperatorUtils.findSingleOperator(mapJoinAliasToWorkEntry.getValue(), TableScanOperator.class);
    if (mapJoinTaskTableScanOperator == null) {
        throw new SemanticException("Expected a " + TableScanOperator.getOperatorName() + " operator as the work associated with alias " + mapJoinAlias + ". Found a " + mapJoinAliasToWork.get(mapJoinAlias).getName() + " operator.");
    }
    Set<FileSinkOperator> mapJoinTaskFileSinkOperators = OperatorUtils.findOperators(mapJoinTaskTableScanOperator, FileSinkOperator.class);
    if (mapJoinTaskFileSinkOperators.isEmpty()) {
        throw new SemanticException("Cannot find the " + FileSinkOperator.getOperatorName() + " operator at the last operator of the MapJoin Task.");
    }
    if (mapJoinTaskFileSinkOperators.size() > 1) {
        LOG.warn("Multiple " + FileSinkOperator.getOperatorName() + " operators found at the last operator of the MapJoin Task.");
        return;
    }
    // The mapJoinTaskFileSinkOperator writes to a different directory
    FileSinkOperator mapJoinTaskFileSinkOperator = mapJoinTaskFileSinkOperators.iterator().next();
    Path childMRPath = mapJoinTaskFileSinkOperator.getConf().getDirName();
    List<String> childMRAliases = childMapWork.getPathToAliases().get(childMRPath);
    if (childMRAliases == null || childMRAliases.size() != 1) {
        return;
    }
    String childMRAlias = childMRAliases.get(0);
    // Sanity check to make sure there is no alias conflict after merge.
    for (Entry<Path, List<String>> entry : childMapWork.getPathToAliases().entrySet()) {
        Path path = entry.getKey();
        List<String> aliases = entry.getValue();
        if (path.equals(childMRPath)) {
            continue;
        }
        if (aliases.contains(mapJoinAlias)) {
            // an alias conflict should not happen here.
            return;
        }
    }
    MapredLocalWork mapJoinLocalWork = mapJoinMapWork.getMapRedLocalWork();
    MapredLocalWork childLocalWork = childMapWork.getMapRedLocalWork();
    if ((mapJoinLocalWork != null && mapJoinLocalWork.getBucketMapjoinContext() != null) || (childLocalWork != null && childLocalWork.getBucketMapjoinContext() != null)) {
        // Right now we do not handle the case where either side of the join is bucketed.
        // We should relax this constraint with a follow-up jira.
        return;
    }
    // Check whether the total size of the local tables will still be under the limit
    // after we merge mapJoinTask into its child. If so, merge.
    if (!isLocalTableTotalSizeUnderLimitAfterMerge(conf, mapJoinLocalWork, childLocalWork)) {
        // Do not merge.
        return;
    }
    TableScanOperator childMRTaskTableScanOperator = OperatorUtils.findSingleOperator(childMapWork.getAliasToWork().get(childMRAlias), TableScanOperator.class);
    if (childMRTaskTableScanOperator == null) {
        throw new SemanticException("Expected a " + TableScanOperator.getOperatorName() + " operator as the work associated with alias " + childMRAlias + ". Found a " + childMapWork.getAliasToWork().get(childMRAlias).getName() + " operator.");
    }
    List<Operator<? extends OperatorDesc>> parentsInMapJoinTask = mapJoinTaskFileSinkOperator.getParentOperators();
    List<Operator<? extends OperatorDesc>> childrenInChildMRTask = childMRTaskTableScanOperator.getChildOperators();
    if (parentsInMapJoinTask.size() > 1 || childrenInChildMRTask.size() > 1) {
        // Do not merge if we do not know how to connect two operator trees.
        return;
    }
    // Step 2: Merge mapJoinTask into the Map-side of its child.
    // Step 2.1: Connect the operator trees of two MapRedTasks.
    Operator<? extends OperatorDesc> parentInMapJoinTask = parentsInMapJoinTask.get(0);
    Operator<? extends OperatorDesc> childInChildMRTask = childrenInChildMRTask.get(0);
    parentInMapJoinTask.replaceChild(mapJoinTaskFileSinkOperator, childInChildMRTask);
    childInChildMRTask.replaceParent(childMRTaskTableScanOperator, parentInMapJoinTask);
    // Step 2.2: Replace the corresponding part childMRWork's MapWork.
    GenMapRedUtils.replaceMapWork(mapJoinAlias, childMRAlias, mapJoinMapWork, childMapWork);
    // Step 2.3: Fill up stuff in local work
    if (mapJoinLocalWork != null) {
        if (childLocalWork == null) {
            childMapWork.setMapRedLocalWork(mapJoinLocalWork);
        } else {
            childLocalWork.getAliasToFetchWork().putAll(mapJoinLocalWork.getAliasToFetchWork());
            childLocalWork.getAliasToWork().putAll(mapJoinLocalWork.getAliasToWork());
        }
    }
    // Step 2.4: Remove this MapJoin task
    List<Task<?>> parentTasks = mapJoinTask.getParentTasks();
    mapJoinTask.setParentTasks(null);
    mapJoinTask.setChildTasks(null);
    childMapRedTask.getParentTasks().remove(mapJoinTask);
    if (parentTasks != null) {
        childMapRedTask.getParentTasks().addAll(parentTasks);
        for (Task<?> parentTask : parentTasks) {
            parentTask.getChildTasks().remove(mapJoinTask);
            if (!parentTask.getChildTasks().contains(childMapRedTask)) {
                parentTask.getChildTasks().add(childMapRedTask);
            }
        }
    } else {
        if (physicalContext.getRootTasks().contains(mapJoinTask)) {
            physicalContext.removeFromRootTask(mapJoinTask);
            if (childMapRedTask.getParentTasks() != null && childMapRedTask.getParentTasks().size() == 0 && !physicalContext.getRootTasks().contains(childMapRedTask)) {
                physicalContext.addToRootTask(childMapRedTask);
            }
        }
    }
    if (childMapRedTask.getParentTasks().size() == 0) {
        childMapRedTask.setParentTasks(null);
    }
}
Also used: LateralViewForwardOperator(org.apache.hadoop.hive.ql.exec.LateralViewForwardOperator) FileSinkOperator(org.apache.hadoop.hive.ql.exec.FileSinkOperator) JoinOperator(org.apache.hadoop.hive.ql.exec.JoinOperator) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) Operator(org.apache.hadoop.hive.ql.exec.Operator) Path(org.apache.hadoop.fs.Path) ConditionalTask(org.apache.hadoop.hive.ql.exec.ConditionalTask) Task(org.apache.hadoop.hive.ql.exec.Task) MapRedTask(org.apache.hadoop.hive.ql.exec.mr.MapRedTask) MapWork(org.apache.hadoop.hive.ql.plan.MapWork) MapredLocalWork(org.apache.hadoop.hive.ql.plan.MapredLocalWork) ArrayList(java.util.ArrayList) List(java.util.List) OperatorDesc(org.apache.hadoop.hive.ql.plan.OperatorDesc) SemanticException(org.apache.hadoop.hive.ql.parse.SemanticException)
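
Step 2.4 above is pure task-graph surgery: the merged MapJoin task is detached, and its parents are rewired to point at the child task. A self-contained sketch of that splice on a toy graph (TaskGraphSpliceSketch and Node are hypothetical, not Hive's Task API):

import java.util.ArrayList;
import java.util.List;

public class TaskGraphSpliceSketch {
    // Hypothetical minimal task node with parent/child links.
    static class Node {
        final String name;
        final List<Node> parents = new ArrayList<>();
        final List<Node> children = new ArrayList<>();
        Node(String name) { this.name = name; }
    }

    static void link(Node parent, Node child) {
        parent.children.add(child);
        child.parents.add(parent);
    }

    // Splices 'node' out of the graph, connecting its parents directly to
    // 'child', the same rewiring done in step 2.4 above.
    static void spliceOut(Node node, Node child) {
        child.parents.remove(node);
        for (Node parent : node.parents) {
            parent.children.remove(node);
            if (!parent.children.contains(child)) {
                parent.children.add(child);
            }
            if (!child.parents.contains(parent)) {
                child.parents.add(parent);
            }
        }
        node.parents.clear();
        node.children.clear();
    }

    public static void main(String[] args) {
        Node root = new Node("root");
        Node mapJoin = new Node("mapJoin");
        Node childMR = new Node("childMR");
        link(root, mapJoin);
        link(mapJoin, childMR);
        spliceOut(mapJoin, childMR);
        System.out.println(root.children.get(0).name); // prints childMR
    }
}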

Example 19 with MapRedTask

use of org.apache.hadoop.hive.ql.exec.mr.MapRedTask in project hive by apache.

the class TestExecDriver method executePlan.

private void executePlan() throws Exception {
    String testName = new Exception().getStackTrace()[1].getMethodName();
    MapRedTask mrtask = new MapRedTask();
    TaskQueue taskQueue = new TaskQueue();
    mrtask.setWork(mr);
    mrtask.initialize(queryState, null, taskQueue, null);
    int exitVal = mrtask.execute();
    if (exitVal != 0) {
        LOG.error(testName + " execution failed with exit status: " + exitVal);
        fail(testName + " execution failed with exit status: " + exitVal);
    }
    LOG.info(testName + " execution completed successfully");
}
Also used: MapRedTask(org.apache.hadoop.hive.ql.exec.mr.MapRedTask) TaskQueue(org.apache.hadoop.hive.ql.TaskQueue) SemanticException(org.apache.hadoop.hive.ql.parse.SemanticException)
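
A detail worth noting in Example 19 is how the test discovers its own name: a freshly constructed Exception captures the current call stack, so frame 1 is the calling test method. A minimal sketch of the same trick (CallerNameSketch is a hypothetical name):

public class CallerNameSketch {
    // Frame 0 of the new exception's stack trace is this method;
    // frame 1 is whoever called it.
    static String callerMethodName() {
        return new Exception().getStackTrace()[1].getMethodName();
    }

    public static void main(String[] args) {
        System.out.println(callerMethodName()); // prints "main"
    }
}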

Example 20 with MapRedTask

use of org.apache.hadoop.hive.ql.exec.mr.MapRedTask in project hive by apache.

the class GenMapRedUtils method joinPlan.

/**
 * Merge the current task into the old task for the reducer
 *
 * @param currTask
 *          the current task for the current reducer
 * @param oldTask
 *          the old task for the current reducer
 * @param opProcCtx
 *          processing context
 */
public static void joinPlan(Task<?> currTask, Task<?> oldTask, GenMRProcContext opProcCtx) throws SemanticException {
    assert currTask != null && oldTask != null;
    TableScanOperator currTopOp = opProcCtx.getCurrTopOp();
    List<Task<?>> parTasks = null;
    // detach the current task from its parents; the parents are re-attached to the old task below
    if (currTask.getParentTasks() != null && !currTask.getParentTasks().isEmpty()) {
        parTasks = new ArrayList<Task<?>>();
        parTasks.addAll(currTask.getParentTasks());
        for (Task<?> parTask : parTasks) {
            parTask.removeDependentTask(currTask);
        }
    }
    if (currTopOp != null) {
        mergeInput(currTopOp, opProcCtx, oldTask, false);
    }
    if (parTasks != null) {
        for (Task<?> parTask : parTasks) {
            parTask.addDependentTask(oldTask);
        }
    }
    if (oldTask instanceof MapRedTask && currTask instanceof MapRedTask) {
        ((MapRedTask) currTask).getWork().getMapWork().mergingInto(((MapRedTask) oldTask).getWork().getMapWork());
    }
    opProcCtx.setCurrTopOp(null);
    opProcCtx.setCurrTask(oldTask);
}
Also used: MapRedTask(org.apache.hadoop.hive.ql.exec.mr.MapRedTask) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) SparkTask(org.apache.hadoop.hive.ql.exec.spark.SparkTask) ConditionalTask(org.apache.hadoop.hive.ql.exec.ConditionalTask) TezTask(org.apache.hadoop.hive.ql.exec.tez.TezTask) Task(org.apache.hadoop.hive.ql.exec.Task) MoveTask(org.apache.hadoop.hive.ql.exec.MoveTask) DependencyCollectionTask(org.apache.hadoop.hive.ql.exec.DependencyCollectionTask)
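
The rewiring in joinPlan is a detach-then-reattach of dependencies: the current task is cut loose from its parents, and those parents are then pointed at the old task that absorbs the reducer. A self-contained sketch of that transfer on a toy task type (JoinPlanSketch and PlanTask are hypothetical, though addDependentTask/removeDependentTask echo the method names used above):

import java.util.ArrayList;
import java.util.List;

public class JoinPlanSketch {
    // Hypothetical minimal task with dependency links; not Hive's Task class.
    static class PlanTask {
        final String name;
        final List<PlanTask> parents = new ArrayList<>();
        final List<PlanTask> children = new ArrayList<>();
        PlanTask(String name) { this.name = name; }

        void addDependentTask(PlanTask child) {
            if (!children.contains(child)) {
                children.add(child);
                child.parents.add(this);
            }
        }

        void removeDependentTask(PlanTask child) {
            children.remove(child);
            child.parents.remove(this);
        }
    }

    // Transfers currTask's parent dependencies onto oldTask, the same
    // detach-then-reattach rewiring joinPlan performs above.
    static void transferParents(PlanTask currTask, PlanTask oldTask) {
        List<PlanTask> parents = new ArrayList<>(currTask.parents);
        for (PlanTask parent : parents) {
            parent.removeDependentTask(currTask);
            parent.addDependentTask(oldTask);
        }
    }

    public static void main(String[] args) {
        PlanTask scan = new PlanTask("scan");
        PlanTask curr = new PlanTask("currReducer");
        PlanTask old = new PlanTask("oldReducer");
        scan.addDependentTask(curr);
        transferParents(curr, old);
        System.out.println(scan.children.get(0).name); // prints oldReducer
    }
}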

Aggregations

MapRedTask (org.apache.hadoop.hive.ql.exec.mr.MapRedTask): 21
Task (org.apache.hadoop.hive.ql.exec.Task): 9
MapredWork (org.apache.hadoop.hive.ql.plan.MapredWork): 9
ArrayList (java.util.ArrayList): 8
ConditionalTask (org.apache.hadoop.hive.ql.exec.ConditionalTask): 8
SemanticException (org.apache.hadoop.hive.ql.parse.SemanticException): 8
MapWork (org.apache.hadoop.hive.ql.plan.MapWork): 8
TableScanOperator (org.apache.hadoop.hive.ql.exec.TableScanOperator): 7
List (java.util.List): 6
Operator (org.apache.hadoop.hive.ql.exec.Operator): 5
Serializable (java.io.Serializable): 4
Path (org.apache.hadoop.fs.Path): 4
FileSinkOperator (org.apache.hadoop.hive.ql.exec.FileSinkOperator): 4
IOException (java.io.IOException): 3
HashSet (java.util.HashSet): 3
JoinOperator (org.apache.hadoop.hive.ql.exec.JoinOperator): 3
TezTask (org.apache.hadoop.hive.ql.exec.tez.TezTask): 3
OperatorDesc (org.apache.hadoop.hive.ql.plan.OperatorDesc): 3
ReduceWork (org.apache.hadoop.hive.ql.plan.ReduceWork): 3
HashMap (java.util.HashMap): 2