Example 41 with MapredWork

Use of org.apache.hadoop.hive.ql.plan.MapredWork in project hive by apache.

The class CommonJoinTaskDispatcher, method processCurrentTask.

@Override
public Task<? extends Serializable> processCurrentTask(MapRedTask currTask, ConditionalTask conditionalTask, Context context) throws SemanticException {
    // check whether the task contains a common join operator; if so, retrieve it
    JoinOperator joinOp = getJoinOp(currTask);
    if (joinOp == null || joinOp.getConf().isFixedAsSorted()) {
        return null;
    }
    currTask.setTaskTag(Task.COMMON_JOIN);
    MapWork currWork = currTask.getWork().getMapWork();
    // create conditional work list and task list
    List<Serializable> listWorks = new ArrayList<Serializable>();
    List<Task<? extends Serializable>> listTasks = new ArrayList<Task<? extends Serializable>>();
    // create task to aliases mapping and alias to input file mapping for resolver
    // Must be deterministic order map for consistent q-test output across Java versions
    HashMap<Task<? extends Serializable>, Set<String>> taskToAliases = new LinkedHashMap<Task<? extends Serializable>, Set<String>>();
    HashMap<Path, ArrayList<String>> pathToAliases = currWork.getPathToAliases();
    Map<String, Operator<? extends OperatorDesc>> aliasToWork = currWork.getAliasToWork();
    // get parseCtx for this Join Operator
    ParseContext parseCtx = physicalContext.getParseContext();
    // start to generate multiple map join tasks
    JoinDesc joinDesc = joinOp.getConf();
    if (aliasToSize == null) {
        aliasToSize = new HashMap<String, Long>();
    }
    try {
        long aliasTotalKnownInputSize = getTotalKnownInputSize(context, currWork, pathToAliases, aliasToSize);
        Set<Integer> bigTableCandidates = MapJoinProcessor.getBigTableCandidates(joinDesc.getConds());
        // no table could be the big table; there is no need to convert
        if (bigTableCandidates.isEmpty()) {
            return null;
        }
        // if any big-table candidate is multi-sourced, the candidate set should contain
        // only multi-sourced inputs, because a multi-sourced input can neither be hashed
        // nor read directly
        bigTableCandidates = multiInsertBigTableCheck(joinOp, bigTableCandidates);
        Configuration conf = context.getConf();
        // If the sizes of at least n-1 tables in an n-way join are known and their sum is
        // smaller than the threshold size, convert the join into a map-join and don't
        // create a conditional task
        boolean convertJoinMapJoin = HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVECONVERTJOINNOCONDITIONALTASK);
        int bigTablePosition = -1;
        if (convertJoinMapJoin) {
            // the user-specified size threshold for the small-table side of a map-join
            long mapJoinSize = HiveConf.getLongVar(conf, HiveConf.ConfVars.HIVECONVERTJOINNOCONDITIONALTASKTHRESHOLD);
            Long bigTableSize = null;
            Set<String> aliases = aliasToWork.keySet();
            for (int tablePosition : bigTableCandidates) {
                Operator<?> parent = joinOp.getParentOperators().get(tablePosition);
                Set<String> participants = GenMapRedUtils.findAliases(currWork, parent);
                long sumOfOthers = Utilities.sumOfExcept(aliasToSize, aliases, participants);
                if (sumOfOthers < 0 || sumOfOthers > mapJoinSize) {
                    // the size of some small alias is unknown, or the small side is too big
                    continue;
                }
                if (bigTableSize == null && bigTablePosition >= 0 && tablePosition < bigTablePosition) {
                    // prefer the rightmost alias
                    continue;
                }
                long aliasSize = Utilities.sumOf(aliasToSize, participants);
                if (bigTableSize == null || bigTableSize < 0 || (aliasSize >= 0 && aliasSize >= bigTableSize)) {
                    bigTablePosition = tablePosition;
                    bigTableSize = aliasSize;
                }
            }
        }
        currWork.setLeftInputJoin(joinOp.getConf().isLeftInputJoin());
        currWork.setBaseSrc(joinOp.getConf().getBaseSrc());
        currWork.setMapAliases(joinOp.getConf().getMapAliases());
        if (bigTablePosition >= 0) {
            // create map join task and set big table as bigTablePosition
            MapRedTask newTask = convertTaskToMapJoinTask(currTask.getWork(), bigTablePosition);
            newTask.setTaskTag(Task.MAPJOIN_ONLY_NOBACKUP);
            newTask.setFetchSource(currTask.isFetchSource());
            replaceTask(currTask, newTask);
            // can this task be merged with its child task? This can happen if a big table
            // is joined with multiple small tables on different keys
            if ((newTask.getChildTasks() != null) && (newTask.getChildTasks().size() == 1)) {
                mergeMapJoinTaskIntoItsChildMapRedTask(newTask, conf);
            }
            return newTask;
        }
        long thresholdOfSmallTblSizeSum = HiveConf.getLongVar(conf, HiveConf.ConfVars.HIVESMALLTABLESFILESIZE);
        for (int pos = 0; pos < joinOp.getNumParent(); pos++) {
            // this table cannot be the big table
            if (!bigTableCandidates.contains(pos)) {
                continue;
            }
            Operator<?> startOp = joinOp.getParentOperators().get(pos);
            Set<String> aliases = GenMapRedUtils.findAliases(currWork, startOp);
            long aliasKnownSize = Utilities.sumOf(aliasToSize, aliases);
            if (cannotConvert(aliasKnownSize, aliasTotalKnownInputSize, thresholdOfSmallTblSizeSum)) {
                continue;
            }
            MapredWork newWork = SerializationUtilities.clonePlan(currTask.getWork());
            // create a map-join task with the big table at position pos
            MapRedTask newTask = convertTaskToMapJoinTask(newWork, pos);
            // add the new task to the conditional task's work and task lists
            listWorks.add(newTask.getWork());
            listTasks.add(newTask);
            newTask.setTaskTag(Task.CONVERTED_MAPJOIN);
            newTask.setFetchSource(currTask.isFetchSource());
            // set up backup task
            newTask.setBackupTask(currTask);
            newTask.setBackupChildrenTasks(currTask.getChildTasks());
            // record the task-to-aliases mapping for the resolver
            taskToAliases.put(newTask, aliases);
        }
    } catch (Exception e) {
        throw new SemanticException("Generate Map Join Task Error: " + e.getMessage(), e);
    }
    if (listTasks.isEmpty()) {
        return currTask;
    }
    // add the current common join task to the conditional task as the fallback
    listWorks.add(currTask.getWork());
    listTasks.add(currTask);
    // clear JoinTree and OP Parse Context
    currWork.setLeftInputJoin(false);
    currWork.setBaseSrc(null);
    currWork.setMapAliases(null);
    // create conditional task and insert conditional task into task tree
    ConditionalWork cndWork = new ConditionalWork(listWorks);
    ConditionalTask cndTsk = (ConditionalTask) TaskFactory.get(cndWork, parseCtx.getConf());
    cndTsk.setListTasks(listTasks);
    // set resolver and resolver context
    cndTsk.setResolver(new ConditionalResolverCommonJoin());
    ConditionalResolverCommonJoinCtx resolverCtx = new ConditionalResolverCommonJoinCtx();
    resolverCtx.setPathToAliases(pathToAliases);
    resolverCtx.setAliasToKnownSize(aliasToSize);
    resolverCtx.setTaskToAliases(taskToAliases);
    resolverCtx.setCommonJoinTask(currTask);
    resolverCtx.setLocalTmpDir(context.getLocalScratchDir(false));
    resolverCtx.setHdfsTmpDir(context.getMRScratchDir());
    cndTsk.setResolverCtx(resolverCtx);
    // replace the current task with the new generated conditional task
    replaceTaskWithConditionalTask(currTask, cndTsk);
    return cndTsk;
}
Also used : JoinOperator(org.apache.hadoop.hive.ql.exec.JoinOperator) LateralViewForwardOperator(org.apache.hadoop.hive.ql.exec.LateralViewForwardOperator) FileSinkOperator(org.apache.hadoop.hive.ql.exec.FileSinkOperator) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) Operator(org.apache.hadoop.hive.ql.exec.Operator) Serializable(java.io.Serializable) ConditionalTask(org.apache.hadoop.hive.ql.exec.ConditionalTask) Task(org.apache.hadoop.hive.ql.exec.Task) MapRedTask(org.apache.hadoop.hive.ql.exec.mr.MapRedTask) HashSet(java.util.HashSet) Set(java.util.Set) Configuration(org.apache.hadoop.conf.Configuration) ArrayList(java.util.ArrayList) ConditionalWork(org.apache.hadoop.hive.ql.plan.ConditionalWork) LinkedHashMap(java.util.LinkedHashMap) ConditionalResolverCommonJoinCtx(org.apache.hadoop.hive.ql.plan.ConditionalResolverCommonJoin.ConditionalResolverCommonJoinCtx) MapredWork(org.apache.hadoop.hive.ql.plan.MapredWork) ConditionalResolverCommonJoin(org.apache.hadoop.hive.ql.plan.ConditionalResolverCommonJoin) SemanticException(org.apache.hadoop.hive.ql.parse.SemanticException) Path(org.apache.hadoop.fs.Path) UnsupportedEncodingException(java.io.UnsupportedEncodingException) MapWork(org.apache.hadoop.hive.ql.plan.MapWork) ParseContext(org.apache.hadoop.hive.ql.parse.ParseContext) OperatorDesc(org.apache.hadoop.hive.ql.plan.OperatorDesc) JoinDesc(org.apache.hadoop.hive.ql.plan.JoinDesc)
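The heart of the method above is the big-table selection loop: among the join positions that may legally be streamed, it keeps the largest candidate whose co-inputs all have known sizes summing under the no-conditional-task threshold. A minimal, self-contained sketch of that selection policy, assuming a plain sizes array in place of the aliasToSize map and the Utilities.sumOf/sumOfExcept helpers (class and method names here are hypothetical, not Hive APIs):

import java.util.Arrays;
import java.util.HashSet;
import java.util.Set;

// Hypothetical stand-in for the big-table selection loop above.
public class BigTableChooser {

    // sizes[i] is the known input size of join position i; -1 means unknown.
    static int chooseBigTable(long[] sizes, Set<Integer> candidates, long threshold) {
        int bigPos = -1;
        long bigSize = -1;
        for (int pos = 0; pos < sizes.length; pos++) {
            if (!candidates.contains(pos)) {
                continue;
            }
            // sum the sizes of all other (small) tables; -1 if any is unknown
            long sumOfOthers = 0;
            for (int i = 0; i < sizes.length && sumOfOthers >= 0; i++) {
                if (i != pos) {
                    sumOfOthers = sizes[i] < 0 ? -1 : sumOfOthers + sizes[i];
                }
            }
            if (sumOfOthers < 0 || sumOfOthers > threshold) {
                continue; // a small-table size is unknown, or the small side is too big
            }
            if (sizes[pos] >= bigSize) { // >= so later (rightmost) positions win ties
                bigPos = pos;
                bigSize = sizes[pos];
            }
        }
        return bigPos;
    }

    public static void main(String[] args) {
        long[] sizes = {10_000_000L, 5_000L, 7_000L};
        Set<Integer> candidates = new HashSet<>(Arrays.asList(0, 2));
        System.out.println(chooseBigTable(sizes, candidates, 1_000_000L)); // prints 0
    }
}

With these numbers, position 0 wins: its co-inputs total 12,000 bytes, under the threshold, while position 2's co-inputs include the 10 MB table and exceed it, so no unconditional conversion would pick it.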

Example 42 with MapredWork

Use of org.apache.hadoop.hive.ql.plan.MapredWork in project hive by apache.

The class SamplingOptimizer, method resolve.

public PhysicalContext resolve(PhysicalContext pctx) throws SemanticException {
    for (Task<?> task : pctx.getRootTasks()) {
        if (!(task instanceof MapRedTask) || !((MapRedTask) task).getWork().isFinalMapRed()) {
            // this could be replaced by bucketing on RS + bucketed fetcher for next MR
            continue;
        }
        MapredWork mrWork = ((MapRedTask) task).getWork();
        MapWork mapWork = mrWork.getMapWork();
        ReduceWork reduceWork = mrWork.getReduceWork();
        if (reduceWork == null || reduceWork.getNumReduceTasks() != 1 || mapWork.getAliasToWork().size() != 1 || mapWork.getSamplingType() > 0 || reduceWork.getReducer() == null) {
            continue;
        }
        // a GROUP BY operator in the reducer cannot be processed in parallel; skip optimizing
        if (OperatorUtils.findSingleOperator(reduceWork.getReducer(), GroupByOperator.class) != null) {
            continue;
        }
        Operator<?> operator = mapWork.getAliasToWork().values().iterator().next();
        if (!(operator instanceof TableScanOperator)) {
            continue;
        }
        ReduceSinkOperator child = OperatorUtils.findSingleOperator(operator, ReduceSinkOperator.class);
        if (child == null || child.getConf().getNumReducers() != 1 || !child.getConf().getPartitionCols().isEmpty()) {
            continue;
        }
        child.getConf().setNumReducers(-1);
        reduceWork.setNumReduceTasks(-1);
        mapWork.setSamplingType(MapWork.SAMPLING_ON_START);
    }
    return pctx;
}
Also used : MapRedTask(org.apache.hadoop.hive.ql.exec.mr.MapRedTask) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) GroupByOperator(org.apache.hadoop.hive.ql.exec.GroupByOperator) MapredWork(org.apache.hadoop.hive.ql.plan.MapredWork) MapWork(org.apache.hadoop.hive.ql.plan.MapWork) ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) ReduceWork(org.apache.hadoop.hive.ql.plan.ReduceWork)
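The optimizer only fires for a final map-reduce job whose output is totally ordered through a single reducer; setting SAMPLING_ON_START and un-fixing the reducer count lets the job sample keys at startup, derive range boundaries, and spread the sort across many reducers whose outputs concatenate into a global order. A toy sketch of that range-partitioning idea, assuming integer keys (names and structure are illustrative, not Hive's sampling code):

import java.util.Arrays;

// Illustrative sample-sort partitioning: sample the keys, pick split points,
// and route each key to a reducer by range so the reducer outputs concatenate
// into a globally sorted result.
public class RangeSampler {

    // pick (numReducers - 1) evenly spaced split points from the sampled keys
    static int[] splitPoints(int[] sampledKeys, int numReducers) {
        int[] sample = sampledKeys.clone();
        Arrays.sort(sample);
        int[] splits = new int[numReducers - 1];
        for (int i = 1; i < numReducers; i++) {
            splits[i - 1] = sample[i * sample.length / numReducers];
        }
        return splits;
    }

    // reducer index for a key: the first range whose upper bound is >= key
    static int partition(int key, int[] splits) {
        int idx = Arrays.binarySearch(splits, key);
        return idx >= 0 ? idx : -idx - 1;
    }

    public static void main(String[] args) {
        int[] sample = {3, 41, 8, 15, 27, 99, 60, 74, 12, 50};
        int[] splits = splitPoints(sample, 4);        // [12, 41, 60]
        System.out.println(Arrays.toString(splits));
        System.out.println(partition(20, splits));    // 1: key 20 falls in (12, 41]
    }
}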

Example 43 with MapredWork

Use of org.apache.hadoop.hive.ql.plan.MapredWork in project hive by apache.

The class SortMergeJoinTaskDispatcher, method convertSMBWorkToJoinWork.

/*
   * Convert the work containing the sort-merge join into a work as if it contained a
   * regular join. Note that the operator tree is not changed - it still contains the
   * SMB join - but the plan (aliasToWork etc.) is changed to contain all the paths
   * as if it were a regular join.
   */
private MapredWork convertSMBWorkToJoinWork(MapredWork currWork, SMBMapJoinOperator oldSMBJoinOp) throws SemanticException {
    try {
        // deep copy a new mapred work
        MapredWork currJoinWork = SerializationUtilities.clonePlan(currWork);
        SMBMapJoinOperator newSMBJoinOp = getSMBMapJoinOp(currJoinWork);
        // change the newly created map-red plan as if it was a join operator
        genSMBJoinWork(currJoinWork.getMapWork(), newSMBJoinOp);
        return currJoinWork;
    } catch (Exception e) {
        // chain the cause so the stack trace is preserved in the SemanticException
        throw new SemanticException("Generate Map Join Task Error: " + e.getMessage(), e);
    }
}
Also used : MapredWork(org.apache.hadoop.hive.ql.plan.MapredWork) SMBMapJoinOperator(org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator) SemanticException(org.apache.hadoop.hive.ql.parse.SemanticException) UnsupportedEncodingException(java.io.UnsupportedEncodingException)
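The pattern worth noting here is clone-then-mutate: the original plan is never touched; SerializationUtilities.clonePlan deep-copies the whole MapredWork and genSMBJoinWork rewrites only the copy, so the original task remains intact as a fallback. A minimal stand-in for that deep copy using plain Java serialization (illustrative only; Hive's clonePlan is Kryo-based, not ObjectOutputStream):

import java.io.*;

// Deep-copy a Serializable plan graph by round-tripping it through a byte
// array, yielding an independent copy that can be mutated freely. A plain
// stand-in for Hive's Kryo-based SerializationUtilities.clonePlan.
public class PlanCloner {

    @SuppressWarnings("unchecked")
    static <T extends Serializable> T clonePlan(T plan) throws IOException, ClassNotFoundException {
        ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        try (ObjectOutputStream out = new ObjectOutputStream(bytes)) {
            out.writeObject(plan); // serialize the entire object graph
        }
        try (ObjectInputStream in = new ObjectInputStream(
                new ByteArrayInputStream(bytes.toByteArray()))) {
            return (T) in.readObject(); // deserialize into a fresh copy
        }
    }
}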

Aggregations

MapredWork (org.apache.hadoop.hive.ql.plan.MapredWork): 43
Path (org.apache.hadoop.fs.Path): 17
OperatorDesc (org.apache.hadoop.hive.ql.plan.OperatorDesc): 13
Serializable (java.io.Serializable): 12
Operator (org.apache.hadoop.hive.ql.exec.Operator): 12
TableScanOperator (org.apache.hadoop.hive.ql.exec.TableScanOperator): 12
Task (org.apache.hadoop.hive.ql.exec.Task): 12
MapRedTask (org.apache.hadoop.hive.ql.exec.mr.MapRedTask): 12
MapWork (org.apache.hadoop.hive.ql.plan.MapWork): 12
ArrayList (java.util.ArrayList): 10
PartitionDesc (org.apache.hadoop.hive.ql.plan.PartitionDesc): 10
TableDesc (org.apache.hadoop.hive.ql.plan.TableDesc): 10
LinkedHashMap (java.util.LinkedHashMap): 9
ConditionalTask (org.apache.hadoop.hive.ql.exec.ConditionalTask): 9
FileSinkOperator (org.apache.hadoop.hive.ql.exec.FileSinkOperator): 9
ReduceSinkOperator (org.apache.hadoop.hive.ql.exec.ReduceSinkOperator): 8
ParseContext (org.apache.hadoop.hive.ql.parse.ParseContext): 8
SMBMapJoinOperator (org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator): 7
UnionOperator (org.apache.hadoop.hive.ql.exec.UnionOperator): 7
ReduceWork (org.apache.hadoop.hive.ql.plan.ReduceWork): 7