Search in sources:

Example 76 with OperatorDesc

Use of org.apache.hadoop.hive.ql.plan.OperatorDesc in project hive by apache.

The class CommonJoinTaskDispatcher, method processCurrentTask.

@Override
public Task<? extends Serializable> processCurrentTask(MapRedTask currTask, ConditionalTask conditionalTask, Context context) throws SemanticException {
    // check whether the task contains a common join op; if so, return it
    JoinOperator joinOp = getJoinOp(currTask);
    if (joinOp == null || joinOp.getConf().isFixedAsSorted()) {
        return null;
    }
    currTask.setTaskTag(Task.COMMON_JOIN);
    MapWork currWork = currTask.getWork().getMapWork();
    // create conditional work list and task list
    List<Serializable> listWorks = new ArrayList<Serializable>();
    List<Task<? extends Serializable>> listTasks = new ArrayList<Task<? extends Serializable>>();
    // create task to aliases mapping and alias to input file mapping for resolver
    // Must be deterministic order map for consistent q-test output across Java versions
    HashMap<Task<? extends Serializable>, Set<String>> taskToAliases = new LinkedHashMap<Task<? extends Serializable>, Set<String>>();
    HashMap<Path, ArrayList<String>> pathToAliases = currWork.getPathToAliases();
    Map<String, Operator<? extends OperatorDesc>> aliasToWork = currWork.getAliasToWork();
    // get parseCtx for this Join Operator
    ParseContext parseCtx = physicalContext.getParseContext();
    // start to generate multiple map join tasks
    JoinDesc joinDesc = joinOp.getConf();
    if (aliasToSize == null) {
        aliasToSize = new HashMap<String, Long>();
    }
    try {
        long aliasTotalKnownInputSize = getTotalKnownInputSize(context, currWork, pathToAliases, aliasToSize);
        Set<Integer> bigTableCandidates = MapJoinProcessor.getBigTableCandidates(joinDesc.getConds());
        // no table could be the big table; there is no need to convert
        if (bigTableCandidates.isEmpty()) {
            return null;
        }
        // if any big-table candidate is multi-sourced, bigTableCandidates should
        // contain only multi-sourced aliases, because a multi-sourced alias cannot
        // be hashed or read directly
        bigTableCandidates = multiInsertBigTableCheck(joinOp, bigTableCandidates);
        Configuration conf = context.getConf();
        // If the sizes of at least n-1 tables in an n-way join are known, and their sum
        // is smaller than the threshold, convert the join into a map-join and don't
        // create a conditional task
        boolean convertJoinMapJoin = HiveConf.getBoolVar(conf, HiveConf.ConfVars.HIVECONVERTJOINNOCONDITIONALTASK);
        int bigTablePosition = -1;
        if (convertJoinMapJoin) {
            // This is the threshold that the user has specified to fit in mapjoin
            long mapJoinSize = HiveConf.getLongVar(conf, HiveConf.ConfVars.HIVECONVERTJOINNOCONDITIONALTASKTHRESHOLD);
            Long bigTableSize = null;
            Set<String> aliases = aliasToWork.keySet();
            for (int tablePosition : bigTableCandidates) {
                Operator<?> parent = joinOp.getParentOperators().get(tablePosition);
                Set<String> participants = GenMapRedUtils.findAliases(currWork, parent);
                long sumOfOthers = Utilities.sumOfExcept(aliasToSize, aliases, participants);
                if (sumOfOthers < 0 || sumOfOthers > mapJoinSize) {
                    // some small alias is not known or too big
                    continue;
                }
                if (bigTableSize == null && bigTablePosition >= 0 && tablePosition < bigTablePosition) {
                    // prefer right most alias
                    continue;
                }
                long aliasSize = Utilities.sumOf(aliasToSize, participants);
                if (bigTableSize == null || bigTableSize < 0 || (aliasSize >= 0 && aliasSize >= bigTableSize)) {
                    bigTablePosition = tablePosition;
                    bigTableSize = aliasSize;
                }
            }
        }
        currWork.setLeftInputJoin(joinOp.getConf().isLeftInputJoin());
        currWork.setBaseSrc(joinOp.getConf().getBaseSrc());
        currWork.setMapAliases(joinOp.getConf().getMapAliases());
        if (bigTablePosition >= 0) {
            // create map join task and set big table as bigTablePosition
            MapRedTask newTask = convertTaskToMapJoinTask(currTask.getWork(), bigTablePosition);
            newTask.setTaskTag(Task.MAPJOIN_ONLY_NOBACKUP);
            newTask.setFetchSource(currTask.isFetchSource());
            replaceTask(currTask, newTask);
            // if the new task has a single child, try to merge it into that child
            // (covers a big table joined with multiple small tables on different keys)
            if ((newTask.getChildTasks() != null) && (newTask.getChildTasks().size() == 1)) {
                mergeMapJoinTaskIntoItsChildMapRedTask(newTask, conf);
            }
            return newTask;
        }
        long ThresholdOfSmallTblSizeSum = HiveConf.getLongVar(conf, HiveConf.ConfVars.HIVESMALLTABLESFILESIZE);
        for (int pos = 0; pos < joinOp.getNumParent(); pos++) {
            // skip this table if it cannot be the big table
            if (!bigTableCandidates.contains(pos)) {
                continue;
            }
            Operator<?> startOp = joinOp.getParentOperators().get(pos);
            Set<String> aliases = GenMapRedUtils.findAliases(currWork, startOp);
            long aliasKnownSize = Utilities.sumOf(aliasToSize, aliases);
            if (cannotConvert(aliasKnownSize, aliasTotalKnownInputSize, ThresholdOfSmallTblSizeSum)) {
                continue;
            }
            MapredWork newWork = SerializationUtilities.clonePlan(currTask.getWork());
            // create map join task and set the big table at position pos
            MapRedTask newTask = convertTaskToMapJoinTask(newWork, pos);
            // add into conditional task
            listWorks.add(newTask.getWork());
            listTasks.add(newTask);
            newTask.setTaskTag(Task.CONVERTED_MAPJOIN);
            newTask.setFetchSource(currTask.isFetchSource());
            // set up backup task
            newTask.setBackupTask(currTask);
            newTask.setBackupChildrenTasks(currTask.getChildTasks());
            // put the mapping task to aliases
            taskToAliases.put(newTask, aliases);
        }
    } catch (Exception e) {
        throw new SemanticException("Generate Map Join Task Error: " + e.getMessage(), e);
    }
    if (listTasks.isEmpty()) {
        return currTask;
    }
    // insert current common join task to conditional task
    listWorks.add(currTask.getWork());
    listTasks.add(currTask);
    // clear JoinTree and OP Parse Context
    currWork.setLeftInputJoin(false);
    currWork.setBaseSrc(null);
    currWork.setMapAliases(null);
    // create conditional task and insert conditional task into task tree
    ConditionalWork cndWork = new ConditionalWork(listWorks);
    ConditionalTask cndTsk = (ConditionalTask) TaskFactory.get(cndWork, parseCtx.getConf());
    cndTsk.setListTasks(listTasks);
    // set resolver and resolver context
    cndTsk.setResolver(new ConditionalResolverCommonJoin());
    ConditionalResolverCommonJoinCtx resolverCtx = new ConditionalResolverCommonJoinCtx();
    resolverCtx.setPathToAliases(pathToAliases);
    resolverCtx.setAliasToKnownSize(aliasToSize);
    resolverCtx.setTaskToAliases(taskToAliases);
    resolverCtx.setCommonJoinTask(currTask);
    resolverCtx.setLocalTmpDir(context.getLocalScratchDir(false));
    resolverCtx.setHdfsTmpDir(context.getMRScratchDir());
    cndTsk.setResolverCtx(resolverCtx);
    // replace the current task with the new generated conditional task
    replaceTaskWithConditionalTask(currTask, cndTsk);
    return cndTsk;
}
Also used: JoinOperator(org.apache.hadoop.hive.ql.exec.JoinOperator) LateralViewForwardOperator(org.apache.hadoop.hive.ql.exec.LateralViewForwardOperator) FileSinkOperator(org.apache.hadoop.hive.ql.exec.FileSinkOperator) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) Operator(org.apache.hadoop.hive.ql.exec.Operator) Serializable(java.io.Serializable) ConditionalTask(org.apache.hadoop.hive.ql.exec.ConditionalTask) Task(org.apache.hadoop.hive.ql.exec.Task) MapRedTask(org.apache.hadoop.hive.ql.exec.mr.MapRedTask) HashSet(java.util.HashSet) Set(java.util.Set) Configuration(org.apache.hadoop.conf.Configuration) ArrayList(java.util.ArrayList) ConditionalWork(org.apache.hadoop.hive.ql.plan.ConditionalWork) LinkedHashMap(java.util.LinkedHashMap) ConditionalResolverCommonJoinCtx(org.apache.hadoop.hive.ql.plan.ConditionalResolverCommonJoin.ConditionalResolverCommonJoinCtx) MapredWork(org.apache.hadoop.hive.ql.plan.MapredWork) ConditionalResolverCommonJoin(org.apache.hadoop.hive.ql.plan.ConditionalResolverCommonJoin) SemanticException(org.apache.hadoop.hive.ql.parse.SemanticException) Path(org.apache.hadoop.fs.Path) UnsupportedEncodingException(java.io.UnsupportedEncodingException) MapWork(org.apache.hadoop.hive.ql.plan.MapWork) ParseContext(org.apache.hadoop.hive.ql.parse.ParseContext) OperatorDesc(org.apache.hadoop.hive.ql.plan.OperatorDesc) JoinDesc(org.apache.hadoop.hive.ql.plan.JoinDesc)
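
The big-table selection loop above is easy to lose inside the task plumbing. Below is a minimal, self-contained sketch of the same heuristic, assuming a flat alias-to-size map in place of Hive's MapWork/findAliases machinery; BigTableSelectionSketch, pickBigTable, and the sample aliases are all hypothetical, and only the loop structure mirrors processCurrentTask.

import java.util.HashMap;
import java.util.LinkedHashSet;
import java.util.Map;
import java.util.Set;

// Hypothetical sketch of the big-table selection heuristic; not Hive code.
public class BigTableSelectionSketch {

    // Returns the alias to stream as the big table, or null if no candidate
    // leaves a small enough remainder. A size of -1 means "unknown".
    static String pickBigTable(Map<String, Long> aliasToSize,
                               Set<String> candidates, long mapJoinThreshold) {
        String bigAlias = null;
        Long bigSize = null;
        for (String candidate : candidates) {
            // Sum the known sizes of every other alias; these tables would be
            // loaded into in-memory hash tables on the map side.
            long sumOfOthers = 0;
            for (Map.Entry<String, Long> e : aliasToSize.entrySet()) {
                if (e.getKey().equals(candidate)) {
                    continue;
                }
                if (e.getValue() < 0) {
                    // an unknown small-table size disqualifies this candidate
                    sumOfOthers = -1;
                    break;
                }
                sumOfOthers += e.getValue();
            }
            if (sumOfOthers < 0 || sumOfOthers > mapJoinThreshold) {
                continue; // some small alias is unknown, or the rest are too big
            }
            long candidateSize = aliasToSize.getOrDefault(candidate, -1L);
            // Prefer the largest known candidate as the big table.
            if (bigSize == null || bigSize < 0
                    || (candidateSize >= 0 && candidateSize >= bigSize)) {
                bigAlias = candidate;
                bigSize = candidateSize;
            }
        }
        return bigAlias;
    }

    public static void main(String[] args) {
        Map<String, Long> sizes = new HashMap<>();
        sizes.put("orders", 10_000_000_000L);
        sizes.put("customers", 5_000_000L);
        sizes.put("regions", 40_000L);
        // Only "orders" leaves a remainder under the 10 MB threshold.
        System.out.println(pickBigTable(sizes, new LinkedHashSet<>(sizes.keySet()), 10_000_000L));
    }
}

Run against the sample sizes, only "orders" leaves the other aliases summing under the threshold, so it is the single candidate eligible to be streamed; the real dispatcher would then convert the plan to a map-join with that position as the big table.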

Example 77 with OperatorDesc

Use of org.apache.hadoop.hive.ql.plan.OperatorDesc in project hive by apache.

The class SparkMapRecordHandler, method close.

@Override
public void close() {
    // No row was processed
    if (oc == null) {
        LOG.trace("Close called. no row processed by map.");
    }
    // check if there are IOExceptions
    if (!abort) {
        abort = execContext.getIoCxt().getIOExceptions();
    }
    // ideally hadoop should let us know whether map execution failed or not
    try {
        mo.close(abort);
        // close the local work
        if (localWork != null) {
            List<Operator<? extends OperatorDesc>> dummyOps = localWork.getDummyParentOp();
            for (Operator<? extends OperatorDesc> dummyOp : dummyOps) {
                dummyOp.close(abort);
            }
        }
        if (isLogInfoEnabled) {
            logCloseInfo();
        }
        ReportStats rps = new ReportStats(rp, jc);
        mo.preorderMap(rps);
        return;
    } catch (Exception e) {
        if (!abort) {
            // signal new failure to map-reduce
            String msg = "Hit error while closing operators - failing tree: " + e;
            LOG.error(msg, e);
            throw new IllegalStateException(msg, e);
        }
    } finally {
        MapredContext.close();
        Utilities.clearWorkMap(jc);
    }
}
Also used: Operator(org.apache.hadoop.hive.ql.exec.Operator) MapOperator(org.apache.hadoop.hive.ql.exec.MapOperator) AbstractMapOperator(org.apache.hadoop.hive.ql.exec.AbstractMapOperator) VectorMapOperator(org.apache.hadoop.hive.ql.exec.vector.VectorMapOperator) ReportStats(org.apache.hadoop.hive.ql.exec.mr.ExecMapper.ReportStats) OperatorDesc(org.apache.hadoop.hive.ql.plan.OperatorDesc) IOException(java.io.IOException)
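
The close() above follows a shutdown pattern worth calling out: close the operator tree with the abort flag, escalate a close-time failure only when the task was not already aborting, and release resources unconditionally in finally. Here is a hedged sketch of that pattern, using stand-in types that are not part of the Hive API.

import java.util.List;

// Stand-in for Hive's Operator; not part of the Hive API.
interface AbortableCloseable {
    void close(boolean abort) throws Exception;
}

public class CloseWithAbortSketch {

    static void closeAll(AbortableCloseable main,
                         List<AbortableCloseable> dummyParents, boolean abort) {
        try {
            // Close the root operator first, then any dummy parents from local work.
            main.close(abort);
            for (AbortableCloseable dummy : dummyParents) {
                dummy.close(abort);
            }
        } catch (Exception e) {
            if (!abort) {
                // A failure during a clean shutdown is a real error; when already
                // aborting, a second failure would only mask the original one.
                throw new IllegalStateException("Hit error while closing operators", e);
            }
        } finally {
            releaseResources(); // must run whether or not close succeeded
        }
    }

    static void releaseResources() {
        // stand-in for MapredContext.close() and Utilities.clearWorkMap(jc)
    }
}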

Example 78 with OperatorDesc

Use of org.apache.hadoop.hive.ql.plan.OperatorDesc in project hive by apache.

The class SemanticAnalyzer, method genUnionPlan.

@SuppressWarnings("nls")
private Operator genUnionPlan(String unionalias, String leftalias, Operator leftOp, String rightalias, Operator rightOp) throws SemanticException {
    // Currently, the unions are not merged - each union has only 2 parents. So
    // an n-way union will lead to (n-1) union operators.
    // These could easily be merged into a single union
    RowResolver leftRR = opParseCtx.get(leftOp).getRowResolver();
    RowResolver rightRR = opParseCtx.get(rightOp).getRowResolver();
    LinkedHashMap<String, ColumnInfo> leftmap = leftRR.getFieldMap(leftalias);
    LinkedHashMap<String, ColumnInfo> rightmap = rightRR.getFieldMap(rightalias);
    // make sure the schemas of both sides are the same
    ASTNode tabref = qb.getAliases().isEmpty() ? null : qb.getParseInfo().getSrcForAlias(qb.getAliases().get(0));
    if (leftmap.size() != rightmap.size()) {
        throw new SemanticException("Schema of both sides of union should match.");
    }
    RowResolver unionoutRR = new RowResolver();
    Iterator<Map.Entry<String, ColumnInfo>> lIter = leftmap.entrySet().iterator();
    Iterator<Map.Entry<String, ColumnInfo>> rIter = rightmap.entrySet().iterator();
    while (lIter.hasNext()) {
        Map.Entry<String, ColumnInfo> lEntry = lIter.next();
        Map.Entry<String, ColumnInfo> rEntry = rIter.next();
        ColumnInfo lInfo = lEntry.getValue();
        ColumnInfo rInfo = rEntry.getValue();
        // use the left alias for the output column name (as MySQL and PostgreSQL do)
        String field = lEntry.getKey();
        // try widening conversion, otherwise fail union
        TypeInfo commonTypeInfo = FunctionRegistry.getCommonClassForUnionAll(lInfo.getType(), rInfo.getType());
        if (commonTypeInfo == null) {
            throw new SemanticException(generateErrorMessage(tabref, "Schema of both sides of union should match: Column " + field + " is of type " + lInfo.getType().getTypeName() + " on first table and type " + rInfo.getType().getTypeName() + " on second table"));
        }
        ColumnInfo unionColInfo = new ColumnInfo(lInfo);
        unionColInfo.setType(commonTypeInfo);
        unionoutRR.put(unionalias, field, unionColInfo);
    }
    // For Spark and Tez we rely on the generated SelectOperator to do the type casting.
    // Consider:
    //    SEL_1 (int)   SEL_2 (int)    SEL_3 (double)
    // If we first merge SEL_1 and SEL_2 into UNION_1, and then merge UNION_1
    // with SEL_3 to get UNION_2, no SelectOperator will be inserted and an error
    // will surface later. The solution is to insert one after UNION_1, which
    // casts int to double.
    boolean isMR = HiveConf.getVar(conf, HiveConf.ConfVars.HIVE_EXECUTION_ENGINE).equals("mr");
    if (!isMR || !(leftOp instanceof UnionOperator)) {
        leftOp = genInputSelectForUnion(leftOp, leftmap, leftalias, unionoutRR, unionalias);
    }
    if (!isMR || !(rightOp instanceof UnionOperator)) {
        rightOp = genInputSelectForUnion(rightOp, rightmap, rightalias, unionoutRR, unionalias);
    }
    // if one side is already a union (possibly behind an identity select), merge into
    // it below; else create a new union operator
    if (leftOp instanceof UnionOperator || (leftOp instanceof SelectOperator && leftOp.getParentOperators() != null && !leftOp.getParentOperators().isEmpty() && leftOp.getParentOperators().get(0) instanceof UnionOperator && ((SelectOperator) leftOp).isIdentitySelect())) {
        if (!(leftOp instanceof UnionOperator)) {
            Operator oldChild = leftOp;
            leftOp = (Operator) leftOp.getParentOperators().get(0);
            leftOp.removeChildAndAdoptItsChildren(oldChild);
        }
        // make left a child of right
        List<Operator<? extends OperatorDesc>> child = new ArrayList<Operator<? extends OperatorDesc>>();
        child.add(leftOp);
        rightOp.setChildOperators(child);
        List<Operator<? extends OperatorDesc>> parent = leftOp.getParentOperators();
        parent.add(rightOp);
        UnionDesc uDesc = ((UnionOperator) leftOp).getConf();
        uDesc.setNumInputs(uDesc.getNumInputs() + 1);
        return putOpInsertMap(leftOp, unionoutRR);
    }
    if (rightOp instanceof UnionOperator || (rightOp instanceof SelectOperator && rightOp.getParentOperators() != null && !rightOp.getParentOperators().isEmpty() && rightOp.getParentOperators().get(0) instanceof UnionOperator && ((SelectOperator) rightOp).isIdentitySelect())) {
        if (!(rightOp instanceof UnionOperator)) {
            Operator oldChild = rightOp;
            rightOp = (Operator) rightOp.getParentOperators().get(0);
            rightOp.removeChildAndAdoptItsChildren(oldChild);
        }
        // make right a child of left
        List<Operator<? extends OperatorDesc>> child = new ArrayList<Operator<? extends OperatorDesc>>();
        child.add(rightOp);
        leftOp.setChildOperators(child);
        List<Operator<? extends OperatorDesc>> parent = rightOp.getParentOperators();
        parent.add(leftOp);
        UnionDesc uDesc = ((UnionOperator) rightOp).getConf();
        uDesc.setNumInputs(uDesc.getNumInputs() + 1);
        return putOpInsertMap(rightOp, unionoutRR);
    }
    // Create a new union operator
    Operator<? extends OperatorDesc> unionforward = OperatorFactory.getAndMakeChild(getOpContext(), new UnionDesc(), new RowSchema(unionoutRR.getColumnInfos()));
    // set union operator as child of each of leftOp and rightOp
    List<Operator<? extends OperatorDesc>> child = new ArrayList<Operator<? extends OperatorDesc>>();
    child.add(unionforward);
    rightOp.setChildOperators(child);
    child = new ArrayList<Operator<? extends OperatorDesc>>();
    child.add(unionforward);
    leftOp.setChildOperators(child);
    List<Operator<? extends OperatorDesc>> parent = new ArrayList<Operator<? extends OperatorDesc>>();
    parent.add(leftOp);
    parent.add(rightOp);
    unionforward.setParentOperators(parent);
    // create operator info list to return
    return putOpInsertMap(unionforward, unionoutRR);
}
Also used: AbstractMapJoinOperator(org.apache.hadoop.hive.ql.exec.AbstractMapJoinOperator) SelectOperator(org.apache.hadoop.hive.ql.exec.SelectOperator) JoinOperator(org.apache.hadoop.hive.ql.exec.JoinOperator) Operator(org.apache.hadoop.hive.ql.exec.Operator) GroupByOperator(org.apache.hadoop.hive.ql.exec.GroupByOperator) FileSinkOperator(org.apache.hadoop.hive.ql.exec.FileSinkOperator) FilterOperator(org.apache.hadoop.hive.ql.exec.FilterOperator) ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) UnionOperator(org.apache.hadoop.hive.ql.exec.UnionOperator) SMBMapJoinOperator(org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator) UnionDesc(org.apache.hadoop.hive.ql.plan.UnionDesc) RowSchema(org.apache.hadoop.hive.ql.exec.RowSchema) ArrayList(java.util.ArrayList) ColumnInfo(org.apache.hadoop.hive.ql.exec.ColumnInfo) TypeInfo(org.apache.hadoop.hive.serde2.typeinfo.TypeInfo) PrimitiveTypeInfo(org.apache.hadoop.hive.serde2.typeinfo.PrimitiveTypeInfo) Entry(java.util.Map.Entry) Map(java.util.Map) LinkedHashMap(java.util.LinkedHashMap) HashMap(java.util.HashMap) OperatorDesc(org.apache.hadoop.hive.ql.plan.OperatorDesc) CalciteSemanticException(org.apache.hadoop.hive.ql.optimizer.calcite.CalciteSemanticException)
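
The per-column widening loop is the heart of this method: both sides must have the same column count, and each column pair must have a common type to widen to, or the union is rejected. Below is a toy sketch of that reconciliation, with an illustrative numeric lattice standing in for FunctionRegistry.getCommonClassForUnionAll; the type names and widen() rules are assumptions, not Hive's actual coercion rules.

import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;

// Toy sketch of union schema reconciliation; widen() is an illustrative
// lattice, not Hive's coercion table.
public class UnionSchemaSketch {

    // Returns the common type both sides widen to, or null if incompatible.
    static String widen(String left, String right) {
        if (left.equals(right)) {
            return left;
        }
        List<String> numeric = Arrays.asList("int", "bigint", "double");
        int l = numeric.indexOf(left);
        int r = numeric.indexOf(right);
        if (l >= 0 && r >= 0) {
            return numeric.get(Math.max(l, r)); // widen toward the larger type
        }
        return null; // no implicit conversion in this toy lattice
    }

    static List<String> unionSchema(List<String> leftTypes, List<String> rightTypes) {
        if (leftTypes.size() != rightTypes.size()) {
            throw new IllegalArgumentException("Schema of both sides of union should match.");
        }
        List<String> out = new ArrayList<>();
        for (int i = 0; i < leftTypes.size(); i++) {
            String common = widen(leftTypes.get(i), rightTypes.get(i));
            if (common == null) {
                throw new IllegalArgumentException("Column " + i + ": no common type for "
                        + leftTypes.get(i) + " and " + rightTypes.get(i));
            }
            out.add(common);
        }
        return out;
    }

    public static void main(String[] args) {
        // int UNION double widens to double; a SelectOperator would insert the cast.
        System.out.println(unionSchema(
                Arrays.asList("int", "string"), Arrays.asList("double", "string")));
        // prints [double, string]
    }
}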

Aggregations

OperatorDesc (org.apache.hadoop.hive.ql.plan.OperatorDesc): 78 usages
Operator (org.apache.hadoop.hive.ql.exec.Operator): 65
ArrayList (java.util.ArrayList): 47
ReduceSinkOperator (org.apache.hadoop.hive.ql.exec.ReduceSinkOperator): 41
TableScanOperator (org.apache.hadoop.hive.ql.exec.TableScanOperator): 36
MapJoinOperator (org.apache.hadoop.hive.ql.exec.MapJoinOperator): 32
JoinOperator (org.apache.hadoop.hive.ql.exec.JoinOperator): 30
FileSinkOperator (org.apache.hadoop.hive.ql.exec.FileSinkOperator): 29
UnionOperator (org.apache.hadoop.hive.ql.exec.UnionOperator): 22
SMBMapJoinOperator (org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator): 21
Path (org.apache.hadoop.fs.Path): 20
LinkedHashMap (java.util.LinkedHashMap): 18
HashMap (java.util.HashMap): 16
MapWork (org.apache.hadoop.hive.ql.plan.MapWork): 16
TableDesc (org.apache.hadoop.hive.ql.plan.TableDesc): 16
Serializable (java.io.Serializable): 15
Task (org.apache.hadoop.hive.ql.exec.Task): 15
List (java.util.List): 14
Map (java.util.Map): 13
GroupByOperator (org.apache.hadoop.hive.ql.exec.GroupByOperator): 13