Example 6 with HashTableDummyOperator

use of org.apache.hadoop.hive.ql.exec.HashTableDummyOperator in project hive by apache.

the class GenSparkWork method process.

@Override
public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procContext, Object... nodeOutputs) throws SemanticException {
    GenSparkProcContext context = (GenSparkProcContext) procContext;
    Preconditions.checkArgument(context != null, "AssertionError: expected context to be not null");
    Preconditions.checkArgument(context.currentTask != null, "AssertionError: expected context.currentTask to be not null");
    Preconditions.checkArgument(context.currentRootOperator != null, "AssertionError: expected context.currentRootOperator to be not null");
    // Operator is a file sink or reduce sink. Something that forces a new vertex.
    @SuppressWarnings("unchecked") Operator<? extends OperatorDesc> operator = (Operator<? extends OperatorDesc>) nd;
    // root is the start of the operator pipeline we're currently
    // packing into a vertex, typically a table scan, union or join
    Operator<?> root = context.currentRootOperator;
    LOG.debug("Root operator: " + root);
    LOG.debug("Leaf operator: " + operator);
    SparkWork sparkWork = context.currentTask.getWork();
    SMBMapJoinOperator smbOp = GenSparkUtils.getChildOperator(root, SMBMapJoinOperator.class);
    // Right now the work graph is pretty simple. If there is no
    // preceding work we have a root and will generate a map
    // vertex. If there is preceding work we will generate
    // a reduce vertex.
    BaseWork work;
    if (context.rootToWorkMap.containsKey(root)) {
        // having seen the root operator before means there was a branch in the
        // operator graph. There are typically two reasons for that: a) mux/demux
        // b) multi insert. Mux/Demux will hit the same leaf again, multi insert
        // will result in a vertex with multiple FS or RS operators.
        // At this point we don't have to do anything special in this case. Just
        // run through the regular paces w/o creating a new task.
        work = context.rootToWorkMap.get(root);
    } else {
        // create a new vertex
        if (context.preceedingWork == null) {
            if (smbOp == null) {
                work = utils.createMapWork(context, root, sparkWork, null);
            } else {
                //save work to be initialized later with SMB information.
                work = utils.createMapWork(context, root, sparkWork, null, true);
                context.smbMapJoinCtxMap.get(smbOp).mapWork = (MapWork) work;
            }
        } else {
            work = utils.createReduceWork(context, root, sparkWork);
        }
        context.rootToWorkMap.put(root, work);
    }
    if (!context.childToWorkMap.containsKey(operator)) {
        List<BaseWork> workItems = new LinkedList<BaseWork>();
        workItems.add(work);
        context.childToWorkMap.put(operator, workItems);
    } else {
        context.childToWorkMap.get(operator).add(work);
    }
    // remember which mapjoin operator links with which work
    if (!context.currentMapJoinOperators.isEmpty()) {
        for (MapJoinOperator mj : context.currentMapJoinOperators) {
            LOG.debug("Processing map join: " + mj);
            // remember the mapping in case we scan another branch of the mapjoin later
            if (!context.mapJoinWorkMap.containsKey(mj)) {
                List<BaseWork> workItems = new LinkedList<BaseWork>();
                workItems.add(work);
                context.mapJoinWorkMap.put(mj, workItems);
            } else {
                context.mapJoinWorkMap.get(mj).add(work);
            }
            /*
             * This happens in case of map join operations.
             * The tree looks like this:
             *
             *        RS <--- we are here perhaps
             *        |
             *     MapJoin
             *     /     \
             *   RS       TS
             *  /
             * TS
             *
             * If we are at the RS pointed to above and have already visited the
             * RS following the TS, then we have already generated work for the TS-RS
             * branch. We need to hook the current work to this generated work.
             */
            if (context.linkOpWithWorkMap.containsKey(mj)) {
                Map<BaseWork, SparkEdgeProperty> linkWorkMap = context.linkOpWithWorkMap.get(mj);
                if (linkWorkMap != null) {
                    if (context.linkChildOpWithDummyOp.containsKey(mj)) {
                        for (Operator<?> dummy : context.linkChildOpWithDummyOp.get(mj)) {
                            work.addDummyOp((HashTableDummyOperator) dummy);
                        }
                    }
                    for (Entry<BaseWork, SparkEdgeProperty> parentWorkMap : linkWorkMap.entrySet()) {
                        BaseWork parentWork = parentWorkMap.getKey();
                        LOG.debug("connecting " + parentWork.getName() + " with " + work.getName());
                        SparkEdgeProperty edgeProp = parentWorkMap.getValue();
                        sparkWork.connect(parentWork, work, edgeProp);
                        // need to set up the output name for the reduce sink now that we know
                        // the name of the downstream work
                        for (ReduceSinkOperator r : context.linkWorkWithReduceSinkMap.get(parentWork)) {
                            if (r.getConf().getOutputName() != null) {
                                LOG.debug("Cloning reduce sink for multi-child broadcast edge");
                                // we've already set this one up. Need to clone for the next work.
                                r = (ReduceSinkOperator) OperatorFactory.getAndMakeChild(r.getCompilationOpContext(), (ReduceSinkDesc) r.getConf().clone(), r.getParentOperators());
                            }
                            r.getConf().setOutputName(work.getName());
                        }
                    }
                }
            }
        }
        // clear out the set. we don't need it anymore.
        context.currentMapJoinOperators.clear();
    }
    // Here we disconnect the root from its parents, but remember enough information
    // to later connect each parent's work with the work associated with this root operator.
    if (root.getNumParent() > 0) {
        Preconditions.checkArgument(work instanceof ReduceWork, "AssertionError: expected work to be a ReduceWork, but was " + work.getClass().getName());
        ReduceWork reduceWork = (ReduceWork) work;
        for (Operator<?> parent : new ArrayList<Operator<?>>(root.getParentOperators())) {
            Preconditions.checkArgument(parent instanceof ReduceSinkOperator, "AssertionError: expected operator to be a ReduceSinkOperator, but was " + parent.getClass().getName());
            ReduceSinkOperator rsOp = (ReduceSinkOperator) parent;
            SparkEdgeProperty edgeProp = GenSparkUtils.getEdgeProperty(rsOp, reduceWork);
            rsOp.getConf().setOutputName(reduceWork.getName());
            GenMapRedUtils.setKeyAndValueDesc(reduceWork, rsOp);
            context.leafOpToFollowingWorkInfo.put(rsOp, ObjectPair.create(edgeProp, reduceWork));
            LOG.debug("Removing " + parent + " as parent from " + root);
            root.removeParent(parent);
        }
    }
    // If we passed any union operators on this path, remember the work so we can remove
    // the union operators from the operator tree later.
    if (!context.currentUnionOperators.isEmpty()) {
        context.currentUnionOperators.clear();
        context.workWithUnionOperators.add(work);
    }
    // Note: the concept of leaf and root is reversed in Hive for historical
    // reasons. Roots are data sources, leaves are data sinks. I know.
    if (context.leafOpToFollowingWorkInfo.containsKey(operator)) {
        ObjectPair<SparkEdgeProperty, ReduceWork> childWorkInfo = context.leafOpToFollowingWorkInfo.get(operator);
        SparkEdgeProperty edgeProp = childWorkInfo.getFirst();
        ReduceWork childWork = childWorkInfo.getSecond();
        LOG.debug("Second pass. Leaf operator: " + operator + " has common downstream work:" + childWork);
        // The same pair of works can be reached through more than one branch;
        // we don't want to connect them with the work associated with TS more than once.
        if (sparkWork.getEdgeProperty(work, childWork) == null) {
            sparkWork.connect(work, childWork, edgeProp);
        } else {
            LOG.debug("work " + work.getName() + " is already connected to " + childWork.getName() + " before");
        }
    } else {
        LOG.debug("First pass. Leaf operator: " + operator);
    }
    // No children means we're at the bottom. If there are more operators to scan,
    // the next item will be a new root.
    if (!operator.getChildOperators().isEmpty()) {
        Preconditions.checkArgument(operator.getChildOperators().size() == 1, "AssertionError: expected operator.getChildOperators().size() to be 1, but was " + operator.getChildOperators().size());
        context.parentOfRoot = operator;
        context.currentRootOperator = operator.getChildOperators().get(0);
        context.preceedingWork = work;
    }
    return null;
}
Also used : ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) MapJoinOperator(org.apache.hadoop.hive.ql.exec.MapJoinOperator) SMBMapJoinOperator(org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) Operator(org.apache.hadoop.hive.ql.exec.Operator) HashTableDummyOperator(org.apache.hadoop.hive.ql.exec.HashTableDummyOperator) ArrayList(java.util.ArrayList) SparkWork(org.apache.hadoop.hive.ql.plan.SparkWork) ReduceWork(org.apache.hadoop.hive.ql.plan.ReduceWork) LinkedList(java.util.LinkedList) SparkEdgeProperty(org.apache.hadoop.hive.ql.plan.SparkEdgeProperty) OperatorDesc(org.apache.hadoop.hive.ql.plan.OperatorDesc) BaseWork(org.apache.hadoop.hive.ql.plan.BaseWork)
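
The method above repeats one small idiom twice: append the current work to a list keyed by an operator (first for childToWorkMap, then for mapJoinWorkMap), creating the list on first use. Below is a minimal, self-contained sketch of that idiom using only the JDK; the class, method, and key/value names are illustrative, not part of the Hive source.

import java.util.HashMap;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;

final class MultiMapDemo {

    // Append 'value' to the list stored under 'key', creating the list on first use.
    // This mirrors the childToWorkMap / mapJoinWorkMap bookkeeping in GenSparkWork.process.
    static <K, V> void addToListMap(Map<K, List<V>> map, K key, V value) {
        List<V> items = map.get(key);
        if (items == null) {
            items = new LinkedList<V>();
            map.put(key, items);
        }
        items.add(value);
    }

    public static void main(String[] args) {
        Map<String, List<String>> operatorToWork = new HashMap<String, List<String>>();
        addToListMap(operatorToWork, "RS_3", "Reducer 2");
        addToListMap(operatorToWork, "RS_3", "Reducer 4");
        System.out.println(operatorToWork); // {RS_3=[Reducer 2, Reducer 4]}
    }
}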

Example 7 with HashTableDummyOperator

use of org.apache.hadoop.hive.ql.exec.HashTableDummyOperator in project hive by apache.

the class GenTezUtils method removeUnionOperators.

// removes any union operator and clones the plan
public static void removeUnionOperators(GenTezProcContext context, BaseWork work, int indexForTezUnion) throws SemanticException {
    List<Operator<?>> roots = new ArrayList<Operator<?>>();
    roots.addAll(work.getAllRootOperators());
    if (work.getDummyOps() != null) {
        roots.addAll(work.getDummyOps());
    }
    roots.addAll(context.eventOperatorSet);
    // need to clone the plan.
    List<Operator<?>> newRoots = SerializationUtilities.cloneOperatorTree(roots, indexForTezUnion);
    // we're cloning the operator plan but we're retaining the original work. That means
    // that root operators have to be replaced with the cloned ops. The replacement map
    // tells you what that mapping is.
    BiMap<Operator<?>, Operator<?>> replacementMap = HashBiMap.create();
    // there's some special handling for dummyOps required. Mapjoins won't be properly
    // initialized if their dummy parents aren't initialized. Since we cloned the plan
    // we need to replace the dummy operators in the work with the cloned ones.
    List<HashTableDummyOperator> dummyOps = new LinkedList<HashTableDummyOperator>();
    Iterator<Operator<?>> it = newRoots.iterator();
    for (Operator<?> orig : roots) {
        Set<FileSinkOperator> fsOpSet = OperatorUtils.findOperators(orig, FileSinkOperator.class);
        for (FileSinkOperator fsOp : fsOpSet) {
            context.fileSinkSet.remove(fsOp);
        }
        Operator<?> newRoot = it.next();
        replacementMap.put(orig, newRoot);
        if (newRoot instanceof HashTableDummyOperator) {
            // dummy ops need to be updated to the cloned ones.
            dummyOps.add((HashTableDummyOperator) newRoot);
            it.remove();
        } else if (newRoot instanceof AppMasterEventOperator) {
            // need to restore the original scan.
            if (newRoot.getConf() instanceof DynamicPruningEventDesc) {
                TableScanOperator ts = ((DynamicPruningEventDesc) orig.getConf()).getTableScan();
                if (ts == null) {
                    throw new AssertionError("No table scan associated with dynamic event pruning. " + orig);
                }
                ((DynamicPruningEventDesc) newRoot.getConf()).setTableScan(ts);
            }
            it.remove();
        } else {
            if (newRoot instanceof TableScanOperator) {
                if (context.tsToEventMap.containsKey(orig)) {
                    // we need to update event operators with the cloned table scan
                    for (AppMasterEventOperator event : context.tsToEventMap.get(orig)) {
                        ((DynamicPruningEventDesc) event.getConf()).setTableScan((TableScanOperator) newRoot);
                    }
                }
                // This TableScanOperator could be part of semijoin optimization.
                Map<ReduceSinkOperator, TableScanOperator> rsOpToTsOpMap = context.parseContext.getRsOpToTsOpMap();
                for (ReduceSinkOperator rs : rsOpToTsOpMap.keySet()) {
                    if (rsOpToTsOpMap.get(rs) == orig) {
                        rsOpToTsOpMap.put(rs, (TableScanOperator) newRoot);
                    }
                }
            }
            context.rootToWorkMap.remove(orig);
            context.rootToWorkMap.put(newRoot, work);
        }
    }
    // now we remove all the unions. we throw away any branch that's not reachable from
    // the current set of roots. The reason is that those branches will be handled in
    // different tasks.
    Deque<Operator<?>> operators = new LinkedList<Operator<?>>();
    operators.addAll(newRoots);
    Set<Operator<?>> seen = new HashSet<Operator<?>>();
    while (!operators.isEmpty()) {
        Operator<?> current = operators.pop();
        seen.add(current);
        if (current instanceof FileSinkOperator) {
            FileSinkOperator fileSink = (FileSinkOperator) current;
            // remember it for additional processing later
            context.fileSinkSet.add(fileSink);
            FileSinkDesc desc = fileSink.getConf();
            Path path = desc.getDirName();
            List<FileSinkDesc> linked;
            if (!context.linkedFileSinks.containsKey(path)) {
                linked = new ArrayList<FileSinkDesc>();
                context.linkedFileSinks.put(path, linked);
            }
            linked = context.linkedFileSinks.get(path);
            linked.add(desc);
            desc.setDirName(new Path(path, "" + linked.size()));
            desc.setLinkedFileSink(true);
            desc.setParentDir(path);
            desc.setLinkedFileSinkDesc(linked);
        }
        if (current instanceof AppMasterEventOperator) {
            // remember for additional processing later
            context.eventOperatorSet.add((AppMasterEventOperator) current);
            // mark the original as abandoned. Don't need it anymore.
            context.abandonedEventOperatorSet.add((AppMasterEventOperator) replacementMap.inverse().get(current));
        }
        if (current instanceof UnionOperator) {
            Operator<?> parent = null;
            int count = 0;
            for (Operator<?> op : current.getParentOperators()) {
                if (seen.contains(op)) {
                    ++count;
                    parent = op;
                }
            }
            // we should have been able to reach the union from only one side.
            assert count <= 1;
            if (parent == null) {
                // root operator is union (can happen in reducers)
                replacementMap.put(current, current.getChildOperators().get(0));
            } else {
                parent.removeChildAndAdoptItsChildren(current);
            }
        }
        if (current instanceof FileSinkOperator || current instanceof ReduceSinkOperator) {
            current.setChildOperators(null);
        } else {
            operators.addAll(current.getChildOperators());
        }
    }
    LOG.debug("Setting dummy ops for work " + work.getName() + ": " + dummyOps);
    work.setDummyOps(dummyOps);
    work.replaceRoots(replacementMap);
}
Also used : ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) FilterOperator(org.apache.hadoop.hive.ql.exec.FilterOperator) MapJoinOperator(org.apache.hadoop.hive.ql.exec.MapJoinOperator) UnionOperator(org.apache.hadoop.hive.ql.exec.UnionOperator) FileSinkOperator(org.apache.hadoop.hive.ql.exec.FileSinkOperator) AppMasterEventOperator(org.apache.hadoop.hive.ql.exec.AppMasterEventOperator) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) Operator(org.apache.hadoop.hive.ql.exec.Operator) HashTableDummyOperator(org.apache.hadoop.hive.ql.exec.HashTableDummyOperator) Path(org.apache.hadoop.fs.Path) BiMap(com.google.common.collect.BiMap) HashBiMap(com.google.common.collect.HashBiMap)
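
The replacement bookkeeping above uses Guava's BiMap rather than a plain HashMap because both directions of the original-to-clone mapping are needed: work.replaceRoots consumes original -> clone, while the abandoned-event lookup goes clone -> original via replacementMap.inverse(). The following is only a tiny standalone sketch of that pattern, with illustrative string keys standing in for operators; it is plain Guava, not Hive code.

import com.google.common.collect.BiMap;
import com.google.common.collect.HashBiMap;

final class BiMapDemo {
    public static void main(String[] args) {
        // Record original -> cloned root, exactly one entry per root.
        BiMap<String, String> replacementMap = HashBiMap.create();
        replacementMap.put("TS_0 (original)", "TS_0 (clone)");

        // Forward view: used when replacing roots in the retained work.
        System.out.println(replacementMap.get("TS_0 (original)"));        // TS_0 (clone)
        // Inverse view: used when marking the original event operator as abandoned.
        System.out.println(replacementMap.inverse().get("TS_0 (clone)")); // TS_0 (original)
    }
}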

Example 8 with HashTableDummyOperator

use of org.apache.hadoop.hive.ql.exec.HashTableDummyOperator in project hive by apache.

the class GenSparkSkewJoinProcessor method insertSHTS.

/**
   * Insert SparkHashTableSink and HashTableDummy between small dir TS and MJ.
   */
@SuppressWarnings("unchecked")
private static void insertSHTS(byte tag, TableScanOperator tableScan, MapWork bigMapWork) {
    Preconditions.checkArgument(tableScan.getChildOperators().size() == 1 && tableScan.getChildOperators().get(0) instanceof MapJoinOperator);
    HashTableDummyDesc desc = new HashTableDummyDesc();
    HashTableDummyOperator dummyOp = (HashTableDummyOperator) OperatorFactory.get(tableScan.getCompilationOpContext(), desc);
    dummyOp.getConf().setTbl(tableScan.getTableDesc());
    MapJoinOperator mapJoinOp = (MapJoinOperator) tableScan.getChildOperators().get(0);
    mapJoinOp.replaceParent(tableScan, dummyOp);
    List<Operator<? extends OperatorDesc>> mapJoinChildren = new ArrayList<Operator<? extends OperatorDesc>>();
    mapJoinChildren.add(mapJoinOp);
    dummyOp.setChildOperators(mapJoinChildren);
    bigMapWork.addDummyOp(dummyOp);
    MapJoinDesc mjDesc = mapJoinOp.getConf();
    // mapjoin should not be affected by join reordering
    mjDesc.resetOrder();
    SparkHashTableSinkDesc hashTableSinkDesc = new SparkHashTableSinkDesc(mjDesc);
    SparkHashTableSinkOperator hashTableSinkOp = (SparkHashTableSinkOperator) OperatorFactory.get(tableScan.getCompilationOpContext(), hashTableSinkDesc);
    int[] valueIndex = mjDesc.getValueIndex(tag);
    if (valueIndex != null) {
        List<ExprNodeDesc> newValues = new ArrayList<ExprNodeDesc>();
        List<ExprNodeDesc> values = hashTableSinkDesc.getExprs().get(tag);
        for (int index = 0; index < values.size(); index++) {
            if (valueIndex[index] < 0) {
                newValues.add(values.get(index));
            }
        }
        hashTableSinkDesc.getExprs().put(tag, newValues);
    }
    tableScan.replaceChild(mapJoinOp, hashTableSinkOp);
    List<Operator<? extends OperatorDesc>> tableScanParents = new ArrayList<Operator<? extends OperatorDesc>>();
    tableScanParents.add(tableScan);
    hashTableSinkOp.setParentOperators(tableScanParents);
    hashTableSinkOp.getConf().setTag(tag);
}
Also used : MapJoinOperator(org.apache.hadoop.hive.ql.exec.MapJoinOperator) HashTableDummyOperator(org.apache.hadoop.hive.ql.exec.HashTableDummyOperator) SparkHashTableSinkOperator(org.apache.hadoop.hive.ql.exec.SparkHashTableSinkOperator) JoinOperator(org.apache.hadoop.hive.ql.exec.JoinOperator) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) Operator(org.apache.hadoop.hive.ql.exec.Operator) SparkHashTableSinkDesc(org.apache.hadoop.hive.ql.plan.SparkHashTableSinkDesc) HashTableDummyDesc(org.apache.hadoop.hive.ql.plan.HashTableDummyDesc) MapJoinDesc(org.apache.hadoop.hive.ql.plan.MapJoinDesc) ArrayList(java.util.ArrayList) ExprNodeDesc(org.apache.hadoop.hive.ql.plan.ExprNodeDesc) OperatorDesc(org.apache.hadoop.hive.ql.plan.OperatorDesc)
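
After insertSHTS runs, the small-table branch no longer feeds the map join directly: the TableScanOperator now ends in a SparkHashTableSinkOperator, while the HashTableDummyOperator registered on the big-table MapWork stands in as the map join's parent. The sketch below shows one way that resulting shape could be checked, using only accessors that appear in the listings above (getChildOperators, getParentOperators, getDummyOps); the helper class, method, and parameter names are assumptions, not part of GenSparkSkewJoinProcessor.

import org.apache.hadoop.hive.ql.exec.HashTableDummyOperator;
import org.apache.hadoop.hive.ql.exec.MapJoinOperator;
import org.apache.hadoop.hive.ql.exec.SparkHashTableSinkOperator;
import org.apache.hadoop.hive.ql.exec.TableScanOperator;
import org.apache.hadoop.hive.ql.plan.MapWork;

final class ShtsShapeCheck {

    // Hypothetical helper: verifies the operator wiring produced by insertSHTS.
    static void checkShape(TableScanOperator tableScan, MapJoinOperator mapJoinOp,
            HashTableDummyOperator dummyOp, SparkHashTableSinkOperator hashTableSinkOp,
            MapWork bigMapWork) {
        // The small-table TS now ends in the hash table sink, not the map join.
        if (tableScan.getChildOperators().size() != 1
                || tableScan.getChildOperators().get(0) != hashTableSinkOp) {
            throw new IllegalStateException("TS should feed the SparkHashTableSinkOperator");
        }
        // The map join reads the small table through the dummy parent instead of the TS.
        if (!mapJoinOp.getParentOperators().contains(dummyOp)
                || mapJoinOp.getParentOperators().contains(tableScan)) {
            throw new IllegalStateException("map join should have the dummy op as its parent");
        }
        // The dummy op must be registered on the big-table MapWork so it is initialized and
        // closed with the rest of the pipeline (see Examples 9 and 10 below).
        if (bigMapWork.getDummyOps() == null || !bigMapWork.getDummyOps().contains(dummyOp)) {
            throw new IllegalStateException("dummy op not registered on the big-table MapWork");
        }
    }
}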

Example 9 with HashTableDummyOperator

use of org.apache.hadoop.hive.ql.exec.HashTableDummyOperator in project hive by apache.

the class MapRecordProcessor method close.

@Override
void close() {
    // check if there are IOExceptions
    if (!isAborted()) {
        setAborted(execContext.getIoCxt().getIOExceptions());
    }
    if (cache != null && cacheKeys != null) {
        for (String k : cacheKeys) {
            cache.release(k);
        }
    }
    if (dynamicValueCache != null && dynamicValueCacheKeys != null) {
        for (String k : dynamicValueCacheKeys) {
            dynamicValueCache.release(k);
        }
    }
    // detecting failed executions by exceptions thrown by the operator tree
    try {
        if (mapOp == null || mapWork == null) {
            return;
        }
        boolean abort = isAborted();
        mapOp.close(abort);
        if (!mergeMapOpList.isEmpty()) {
            for (AbstractMapOperator mergeMapOp : mergeMapOpList) {
                mergeMapOp.close(abort);
            }
        }
        // Need to close the dummyOps as well. The operator pipeline
        // is not considered "closed/done" unless all operators are
        // done. For broadcast joins that includes the dummy parents.
        List<HashTableDummyOperator> dummyOps = mapWork.getDummyOps();
        if (dummyOps != null) {
            for (Operator<? extends OperatorDesc> dummyOp : dummyOps) {
                dummyOp.close(abort);
            }
        }
        ReportStats rps = new ReportStats(reporter, jconf);
        mapOp.preorderMap(rps);
        return;
    } catch (Exception e) {
        if (!isAborted()) {
            // signal new failure to map-reduce
            l4j.error("Hit error while closing operators - failing tree");
            throw new RuntimeException("Hive Runtime Error while closing operators", e);
        }
    } finally {
        Utilities.clearWorkMap(jconf);
        MapredContext.close();
    }
}
Also used : AbstractMapOperator(org.apache.hadoop.hive.ql.exec.AbstractMapOperator) ReportStats(org.apache.hadoop.hive.ql.exec.mr.ExecMapper.ReportStats) HashTableDummyOperator(org.apache.hadoop.hive.ql.exec.HashTableDummyOperator) IOException(java.io.IOException)

Example 10 with HashTableDummyOperator

use of org.apache.hadoop.hive.ql.exec.HashTableDummyOperator in project hive by apache.

the class MapRecordProcessor method init.

@Override
void init(MRTaskReporter mrReporter, Map<String, LogicalInput> inputs, Map<String, LogicalOutput> outputs) throws Exception {
    perfLogger.PerfLogBegin(CLASS_NAME, PerfLogger.TEZ_INIT_OPERATORS);
    super.init(mrReporter, inputs, outputs);
    checkAbortCondition();
    String key = processorContext.getTaskVertexName() + MAP_PLAN_KEY;
    cacheKeys.add(key);
    // create map and fetch operators
    mapWork = (MapWork) cache.retrieve(key, new Callable<Object>() {

        @Override
        public Object call() {
            return Utilities.getMapWork(jconf);
        }
    });
    // TODO HIVE-14042. Cleanup may be required if exiting early.
    Utilities.setMapWork(jconf, mapWork);
    String prefixes = jconf.get(DagUtils.TEZ_MERGE_WORK_FILE_PREFIXES);
    if (prefixes != null) {
        mergeWorkList = new ArrayList<MapWork>();
        for (final String prefix : prefixes.split(",")) {
            if (prefix == null || prefix.isEmpty()) {
                continue;
            }
            key = processorContext.getTaskVertexName() + prefix;
            cacheKeys.add(key);
            checkAbortCondition();
            mergeWorkList.add((MapWork) cache.retrieve(key, new Callable<Object>() {

                @Override
                public Object call() {
                    return Utilities.getMergeWork(jconf, prefix);
                }
            }));
        }
    }
    MapredContext.init(true, new JobConf(jconf));
    ((TezContext) MapredContext.get()).setInputs(inputs);
    ((TezContext) MapredContext.get()).setTezProcessorContext(processorContext);
    // Update JobConf using MRInput, info like filename comes via this
    checkAbortCondition();
    legacyMRInput = getMRInput(inputs);
    if (legacyMRInput != null) {
        Configuration updatedConf = legacyMRInput.getConfigUpdates();
        if (updatedConf != null) {
            for (Entry<String, String> entry : updatedConf) {
                jconf.set(entry.getKey(), entry.getValue());
            }
        }
    }
    checkAbortCondition();
    createOutputMap();
    // Start all the Outputs.
    for (Entry<String, LogicalOutput> outputEntry : outputs.entrySet()) {
        l4j.debug("Starting Output: " + outputEntry.getKey());
        outputEntry.getValue().start();
        ((TezKVOutputCollector) outMap.get(outputEntry.getKey())).initialize();
    }
    checkAbortCondition();
    try {
        CompilationOpContext runtimeCtx = new CompilationOpContext();
        if (mapWork.getVectorMode()) {
            mapOp = new VectorMapOperator(runtimeCtx);
        } else {
            mapOp = new MapOperator(runtimeCtx);
        }
        // Not synchronizing creation of mapOp with an invocation. Check immediately
        // after creation in case abort has been set.
        // Relying on the regular flow to clean up the actual operator. i.e. If an exception is
        // thrown, an attempt will be made to cleanup the op.
        // If we are here - exit out via an exception. If we're in the middle of the operator.initialize
        // call further down, we rely upon op.abort().
        checkAbortCondition();
        mapOp.clearConnectedOperators();
        mapOp.setExecContext(execContext);
        boolean fromCache = false;
        if (mergeWorkList != null) {
            AbstractMapOperator mergeMapOp = null;
            for (BaseWork mergeWork : mergeWorkList) {
                // TODO HIVE-14042. What is mergeWork, and why is it not part of the regular operator chain.
                // The mergeMapOp.initialize call further down can block, and will not receive information
                // about an abort request.
                MapWork mergeMapWork = (MapWork) mergeWork;
                if (mergeMapWork.getVectorMode()) {
                    mergeMapOp = new VectorMapOperator(runtimeCtx);
                } else {
                    mergeMapOp = new MapOperator(runtimeCtx);
                }
                mergeMapOpList.add(mergeMapOp);
                // initialize the merge operators first.
                if (mergeMapOp != null) {
                    mergeMapOp.setConf(mergeMapWork);
                    l4j.info("Input name is " + mergeMapWork.getName());
                    jconf.set(Utilities.INPUT_NAME, mergeMapWork.getName());
                    mergeMapOp.initialize(jconf, null);
                    // if there are no files/partitions to read, we need to skip trying to read
                    MultiMRInput multiMRInput = multiMRInputMap.get(mergeMapWork.getName());
                    boolean skipRead = false;
                    if (multiMRInput == null) {
                        l4j.info("Multi MR Input for work " + mergeMapWork.getName() + " is null. Skipping read.");
                        skipRead = true;
                    } else {
                        Collection<KeyValueReader> keyValueReaders = multiMRInput.getKeyValueReaders();
                        if ((keyValueReaders == null) || (keyValueReaders.isEmpty())) {
                            l4j.info("Key value readers are null or empty and hence skipping read. " + "KeyValueReaders = " + keyValueReaders);
                            skipRead = true;
                        }
                    }
                    if (skipRead) {
                        List<Operator<?>> children = new ArrayList<Operator<?>>();
                        children.addAll(mergeMapOp.getConf().getAliasToWork().values());
                        // do the same thing as setChildren when there is nothing to read.
                        // the setChildren method initializes the object inspector needed by the operators
                        // based on path and partition information which we don't have in this case.
                        mergeMapOp.initEmptyInputChildren(children, jconf);
                    } else {
                        // the setChildren method initializes the object inspector needed by the operators
                        // based on path and partition information.
                        mergeMapOp.setChildren(jconf);
                    }
                    Operator<? extends OperatorDesc> finalOp = getFinalOp(mergeMapOp);
                    if (finalOp instanceof TezDummyStoreOperator) {
                        // we ensure that we don't try to read any data in case of skip read.
                        ((TezDummyStoreOperator) finalOp).setFetchDone(skipRead);
                        mapOp.setConnectedOperators(mergeMapWork.getTag(), (DummyStoreOperator) finalOp);
                    } else {
                        // the plan is already connected, which means it was retrieved from the cache.
                        fromCache = true;
                    }
                    mergeMapOp.passExecContext(new ExecMapperContext(jconf));
                    mergeMapOp.initializeLocalWork(jconf);
                }
            }
        }
        if (!fromCache) {
            // if not from cache, we still need to hook up the plans.
            ((TezContext) (MapredContext.get())).setDummyOpsMap(mapOp.getConnectedOperators());
        }
        // initialize map operator
        mapOp.setConf(mapWork);
        l4j.info("Main input name is " + mapWork.getName());
        jconf.set(Utilities.INPUT_NAME, mapWork.getName());
        mapOp.initialize(jconf, null);
        checkAbortCondition();
        mapOp.setChildren(jconf);
        mapOp.passExecContext(execContext);
        l4j.info(mapOp.dump(0));
        // set memory available for operators
        long memoryAvailableToTask = processorContext.getTotalMemoryAvailableToTask();
        if (mapOp.getConf() != null) {
            mapOp.getConf().setMaxMemoryAvailable(memoryAvailableToTask);
            l4j.info("Memory available for operators set to {}", LlapUtil.humanReadableByteCount(memoryAvailableToTask));
        }
        OperatorUtils.setMemoryAvailable(mapOp.getChildOperators(), memoryAvailableToTask);
        mapOp.initializeLocalWork(jconf);
        // Setup values registry
        checkAbortCondition();
        String valueRegistryKey = DynamicValue.DYNAMIC_VALUE_REGISTRY_CACHE_KEY;
        // On LLAP dynamic value registry might already be cached.
        final DynamicValueRegistryTez registryTez = dynamicValueCache.retrieve(valueRegistryKey, new Callable<DynamicValueRegistryTez>() {

            @Override
            public DynamicValueRegistryTez call() {
                return new DynamicValueRegistryTez();
            }
        });
        dynamicValueCacheKeys.add(valueRegistryKey);
        RegistryConfTez registryConf = new RegistryConfTez(jconf, mapWork, processorContext, inputs);
        registryTez.init(registryConf);
        checkAbortCondition();
        initializeMapRecordSources();
        mapOp.initializeMapOperator(jconf);
        if (mergeMapOpList != null && !mergeMapOpList.isEmpty()) {
            for (AbstractMapOperator mergeMapOp : mergeMapOpList) {
                jconf.set(Utilities.INPUT_NAME, mergeMapOp.getConf().getName());
                // TODO HIVE-14042. abort handling: Handling of mergeMapOp
                mergeMapOp.initializeMapOperator(jconf);
            }
        }
        // Initialization isn't finished until all parents of all operators
        // are initialized. For broadcast joins that means initializing the
        // dummy parent operators as well.
        List<HashTableDummyOperator> dummyOps = mapWork.getDummyOps();
        jconf.set(Utilities.INPUT_NAME, mapWork.getName());
        if (dummyOps != null) {
            for (Operator<? extends OperatorDesc> dummyOp : dummyOps) {
                dummyOp.setExecContext(execContext);
                // TODO HIVE-14042. Handling of dummyOps, and propagating abort information to them
                dummyOp.initialize(jconf, null);
            }
        }
        OperatorUtils.setChildrenCollector(mapOp.getChildOperators(), outMap);
        mapOp.setReporter(reporter);
        MapredContext.get().setReporter(reporter);
    } catch (Throwable e) {
        setAborted(true);
        if (e instanceof OutOfMemoryError) {
            // Don't create a new object if we are already out of memory
            throw (OutOfMemoryError) e;
        } else if (e instanceof InterruptedException) {
            l4j.info("Hit an interrupt while initializing MapRecordProcessor. Message={}", e.getMessage());
            throw (InterruptedException) e;
        } else {
            throw new RuntimeException("Map operator initialization failed", e);
        }
    }
    perfLogger.PerfLogEnd(CLASS_NAME, PerfLogger.TEZ_INIT_OPERATORS);
}
Also used : AbstractMapOperator(org.apache.hadoop.hive.ql.exec.AbstractMapOperator) HashTableDummyOperator(org.apache.hadoop.hive.ql.exec.HashTableDummyOperator) TezDummyStoreOperator(org.apache.hadoop.hive.ql.exec.TezDummyStoreOperator) MapOperator(org.apache.hadoop.hive.ql.exec.MapOperator) VectorMapOperator(org.apache.hadoop.hive.ql.exec.vector.VectorMapOperator) Operator(org.apache.hadoop.hive.ql.exec.Operator) DummyStoreOperator(org.apache.hadoop.hive.ql.exec.DummyStoreOperator) Configuration(org.apache.hadoop.conf.Configuration) KeyValueReader(org.apache.tez.runtime.library.api.KeyValueReader) ArrayList(java.util.ArrayList) JobConf(org.apache.hadoop.mapred.JobConf) BaseWork(org.apache.hadoop.hive.ql.plan.BaseWork) MultiMRInput(org.apache.tez.mapreduce.input.MultiMRInput) ExecMapperContext(org.apache.hadoop.hive.ql.exec.mr.ExecMapperContext) LogicalOutput(org.apache.tez.runtime.api.LogicalOutput) TezKVOutputCollector(org.apache.hadoop.hive.ql.exec.tez.TezProcessor.TezKVOutputCollector) MapWork(org.apache.hadoop.hive.ql.plan.MapWork) CompilationOpContext(org.apache.hadoop.hive.ql.CompilationOpContext) RegistryConfTez(org.apache.hadoop.hive.ql.exec.tez.DynamicValueRegistryTez.RegistryConfTez)
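
Taken together, Examples 9 and 10 show the contract around MapWork.getDummyOps(): the HashTableDummyOperator parents are not reachable through the main operator chain, so the record processor has to initialize them explicitly before the pipeline runs and close them explicitly afterwards, otherwise a broadcast-join pipeline never reaches its "done" state. The sketch below condenses that lifecycle into two hypothetical helpers, using only calls that appear in those two examples; it is a simplification for illustration, not a replacement for MapRecordProcessor.

import java.util.List;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.hive.ql.exec.HashTableDummyOperator;
import org.apache.hadoop.hive.ql.exec.mr.ExecMapperContext;
import org.apache.hadoop.hive.ql.plan.MapWork;

final class DummyOpLifecycle {

    // Mirrors the init() half: hand each dummy parent the execution context and initialize it.
    static void initDummyOps(MapWork mapWork, Configuration jconf, ExecMapperContext execContext)
            throws Exception {
        List<HashTableDummyOperator> dummyOps = mapWork.getDummyOps();
        if (dummyOps != null) {
            for (HashTableDummyOperator dummyOp : dummyOps) {
                dummyOp.setExecContext(execContext);
                dummyOp.initialize(jconf, null);
            }
        }
    }

    // Mirrors the close() half: the pipeline only counts as done once the dummy parents close too.
    static void closeDummyOps(MapWork mapWork, boolean abort) throws Exception {
        List<HashTableDummyOperator> dummyOps = mapWork.getDummyOps();
        if (dummyOps != null) {
            for (HashTableDummyOperator dummyOp : dummyOps) {
                dummyOp.close(abort);
            }
        }
    }
}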

Aggregations

HashTableDummyOperator (org.apache.hadoop.hive.ql.exec.HashTableDummyOperator): 10
Operator (org.apache.hadoop.hive.ql.exec.Operator): 8
BaseWork (org.apache.hadoop.hive.ql.plan.BaseWork): 7
ArrayList (java.util.ArrayList): 6
MapJoinOperator (org.apache.hadoop.hive.ql.exec.MapJoinOperator): 6
ReduceSinkOperator (org.apache.hadoop.hive.ql.exec.ReduceSinkOperator): 5
TableScanOperator (org.apache.hadoop.hive.ql.exec.TableScanOperator): 4
OperatorDesc (org.apache.hadoop.hive.ql.plan.OperatorDesc): 4
LinkedList (java.util.LinkedList): 3
DummyStoreOperator (org.apache.hadoop.hive.ql.exec.DummyStoreOperator): 3
RowSchema (org.apache.hadoop.hive.ql.exec.RowSchema): 3
ExprNodeDesc (org.apache.hadoop.hive.ql.plan.ExprNodeDesc): 3
HashTableDummyDesc (org.apache.hadoop.hive.ql.plan.HashTableDummyDesc): 3
MapJoinDesc (org.apache.hadoop.hive.ql.plan.MapJoinDesc): 3
List (java.util.List): 2
AbstractMapOperator (org.apache.hadoop.hive.ql.exec.AbstractMapOperator): 2
SparkHashTableSinkOperator (org.apache.hadoop.hive.ql.exec.SparkHashTableSinkOperator): 2
ReportStats (org.apache.hadoop.hive.ql.exec.mr.ExecMapper.ReportStats): 2
RegistryConfTez (org.apache.hadoop.hive.ql.exec.tez.DynamicValueRegistryTez.RegistryConfTez): 2
SemanticException (org.apache.hadoop.hive.ql.parse.SemanticException): 2