Example 86 with Operator

use of org.apache.hadoop.hive.ql.exec.Operator in project hive by apache.

the class GenSparkWork method process.

@Override
public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procContext, Object... nodeOutputs) throws SemanticException {
    GenSparkProcContext context = (GenSparkProcContext) procContext;
    Preconditions.checkArgument(context != null, "AssertionError: expected context to be not null");
    Preconditions.checkArgument(context.currentTask != null, "AssertionError: expected context.currentTask to be not null");
    Preconditions.checkArgument(context.currentRootOperator != null, "AssertionError: expected context.currentRootOperator to be not null");
    // Operator is a file sink or reduce sink. Something that forces a new vertex.
    @SuppressWarnings("unchecked") Operator<? extends OperatorDesc> operator = (Operator<? extends OperatorDesc>) nd;
    // root is the start of the operator pipeline we're currently
    // packing into a vertex, typically a table scan, union or join
    Operator<?> root = context.currentRootOperator;
    LOG.debug("Root operator: " + root);
    LOG.debug("Leaf operator: " + operator);
    SparkWork sparkWork = context.currentTask.getWork();
    SMBMapJoinOperator smbOp = GenSparkUtils.getChildOperator(root, SMBMapJoinOperator.class);
    // Right now the work graph is pretty simple. If there is no
    // preceding work, we have a root and will generate a map
    // vertex. If there is preceding work, we will generate
    // a reduce vertex.
    BaseWork work;
    if (context.rootToWorkMap.containsKey(root)) {
        // having seen the root operator before means there was a branch in the
        // operator graph. There's typically two reasons for that: a) mux/demux
        // b) multi insert. Mux/Demux will hit the same leaf again, multi insert
        // will result into a vertex with multiple FS or RS operators.
        // At this point we don't have to do anything special in this case. Just
        // run through the regular paces w/o creating a new task.
        work = context.rootToWorkMap.get(root);
    } else {
        // create a new vertex
        if (context.preceedingWork == null) {
            if (smbOp == null) {
                work = utils.createMapWork(context, root, sparkWork, null);
            } else {
                //save work to be initialized later with SMB information.
                work = utils.createMapWork(context, root, sparkWork, null, true);
                context.smbMapJoinCtxMap.get(smbOp).mapWork = (MapWork) work;
            }
        } else {
            work = utils.createReduceWork(context, root, sparkWork);
        }
        context.rootToWorkMap.put(root, work);
    }
    if (!context.childToWorkMap.containsKey(operator)) {
        List<BaseWork> workItems = new LinkedList<BaseWork>();
        workItems.add(work);
        context.childToWorkMap.put(operator, workItems);
    } else {
        context.childToWorkMap.get(operator).add(work);
    }
    // remember which mapjoin operator links with which work
    if (!context.currentMapJoinOperators.isEmpty()) {
        for (MapJoinOperator mj : context.currentMapJoinOperators) {
            LOG.debug("Processing map join: " + mj);
            // remember the mapping in case we scan another branch of the mapjoin later
            if (!context.mapJoinWorkMap.containsKey(mj)) {
                List<BaseWork> workItems = new LinkedList<BaseWork>();
                workItems.add(work);
                context.mapJoinWorkMap.put(mj, workItems);
            } else {
                context.mapJoinWorkMap.get(mj).add(work);
            }
            /*
         * this happens in case of map join operations.
         * The tree looks like this:
         *
         *        RS <--- we are here perhaps
         *        |
         *     MapJoin
         *     /     \
         *   RS       TS
         *  /
         * TS
         *
         * If we are at the RS pointed to above and have already visited the
         * RS following the TS, then work has already been generated for the
         * TS-RS branch, and we need to hook the current work to that work.
         */
            if (context.linkOpWithWorkMap.containsKey(mj)) {
                Map<BaseWork, SparkEdgeProperty> linkWorkMap = context.linkOpWithWorkMap.get(mj);
                if (linkWorkMap != null) {
                    if (context.linkChildOpWithDummyOp.containsKey(mj)) {
                        for (Operator<?> dummy : context.linkChildOpWithDummyOp.get(mj)) {
                            work.addDummyOp((HashTableDummyOperator) dummy);
                        }
                    }
                    for (Entry<BaseWork, SparkEdgeProperty> parentWorkMap : linkWorkMap.entrySet()) {
                        BaseWork parentWork = parentWorkMap.getKey();
                        LOG.debug("connecting " + parentWork.getName() + " with " + work.getName());
                        SparkEdgeProperty edgeProp = parentWorkMap.getValue();
                        sparkWork.connect(parentWork, work, edgeProp);
                        // set up the output name for each linked reduce sink, now that we know the name
                        // of the downstream work
                        for (ReduceSinkOperator r : context.linkWorkWithReduceSinkMap.get(parentWork)) {
                            if (r.getConf().getOutputName() != null) {
                                LOG.debug("Cloning reduce sink for multi-child broadcast edge");
                                // we've already set this one up. Need to clone for the next work.
                                r = (ReduceSinkOperator) OperatorFactory.getAndMakeChild(r.getCompilationOpContext(), (ReduceSinkDesc) r.getConf().clone(), r.getParentOperators());
                            }
                            r.getConf().setOutputName(work.getName());
                        }
                    }
                }
            }
        }
        // clear out the set. we don't need it anymore.
        context.currentMapJoinOperators.clear();
    }
    // Cut the operator tree here: disconnect the root from its reduce sink parents,
    // remembering enough information to later connect the work feeding those reduce
    // sinks with the work associated with this root operator.
    if (root.getNumParent() > 0) {
        Preconditions.checkArgument(work instanceof ReduceWork, "AssertionError: expected work to be a ReduceWork, but was " + work.getClass().getName());
        ReduceWork reduceWork = (ReduceWork) work;
        for (Operator<?> parent : new ArrayList<Operator<?>>(root.getParentOperators())) {
            Preconditions.checkArgument(parent instanceof ReduceSinkOperator, "AssertionError: expected operator to be a ReduceSinkOperator, but was " + parent.getClass().getName());
            ReduceSinkOperator rsOp = (ReduceSinkOperator) parent;
            SparkEdgeProperty edgeProp = GenSparkUtils.getEdgeProperty(rsOp, reduceWork);
            rsOp.getConf().setOutputName(reduceWork.getName());
            GenMapRedUtils.setKeyAndValueDesc(reduceWork, rsOp);
            context.leafOpToFollowingWorkInfo.put(rsOp, ObjectPair.create(edgeProp, reduceWork));
            LOG.debug("Removing " + parent + " as parent from " + root);
            root.removeParent(parent);
        }
    }
    // Remember which work contains union operators, so that we can remove
    // the union operators from the operator tree later.
    if (!context.currentUnionOperators.isEmpty()) {
        context.currentUnionOperators.clear();
        context.workWithUnionOperators.add(work);
    }
    // Note: the concept of leaf and root is reversed in Hive for historical
    // reasons. Roots are data sources, leaves are data sinks. I know.
    if (context.leafOpToFollowingWorkInfo.containsKey(operator)) {
        ObjectPair<SparkEdgeProperty, ReduceWork> childWorkInfo = context.leafOpToFollowingWorkInfo.get(operator);
        SparkEdgeProperty edgeProp = childWorkInfo.getFirst();
        ReduceWork childWork = childWorkInfo.getSecond();
        LOG.debug("Second pass. Leaf operator: " + operator + " has common downstream work:" + childWork);
        // The same leaf can be reachable from the table scan through more than one path
        // (e.g. via lateral views), so we don't want to connect it with the work
        // associated with the TS more than once.
        if (sparkWork.getEdgeProperty(work, childWork) == null) {
            sparkWork.connect(work, childWork, edgeProp);
        } else {
            LOG.debug("work " + work.getName() + " is already connected to " + childWork.getName() + " before");
        }
    } else {
        LOG.debug("First pass. Leaf operator: " + operator);
    }
    // No children means we have reached a leaf. If there are more operators to scan,
    // the next item will be a new root.
    if (!operator.getChildOperators().isEmpty()) {
        Preconditions.checkArgument(operator.getChildOperators().size() == 1, "AssertionError: expected operator.getChildOperators().size() to be 1, but was " + operator.getChildOperators().size());
        context.parentOfRoot = operator;
        context.currentRootOperator = operator.getChildOperators().get(0);
        context.preceedingWork = work;
    }
    return null;
}
Also used : ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) MapJoinOperator(org.apache.hadoop.hive.ql.exec.MapJoinOperator) SMBMapJoinOperator(org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) Operator(org.apache.hadoop.hive.ql.exec.Operator) HashTableDummyOperator(org.apache.hadoop.hive.ql.exec.HashTableDummyOperator) ArrayList(java.util.ArrayList) SparkWork(org.apache.hadoop.hive.ql.plan.SparkWork) ReduceWork(org.apache.hadoop.hive.ql.plan.ReduceWork) LinkedList(java.util.LinkedList) SparkEdgeProperty(org.apache.hadoop.hive.ql.plan.SparkEdgeProperty) OperatorDesc(org.apache.hadoop.hive.ql.plan.OperatorDesc) BaseWork(org.apache.hadoop.hive.ql.plan.BaseWork)
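
The childToWorkMap and mapJoinWorkMap bookkeeping in this example repeats the same containsKey/new-LinkedList/put idiom. Below is a minimal, self-contained sketch of that multimap pattern using java.util.Map.computeIfAbsent; the Op and Work classes are hypothetical stand-ins for Hive's Operator and BaseWork, not the real API.

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class MultimapIdiomSketch {

    // Hypothetical stand-ins for Operator and BaseWork, used only for illustration.
    static class Op { final String name; Op(String name) { this.name = name; } }
    static class Work { final String name; Work(String name) { this.name = name; } }

    public static void main(String[] args) {
        Map<Op, List<Work>> childToWorkMap = new HashMap<>();
        Op fileSink = new Op("FS_3");
        Work mapWork = new Work("Map 1");
        Work reduceWork = new Work("Reducer 2");

        // Equivalent to the containsKey/put/add sequence in GenSparkWork.process:
        // create the list on first use, append on every later visit.
        childToWorkMap.computeIfAbsent(fileSink, k -> new ArrayList<>()).add(mapWork);
        childToWorkMap.computeIfAbsent(fileSink, k -> new ArrayList<>()).add(reduceWork);

        // The leaf operator now maps to both pieces of work, mirroring how a
        // multi-insert query can associate one sink with several vertices.
        System.out.println(fileSink.name + " -> " + childToWorkMap.get(fileSink).size() + " work items");
    }
}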

Example 87 with Operator

use of org.apache.hadoop.hive.ql.exec.Operator in project hive by apache.

the class SetSparkReducerParallelism method process.

@Override
public Object process(Node nd, Stack<Node> stack, NodeProcessorCtx procContext, Object... nodeOutputs) throws SemanticException {
    OptimizeSparkProcContext context = (OptimizeSparkProcContext) procContext;
    ReduceSinkOperator sink = (ReduceSinkOperator) nd;
    ReduceSinkDesc desc = sink.getConf();
    Set<ReduceSinkOperator> parentSinks = null;
    int maxReducers = context.getConf().getIntVar(HiveConf.ConfVars.MAXREDUCERS);
    int constantReducers = context.getConf().getIntVar(HiveConf.ConfVars.HADOOPNUMREDUCERS);
    if (!useOpStats) {
        parentSinks = OperatorUtils.findOperatorsUpstream(sink, ReduceSinkOperator.class);
        parentSinks.remove(sink);
        if (!context.getVisitedReduceSinks().containsAll(parentSinks)) {
            // We haven't processed all the parent sinks, and we need
            // them to be done in order to compute the parallelism for this sink.
            // In this case, skip. We should visit this again from another path.
            LOG.debug("Skipping sink " + sink + " for now as we haven't seen all its parents.");
            return false;
        }
    }
    if (context.getVisitedReduceSinks().contains(sink)) {
        // skip walking the children
        LOG.debug("Already processed reduce sink: " + sink.getName());
        return true;
    }
    context.getVisitedReduceSinks().add(sink);
    if (needSetParallelism(sink, context.getConf())) {
        if (constantReducers > 0) {
            LOG.info("Parallelism for reduce sink " + sink + " set by user to " + constantReducers);
            desc.setNumReducers(constantReducers);
        } else {
            //If it's a FileSink to bucketed files, use the bucket count as the reducer number
            FileSinkOperator fso = GenSparkUtils.getChildOperator(sink, FileSinkOperator.class);
            if (fso != null) {
                String bucketCount = fso.getConf().getTableInfo().getProperties().getProperty(hive_metastoreConstants.BUCKET_COUNT);
                int numBuckets = bucketCount == null ? 0 : Integer.parseInt(bucketCount);
                if (numBuckets > 0) {
                    LOG.info("Set parallelism for reduce sink " + sink + " to: " + numBuckets + " (buckets)");
                    desc.setNumReducers(numBuckets);
                    return false;
                }
            }
            long numberOfBytes = 0;
            if (useOpStats) {
                // we need to add up all the estimates from the siblings of this reduce sink
                for (Operator<? extends OperatorDesc> sibling : sink.getChildOperators().get(0).getParentOperators()) {
                    if (sibling.getStatistics() != null) {
                        numberOfBytes += sibling.getStatistics().getDataSize();
                        if (LOG.isDebugEnabled()) {
                            LOG.debug("Sibling " + sibling + " has stats: " + sibling.getStatistics());
                        }
                    } else {
                        LOG.warn("No stats available from: " + sibling);
                    }
                }
            } else if (parentSinks.isEmpty()) {
                // we should use TS stats to infer parallelism
                for (Operator<? extends OperatorDesc> sibling : sink.getChildOperators().get(0).getParentOperators()) {
                    Set<TableScanOperator> sources = OperatorUtils.findOperatorsUpstream(sibling, TableScanOperator.class);
                    for (TableScanOperator source : sources) {
                        if (source.getStatistics() != null) {
                            numberOfBytes += source.getStatistics().getDataSize();
                            if (LOG.isDebugEnabled()) {
                                LOG.debug("Table source " + source + " has stats: " + source.getStatistics());
                            }
                        } else {
                            LOG.warn("No stats available from table source: " + source);
                        }
                    }
                }
                LOG.debug("Gathered stats for sink " + sink + ". Total size is " + numberOfBytes + " bytes.");
            } else {
                // Use the maximum parallelism from all parent reduce sinks
                int numberOfReducers = 0;
                for (ReduceSinkOperator parent : parentSinks) {
                    numberOfReducers = Math.max(numberOfReducers, parent.getConf().getNumReducers());
                }
                desc.setNumReducers(numberOfReducers);
                LOG.debug("Set parallelism for sink " + sink + " to " + numberOfReducers + " based on its parents");
                return false;
            }
            // Divide it by 2 so that we can have more reducers
            long bytesPerReducer = context.getConf().getLongVar(HiveConf.ConfVars.BYTESPERREDUCER) / 2;
            int numReducers = Utilities.estimateReducers(numberOfBytes, bytesPerReducer, maxReducers, false);
            getSparkMemoryAndCores(context);
            if (sparkMemoryAndCores != null && sparkMemoryAndCores.getFirst() > 0 && sparkMemoryAndCores.getSecond() > 0) {
                // warn the user if bytes per reducer is much larger than memory per task
                if ((double) sparkMemoryAndCores.getFirst() / bytesPerReducer < 0.5) {
                    LOG.warn("Average load of a reducer is much larger than its available memory. " + "Consider decreasing hive.exec.reducers.bytes.per.reducer");
                }
                // If there are more cores, use the number of cores
                numReducers = Math.max(numReducers, sparkMemoryAndCores.getSecond());
            }
            numReducers = Math.min(numReducers, maxReducers);
            LOG.info("Set parallelism for reduce sink " + sink + " to: " + numReducers + " (calculated)");
            desc.setNumReducers(numReducers);
        }
    } else {
        LOG.info("Number of reducers for sink " + sink + " was already determined to be: " + desc.getNumReducers());
    }
    return false;
}
Also used : ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) FileSinkOperator(org.apache.hadoop.hive.ql.exec.FileSinkOperator) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) Operator(org.apache.hadoop.hive.ql.exec.Operator) LimitOperator(org.apache.hadoop.hive.ql.exec.LimitOperator) Set(java.util.Set) OptimizeSparkProcContext(org.apache.hadoop.hive.ql.parse.spark.OptimizeSparkProcContext) ReduceSinkDesc(org.apache.hadoop.hive.ql.plan.ReduceSinkDesc) OperatorDesc(org.apache.hadoop.hive.ql.plan.OperatorDesc)
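
The parallelism calculation in this example reduces to simple arithmetic: estimated input bytes divided by a halved bytes-per-reducer target, raised to the core count when the cluster has more cores, and capped by the configured maximum. The sketch below approximates that arithmetic with a hypothetical estimateReducers helper (a clamped ceiling division); the real Utilities.estimateReducers supports more options, such as power-of-two rounding, and the constants shown are illustrative values, not Hive defaults.

public class ReducerEstimateSketch {

    // Approximation of the estimate used above: ceil(totalBytes / bytesPerReducer),
    // clamped to at least 1 and at most maxReducers. This is a hypothetical helper,
    // not the real Utilities.estimateReducers.
    static int estimateReducers(long totalBytes, long bytesPerReducer, int maxReducers) {
        int reducers = (int) Math.ceil((double) totalBytes / bytesPerReducer);
        reducers = Math.max(1, reducers);
        return Math.min(maxReducers, reducers);
    }

    public static void main(String[] args) {
        long numberOfBytes = 6L * 1024 * 1024 * 1024;    // 6 GB of estimated input
        long bytesPerReducer = (256L * 1024 * 1024) / 2; // e.g. 256 MB, halved as in the rule above
        int maxReducers = 1009;                          // cap corresponding to hive.exec.reducers.max (illustrative)
        int availableCores = 40;                         // hypothetical cluster core count

        int numReducers = estimateReducers(numberOfBytes, bytesPerReducer, maxReducers);
        // If the cluster has more cores than the estimate, use the number of cores ...
        numReducers = Math.max(numReducers, availableCores);
        // ... but never exceed the configured maximum.
        numReducers = Math.min(numReducers, maxReducers);

        System.out.println("Estimated reducers: " + numReducers); // 48 for these inputs
    }
}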

Example 88 with Operator

use of org.apache.hadoop.hive.ql.exec.Operator in project hive by apache.

the class SparkSMBJoinHintOptimizer method removeSmallTableReduceSink.

/**
   * In a bucket map join, ReduceSink operators mark the small-table parents (the reduce
   * sink on the big-table side has already been removed). In an SMB join, reduce sinks
   * are not expected for any parent, whether from the small tables or the big table.
   * @param mapJoinOp
   */
@SuppressWarnings("unchecked")
private void removeSmallTableReduceSink(MapJoinOperator mapJoinOp) {
    SMBJoinDesc smbJoinDesc = new SMBJoinDesc(mapJoinOp.getConf());
    List<Operator<? extends OperatorDesc>> parentOperators = mapJoinOp.getParentOperators();
    for (int i = 0; i < parentOperators.size(); i++) {
        Operator<? extends OperatorDesc> par = parentOperators.get(i);
        if (i != smbJoinDesc.getPosBigTable()) {
            if (par instanceof ReduceSinkOperator) {
                List<Operator<? extends OperatorDesc>> grandParents = par.getParentOperators();
                Preconditions.checkArgument(grandParents.size() == 1, "AssertionError: expect # of parents to be 1, but was " + grandParents.size());
                Operator<? extends OperatorDesc> grandParent = grandParents.get(0);
                grandParent.removeChild(par);
                grandParent.setChildOperators(Utilities.makeList(mapJoinOp));
                mapJoinOp.getParentOperators().set(i, grandParent);
            }
        }
    }
}
Also used : ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) MapJoinOperator(org.apache.hadoop.hive.ql.exec.MapJoinOperator) Operator(org.apache.hadoop.hive.ql.exec.Operator) SMBJoinDesc(org.apache.hadoop.hive.ql.plan.SMBJoinDesc) OperatorDesc(org.apache.hadoop.hive.ql.plan.OperatorDesc)
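
The rewiring in removeSmallTableReduceSink is a generic "splice a node out of a doubly linked operator DAG" step: the grandparent drops the reduce sink, adopts the join as its child, and the join replaces the reduce sink in its parent list. Below is a minimal sketch of that step with a hypothetical Op class; it is not Hive's Operator API, and the real code replaces the grandparent's whole child list rather than editing it in place.

import java.util.ArrayList;
import java.util.List;

public class BypassNodeSketch {

    // Hypothetical operator node with parent/child links, for illustration only.
    static class Op {
        final String name;
        final List<Op> parents = new ArrayList<>();
        final List<Op> children = new ArrayList<>();
        Op(String name) { this.name = name; }
    }

    // Splice `middle` out between `grandParent` and `child`: the grandparent
    // now feeds `child` directly, analogous to the small-table rewiring above.
    static void bypass(Op grandParent, Op middle, Op child) {
        grandParent.children.remove(middle);
        grandParent.children.add(child);
        int pos = child.parents.indexOf(middle);
        child.parents.set(pos, grandParent);
    }

    public static void main(String[] args) {
        Op tableScan = new Op("TS_small");
        Op reduceSink = new Op("RS_small");
        Op mapJoin = new Op("SMB_MAPJOIN");

        // Build TS -> RS -> MAPJOIN.
        tableScan.children.add(reduceSink);
        reduceSink.parents.add(tableScan);
        reduceSink.children.add(mapJoin);
        mapJoin.parents.add(reduceSink);

        bypass(tableScan, reduceSink, mapJoin);

        // The table scan now feeds the join directly.
        System.out.println(tableScan.name + " -> " + tableScan.children.get(0).name); // TS_small -> SMB_MAPJOIN
        System.out.println(mapJoin.parents.get(0).name);                              // TS_small
    }
}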

Example 89 with Operator

use of org.apache.hadoop.hive.ql.exec.Operator in project hive by apache.

the class SplitSparkWorkResolver method setStatistics.

// we lost statistics & opTraits through cloning, try to get them back
private void setStatistics(BaseWork origin, BaseWork clone) {
    if (origin instanceof MapWork && clone instanceof MapWork) {
        MapWork originMW = (MapWork) origin;
        MapWork cloneMW = (MapWork) clone;
        for (Map.Entry<String, Operator<? extends OperatorDesc>> entry : originMW.getAliasToWork().entrySet()) {
            String alias = entry.getKey();
            Operator<? extends OperatorDesc> cloneOP = cloneMW.getAliasToWork().get(alias);
            if (cloneOP != null) {
                setStatistics(entry.getValue(), cloneOP);
            }
        }
    } else if (origin instanceof ReduceWork && clone instanceof ReduceWork) {
        setStatistics(((ReduceWork) origin).getReducer(), ((ReduceWork) clone).getReducer());
    }
}
Also used : ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) Operator(org.apache.hadoop.hive.ql.exec.Operator) MapWork(org.apache.hadoop.hive.ql.plan.MapWork) ReduceWork(org.apache.hadoop.hive.ql.plan.ReduceWork) Map(java.util.Map) OperatorDesc(org.apache.hadoop.hive.ql.plan.OperatorDesc)
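
setStatistics(BaseWork, BaseWork) above delegates to an operator-level overload that is not shown here. The sketch below shows one plausible shape for such a copy, walking two structurally identical trees in lockstep; the Op class and its string payload are hypothetical stand-ins, and the real overload works on Hive Operators and also restores opTraits.

import java.util.ArrayList;
import java.util.List;

public class CopyStatsSketch {

    // Hypothetical operator node carrying a statistics payload, for illustration only.
    static class Op {
        String stats;                      // stand-in for Hive's Statistics object
        final List<Op> children = new ArrayList<>();
    }

    // Walk the original tree and its clone in lockstep and copy statistics
    // from the original onto the clone, the kind of restoration the
    // operator-level setStatistics overload presumably performs.
    static void copyStats(Op origin, Op clone) {
        if (origin == null || clone == null) {
            return;
        }
        clone.stats = origin.stats;
        int n = Math.min(origin.children.size(), clone.children.size());
        for (int i = 0; i < n; i++) {
            copyStats(origin.children.get(i), clone.children.get(i));
        }
    }

    public static void main(String[] args) {
        Op origin = new Op();
        origin.stats = "numRows=1000 dataSize=64000";
        Op originChild = new Op();
        originChild.stats = "numRows=100 dataSize=6400";
        origin.children.add(originChild);

        Op clone = new Op();               // the cloned tree lost its statistics
        clone.children.add(new Op());

        copyStats(origin, clone);
        System.out.println(clone.children.get(0).stats); // numRows=100 dataSize=6400
    }
}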

Example 90 with Operator

use of org.apache.hadoop.hive.ql.exec.Operator in project hive by apache.

the class HashTableLoader method loadDirectly.

private void loadDirectly(MapJoinTableContainer[] mapJoinTables, String inputFileName) throws Exception {
    MapredLocalWork localWork = context.getLocalWork();
    List<Operator<?>> directWorks = localWork.getDirectFetchOp().get(joinOp);
    if (directWorks == null || directWorks.isEmpty()) {
        return;
    }
    JobConf job = new JobConf(hconf);
    MapredLocalTask localTask = new MapredLocalTask(localWork, job, false);
    HashTableSinkOperator sink = new TemporaryHashSinkOperator(new CompilationOpContext(), desc);
    sink.setParentOperators(new ArrayList<Operator<? extends OperatorDesc>>(directWorks));
    for (Operator<?> operator : directWorks) {
        if (operator != null) {
            operator.setChildOperators(Arrays.<Operator<? extends OperatorDesc>>asList(sink));
        }
    }
    localTask.setExecContext(context);
    localTask.startForward(inputFileName);
    MapJoinTableContainer[] tables = sink.getMapJoinTables();
    for (int i = 0; i < sink.getNumParent(); i++) {
        if (sink.getParentOperators().get(i) != null) {
            mapJoinTables[i] = tables[i];
        }
    }
    Arrays.fill(tables, null);
}
Also used : MapJoinOperator(org.apache.hadoop.hive.ql.exec.MapJoinOperator) HashTableSinkOperator(org.apache.hadoop.hive.ql.exec.HashTableSinkOperator) Operator(org.apache.hadoop.hive.ql.exec.Operator) TemporaryHashSinkOperator(org.apache.hadoop.hive.ql.exec.TemporaryHashSinkOperator) CompilationOpContext(org.apache.hadoop.hive.ql.CompilationOpContext) MapredLocalWork(org.apache.hadoop.hive.ql.plan.MapredLocalWork) MapJoinTableContainer(org.apache.hadoop.hive.ql.exec.persistence.MapJoinTableContainer) JobConf(org.apache.hadoop.mapred.JobConf) OperatorDesc(org.apache.hadoop.hive.ql.plan.OperatorDesc)
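
loadDirectly follows a small pattern: splice a temporary sink under a set of producer operators, run them through a local task, then harvest one result per parent slot. The sketch below illustrates only that per-parent collection pattern with hypothetical Producer and Sink classes; it is not Hive's HashTableSinkOperator API, and the indices mirror how the method copies sink.getMapJoinTables() into mapJoinTables by parent position.

import java.util.Arrays;
import java.util.List;

public class TemporarySinkSketch {

    // Hypothetical producer that contributes some rows when run.
    static class Producer {
        final String name;
        final int rows;
        Producer(String name, int rows) { this.name = name; this.rows = rows; }
    }

    // Hypothetical sink that keeps one result per parent position,
    // loosely analogous to the per-parent hash tables of the real sink.
    static class Sink {
        final Producer[] parents;
        final long[] tables;
        Sink(List<Producer> parents) {
            this.parents = parents.toArray(new Producer[0]);
            this.tables = new long[this.parents.length];
        }
        void run() {
            for (int i = 0; i < parents.length; i++) {
                if (parents[i] != null) {
                    tables[i] = parents[i].rows;   // stand-in for building a hash table
                }
            }
        }
    }

    public static void main(String[] args) {
        // Two small-table producers, wired under a temporary sink and run once.
        List<Producer> directWorks = Arrays.asList(new Producer("small_t1", 10), new Producer("small_t2", 20));
        Sink sink = new Sink(directWorks);
        sink.run();

        // Harvest the per-parent results by position, as loadDirectly does.
        long[] mapJoinTables = new long[sink.parents.length];
        for (int i = 0; i < sink.parents.length; i++) {
            if (sink.parents[i] != null) {
                mapJoinTables[i] = sink.tables[i];
            }
        }
        System.out.println(Arrays.toString(mapJoinTables)); // [10, 20]
    }
}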

Aggregations

Operator (org.apache.hadoop.hive.ql.exec.Operator): 130
ReduceSinkOperator (org.apache.hadoop.hive.ql.exec.ReduceSinkOperator): 98
TableScanOperator (org.apache.hadoop.hive.ql.exec.TableScanOperator): 91
JoinOperator (org.apache.hadoop.hive.ql.exec.JoinOperator): 77
ArrayList (java.util.ArrayList): 76
FileSinkOperator (org.apache.hadoop.hive.ql.exec.FileSinkOperator): 75
OperatorDesc (org.apache.hadoop.hive.ql.plan.OperatorDesc): 65
UnionOperator (org.apache.hadoop.hive.ql.exec.UnionOperator): 62
SMBMapJoinOperator (org.apache.hadoop.hive.ql.exec.SMBMapJoinOperator): 61
GroupByOperator (org.apache.hadoop.hive.ql.exec.GroupByOperator): 57
SelectOperator (org.apache.hadoop.hive.ql.exec.SelectOperator): 56
FilterOperator (org.apache.hadoop.hive.ql.exec.FilterOperator): 54
AbstractMapJoinOperator (org.apache.hadoop.hive.ql.exec.AbstractMapJoinOperator): 45
MapJoinOperator (org.apache.hadoop.hive.ql.exec.MapJoinOperator): 40
ExprNodeDesc (org.apache.hadoop.hive.ql.plan.ExprNodeDesc): 39
HashMap (java.util.HashMap): 36
RowSchema (org.apache.hadoop.hive.ql.exec.RowSchema): 36
LinkedHashMap (java.util.LinkedHashMap): 35
ColumnInfo (org.apache.hadoop.hive.ql.exec.ColumnInfo): 28
List (java.util.List): 22