Example 21 with CompilationOpContext

use of org.apache.hadoop.hive.ql.CompilationOpContext in project hive by apache.

the class QueryPlanTreeTransformation method applyCorrelation.

/**
   * Based on the correlation, we transform the query plan tree (operator tree).
   * Here, we first create a DemuxOperator, and all bottom ReduceSinkOperators
   * (bottom means near the TableScanOperator) in the correlation become
   * the parents of the DemuxOperator. We also reassign tags to those
   * ReduceSinkOperators. Then, we use MuxOperators to replace the
   * ReduceSinkOperators that are not bottom ones in this correlation.
   * Example: The original operator tree is ...
   *      JOIN2
   *      /    \
   *     RS4   RS5
   *    /        \
   *   GBY1     JOIN1
   *    |       /    \
   *   RS1     RS2   RS3
   * Suppose GBY1, JOIN1, and JOIN2 can be executed in the same reducer
   * (as determined by the Correlation Optimizer).
   * The new operator tree will then be ...
   *      JOIN2
   *        |
   *       MUX
   *      /   \
   *    GBY1  JOIN1
   *      \    /
   *       DEMUX
   *      /  |  \
   *     /   |   \
   *    /    |    \
   *   RS1   RS2   RS3
   * @param pCtx
   * @param corrCtx
   * @param correlation
   * @throws SemanticException
   */
protected static void applyCorrelation(ParseContext pCtx, CorrelationNodeProcCtx corrCtx, IntraQueryCorrelation correlation) throws SemanticException {
    final List<ReduceSinkOperator> bottomReduceSinkOperators = correlation.getBottomReduceSinkOperators();
    final int numReducers = correlation.getNumReducers();
    List<Operator<? extends OperatorDesc>> childrenOfDemux = new ArrayList<Operator<? extends OperatorDesc>>();
    List<Operator<? extends OperatorDesc>> parentRSsOfDemux = new ArrayList<Operator<? extends OperatorDesc>>();
    Map<Integer, Integer> childIndexToOriginalNumParents = new HashMap<Integer, Integer>();
    List<TableDesc> keysSerializeInfos = new ArrayList<TableDesc>();
    List<TableDesc> valuessSerializeInfos = new ArrayList<TableDesc>();
    Map<ReduceSinkOperator, Integer> bottomRSToNewTag = new HashMap<ReduceSinkOperator, Integer>();
    int newTag = 0;
    CompilationOpContext opCtx = null;
    for (ReduceSinkOperator rsop : bottomReduceSinkOperators) {
        if (opCtx == null) {
            opCtx = rsop.getCompilationOpContext();
        }
        rsop.getConf().setNumReducers(numReducers);
        bottomRSToNewTag.put(rsop, newTag);
        parentRSsOfDemux.add(rsop);
        keysSerializeInfos.add(rsop.getConf().getKeySerializeInfo());
        valuessSerializeInfos.add(rsop.getConf().getValueSerializeInfo());
        Operator<? extends OperatorDesc> child = CorrelationUtilities.getSingleChild(rsop, true);
        if (!childrenOfDemux.contains(child)) {
            childrenOfDemux.add(child);
            int childIndex = childrenOfDemux.size() - 1;
            childIndexToOriginalNumParents.put(childIndex, child.getNumParent());
        }
        newTag++;
    }
    for (ReduceSinkOperator rsop : bottomReduceSinkOperators) {
        setNewTag(correlation, childrenOfDemux, rsop, bottomRSToNewTag);
    }
    // Create the DemuxOperator.
    DemuxDesc demuxDesc = new DemuxDesc(correlation.getNewTagToOldTag(), correlation.getNewTagToChildIndex(), childIndexToOriginalNumParents, keysSerializeInfos, valuessSerializeInfos);
    Operator<? extends OperatorDesc> demuxOp = OperatorFactory.get(opCtx, demuxDesc);
    demuxOp.setChildOperators(childrenOfDemux);
    demuxOp.setParentOperators(parentRSsOfDemux);
    for (Operator<? extends OperatorDesc> child : childrenOfDemux) {
        List<Operator<? extends OperatorDesc>> parentsWithMultipleDemux = new ArrayList<Operator<? extends OperatorDesc>>();
        boolean hasBottomReduceSinkOperators = false;
        boolean hasNonBottomReduceSinkOperators = false;
        for (int i = 0; i < child.getParentOperators().size(); i++) {
            Operator<? extends OperatorDesc> p = child.getParentOperators().get(i);
            assert p instanceof ReduceSinkOperator;
            ReduceSinkOperator rsop = (ReduceSinkOperator) p;
            if (bottomReduceSinkOperators.contains(rsop)) {
                hasBottomReduceSinkOperators = true;
                parentsWithMultipleDemux.add(demuxOp);
            } else {
                hasNonBottomReduceSinkOperators = true;
                parentsWithMultipleDemux.add(rsop);
            }
        }
        if (hasBottomReduceSinkOperators && hasNonBottomReduceSinkOperators) {
            child.setParentOperators(parentsWithMultipleDemux);
        } else {
            child.setParentOperators(Utilities.makeList(demuxOp));
        }
    }
    for (Operator<? extends OperatorDesc> parent : parentRSsOfDemux) {
        parent.setChildOperators(Utilities.makeList(demuxOp));
    }
    // Replace all ReduceSinkOperators that are not at the bottom of
    // this correlation with MuxOperators.
    Set<ReduceSinkOperator> handledRSs = new HashSet<ReduceSinkOperator>();
    for (ReduceSinkOperator rsop : correlation.getAllReduceSinkOperators()) {
        if (!bottomReduceSinkOperators.contains(rsop)) {
            if (handledRSs.contains(rsop)) {
                continue;
            }
            Operator<? extends OperatorDesc> childOP = CorrelationUtilities.getSingleChild(rsop, true);
            if (childOP instanceof GroupByOperator) {
                CorrelationUtilities.removeReduceSinkForGroupBy(rsop, (GroupByOperator) childOP, pCtx, corrCtx);
                List<Operator<? extends OperatorDesc>> parentsOfMux = new ArrayList<Operator<? extends OperatorDesc>>();
                Operator<? extends OperatorDesc> parentOp = CorrelationUtilities.getSingleParent(childOP, true);
                parentsOfMux.add(parentOp);
                Operator<? extends OperatorDesc> mux = OperatorFactory.get(childOP.getCompilationOpContext(), new MuxDesc(parentsOfMux));
                mux.setChildOperators(Utilities.makeList(childOP));
                mux.setParentOperators(parentsOfMux);
                childOP.setParentOperators(Utilities.makeList(mux));
                parentOp.setChildOperators(Utilities.makeList(mux));
            } else {
                List<Operator<? extends OperatorDesc>> parentsOfMux = new ArrayList<Operator<? extends OperatorDesc>>();
                List<Operator<? extends OperatorDesc>> siblingOPs = CorrelationUtilities.findSiblingOperators(rsop);
                for (Operator<? extends OperatorDesc> op : siblingOPs) {
                    if (op instanceof DemuxOperator) {
                        parentsOfMux.add(op);
                    } else if (op instanceof ReduceSinkOperator) {
                        GroupByOperator pGBYm = CorrelationUtilities.getSingleParent(op, GroupByOperator.class);
                        if (pGBYm != null && pGBYm.getConf().getMode() == GroupByDesc.Mode.HASH) {
                            // We have a semijoin here;
                            // this map-side GroupByOperator needs to be removed.
                            CorrelationUtilities.removeOperator(pGBYm, op, CorrelationUtilities.getSingleParent(pGBYm, true), pCtx);
                        }
                        handledRSs.add((ReduceSinkOperator) op);
                        parentsOfMux.add(CorrelationUtilities.getSingleParent(op, true));
                    } else {
                        throw new SemanticException("A sibling of ReduceSinkOperator is neither a " + "DemuxOperator nor a ReduceSinkOperator");
                    }
                }
                MuxDesc muxDesc = new MuxDesc(siblingOPs);
                Operator<? extends OperatorDesc> mux = OperatorFactory.get(rsop.getCompilationOpContext(), muxDesc);
                mux.setChildOperators(Utilities.makeList(childOP));
                mux.setParentOperators(parentsOfMux);
                for (Operator<? extends OperatorDesc> op : parentsOfMux) {
                    if (op instanceof DemuxOperator) {
                        // op is a DemuxOperator; if it already has childOP as a
                        // child, replace childOP with the new mux.
                        if (op.getChildOperators().contains(childOP)) {
                            op.replaceChild(childOP, mux);
                        }
                    } else {
                        // op is not a DemuxOperator, so it should have
                        // a single child.
                        op.setChildOperators(Utilities.makeList(mux));
                    }
                }
                childOP.setParentOperators(Utilities.makeList(mux));
            }
        }
    }
    for (ReduceSinkOperator rsop : handledRSs) {
        rsop.setChildOperators(null);
        rsop.setParentOperators(null);
    }
}
Also used : ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) DemuxOperator(org.apache.hadoop.hive.ql.exec.DemuxOperator) GroupByOperator(org.apache.hadoop.hive.ql.exec.GroupByOperator) Operator(org.apache.hadoop.hive.ql.exec.Operator) GroupByOperator(org.apache.hadoop.hive.ql.exec.GroupByOperator) MuxDesc(org.apache.hadoop.hive.ql.plan.MuxDesc) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) DemuxDesc(org.apache.hadoop.hive.ql.plan.DemuxDesc) ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) CompilationOpContext(org.apache.hadoop.hive.ql.CompilationOpContext) TableDesc(org.apache.hadoop.hive.ql.plan.TableDesc) OperatorDesc(org.apache.hadoop.hive.ql.plan.OperatorDesc) DemuxOperator(org.apache.hadoop.hive.ql.exec.DemuxOperator) HashSet(java.util.HashSet) SemanticException(org.apache.hadoop.hive.ql.parse.SemanticException)
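
Worth noting in this example: the new DemuxOperator and MuxOperators are never given a fresh context. The CompilationOpContext is borrowed from an existing operator (the first bottom ReduceSinkOperator) and passed to OperatorFactory.get, so every operator in the transformed plan shares one context. The sketch below distills the mux-splicing step into a standalone helper; the class and method names are hypothetical, but the calls mirror the code above.

import java.util.List;

import org.apache.hadoop.hive.ql.exec.Operator;
import org.apache.hadoop.hive.ql.exec.OperatorFactory;
import org.apache.hadoop.hive.ql.exec.Utilities;
import org.apache.hadoop.hive.ql.plan.MuxDesc;
import org.apache.hadoop.hive.ql.plan.OperatorDesc;

public class MuxSpliceSketch {
    // Hypothetical helper: splice a new MuxOperator between parent and child.
    static Operator<? extends OperatorDesc> spliceMux(
            Operator<? extends OperatorDesc> parent,
            Operator<? extends OperatorDesc> child) {
        List<Operator<? extends OperatorDesc>> parentsOfMux = Utilities.makeList(parent);
        // Reuse the child's CompilationOpContext: OperatorFactory.get draws the
        // new operator's id from that shared context.
        Operator<? extends OperatorDesc> mux =
                OperatorFactory.get(child.getCompilationOpContext(), new MuxDesc(parentsOfMux));
        // Rewire the edges in both directions, as applyCorrelation does above.
        mux.setParentOperators(parentsOfMux);
        mux.setChildOperators(Utilities.makeList(child));
        parent.setChildOperators(Utilities.makeList(mux));
        child.setParentOperators(Utilities.makeList(mux));
        return mux;
    }
}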

Example 22 with CompilationOpContext

use of org.apache.hadoop.hive.ql.CompilationOpContext in project hive by apache.

the class SerializationUtilities method clonePlan.

/**
   * Clones using the powers of XML. Do not use unless necessary.
   * @param plan The plan.
   * @return The clone.
   */
public static MapredWork clonePlan(MapredWork plan) {
    // TODO: need proper clone. Meanwhile, let's at least keep this horror in one place
    PerfLogger perfLogger = SessionState.getPerfLogger();
    perfLogger.PerfLogBegin(CLASS_NAME, PerfLogger.CLONE_PLAN);
    Operator<?> op = plan.getAnyOperator();
    CompilationOpContext ctx = (op == null) ? null : op.getCompilationOpContext();
    ByteArrayOutputStream baos = new ByteArrayOutputStream(4096);
    serializePlan(plan, baos, true);
    MapredWork newPlan = deserializePlan(new ByteArrayInputStream(baos.toByteArray()), MapredWork.class, true);
    // Restore the context.
    for (Operator<?> newOp : newPlan.getAllOperators()) {
        newOp.setCompilationOpContext(ctx);
    }
    perfLogger.PerfLogEnd(CLASS_NAME, PerfLogger.CLONE_PLAN);
    return newPlan;
}
Also used : MapredWork(org.apache.hadoop.hive.ql.plan.MapredWork) ByteArrayInputStream(java.io.ByteArrayInputStream) CompilationOpContext(org.apache.hadoop.hive.ql.CompilationOpContext) PerfLogger(org.apache.hadoop.hive.ql.log.PerfLogger) ByteArrayOutputStream(java.io.ByteArrayOutputStream)
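
The restore loop above implies the context does not survive serialization, so re-attaching it is what keeps the clone usable inside the same compilation. A minimal usage sketch, with a hypothetical helper name, assuming the plan's operators were compiled under one shared context:

import org.apache.hadoop.hive.ql.exec.Operator;
import org.apache.hadoop.hive.ql.exec.SerializationUtilities;
import org.apache.hadoop.hive.ql.plan.MapredWork;

public class ClonePlanSketch {
    // Hypothetical helper: clone a plan so it can be mutated independently.
    static MapredWork cloneForTask(MapredWork plan) {
        MapredWork copy = SerializationUtilities.clonePlan(plan);
        // The clone's operators share the original CompilationOpContext,
        // because clonePlan restores it after deserialization (see above).
        Operator<?> orig = plan.getAnyOperator();
        assert orig == null
            || copy.getAnyOperator().getCompilationOpContext() == orig.getCompilationOpContext();
        return copy;
    }
}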

Example 23 with CompilationOpContext

use of org.apache.hadoop.hive.ql.CompilationOpContext in project hive by apache.

the class SerializationUtilities method cloneBaseWork.

/**
   * Clones using the powers of XML. Do not use unless necessary.
   * @param plan The plan.
   * @return The clone.
   */
public static BaseWork cloneBaseWork(BaseWork plan) {
    PerfLogger perfLogger = SessionState.getPerfLogger();
    perfLogger.PerfLogBegin(CLASS_NAME, PerfLogger.CLONE_PLAN);
    Operator<?> op = plan.getAnyRootOperator();
    CompilationOpContext ctx = (op == null) ? null : op.getCompilationOpContext();
    ByteArrayOutputStream baos = new ByteArrayOutputStream(4096);
    serializePlan(plan, baos, true);
    BaseWork newPlan = deserializePlan(new ByteArrayInputStream(baos.toByteArray()), plan.getClass(), true);
    // Restore the context.
    for (Operator<?> newOp : newPlan.getAllOperators()) {
        newOp.setCompilationOpContext(ctx);
    }
    perfLogger.PerfLogEnd(CLASS_NAME, PerfLogger.CLONE_PLAN);
    return newPlan;
}
Also used : ByteArrayInputStream(java.io.ByteArrayInputStream) CompilationOpContext(org.apache.hadoop.hive.ql.CompilationOpContext) PerfLogger(org.apache.hadoop.hive.ql.log.PerfLogger) ByteArrayOutputStream(java.io.ByteArrayOutputStream) BaseWork(org.apache.hadoop.hive.ql.plan.BaseWork)
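
cloneBaseWork differs from clonePlan mainly in scope: it copies a single BaseWork and preserves its concrete class, since plan.getClass() is passed to deserializePlan. A hypothetical helper built on that, deep-copying a list of work objects:

import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.hive.ql.exec.SerializationUtilities;
import org.apache.hadoop.hive.ql.plan.BaseWork;

public class CloneWorksSketch {
    // Hypothetical helper: deep-copy each BaseWork in a list.
    static List<BaseWork> cloneAll(List<BaseWork> works) {
        List<BaseWork> copies = new ArrayList<BaseWork>(works.size());
        for (BaseWork w : works) {
            // Each copy keeps its concrete type (MapWork, ReduceWork, ...),
            // and its operators keep the original CompilationOpContext.
            copies.add(SerializationUtilities.cloneBaseWork(w));
        }
        return copies;
    }
}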

Example 24 with CompilationOpContext

use of org.apache.hadoop.hive.ql.CompilationOpContext in project hive by apache.

the class TestHCatMultiOutputFormat method getTableData.

/**
   * Method to fetch table data
   *
   * @param table table name
   * @param database database
   * @return rows of the table, with columns comma-separated
   * @throws Exception if any error occurs
   */
private List<String> getTableData(String table, String database) throws Exception {
    QueryState queryState = new QueryState(null);
    HiveConf conf = queryState.getConf();
    conf.addResource("hive-site.xml");
    ArrayList<String> results = new ArrayList<String>();
    ArrayList<String> temp = new ArrayList<String>();
    Hive hive = Hive.get(conf);
    org.apache.hadoop.hive.ql.metadata.Table tbl = hive.getTable(database, table);
    FetchWork work;
    if (!tbl.getPartCols().isEmpty()) {
        List<Partition> partitions = hive.getPartitions(tbl);
        List<PartitionDesc> partDesc = new ArrayList<PartitionDesc>();
        List<Path> partLocs = new ArrayList<Path>();
        TableDesc tableDesc = Utilities.getTableDesc(tbl);
        for (Partition part : partitions) {
            partLocs.add(part.getDataLocation());
            partDesc.add(Utilities.getPartitionDescFromTableDesc(tableDesc, part, true));
        }
        work = new FetchWork(partLocs, partDesc, tableDesc);
        work.setLimit(100);
    } else {
        work = new FetchWork(tbl.getDataLocation(), Utilities.getTableDesc(tbl));
    }
    FetchTask task = new FetchTask();
    task.setWork(work);
    task.initialize(queryState, null, null, new CompilationOpContext());
    task.fetch(temp);
    for (String str : temp) {
        results.add(str.replace("\t", ","));
    }
    return results;
}
Also used : Path(org.apache.hadoop.fs.Path) Partition(org.apache.hadoop.hive.ql.metadata.Partition) ArrayList(java.util.ArrayList) QueryState(org.apache.hadoop.hive.ql.QueryState) FetchTask(org.apache.hadoop.hive.ql.exec.FetchTask) Hive(org.apache.hadoop.hive.ql.metadata.Hive) CompilationOpContext(org.apache.hadoop.hive.ql.CompilationOpContext) FetchWork(org.apache.hadoop.hive.ql.plan.FetchWork) HiveConf(org.apache.hadoop.hive.conf.HiveConf) PartitionDesc(org.apache.hadoop.hive.ql.plan.PartitionDesc) TableDesc(org.apache.hadoop.hive.ql.plan.TableDesc)
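
The CompilationOpContext detail here is in the initialize call: the test runs a FetchTask outside a full query compilation, so it supplies a brand-new context instead of borrowing one from an existing plan. A distilled sketch of that pattern, with a hypothetical helper name, assuming a configured QueryState and a FetchWork built as above:

import java.util.ArrayList;
import java.util.List;

import org.apache.hadoop.hive.ql.CompilationOpContext;
import org.apache.hadoop.hive.ql.QueryState;
import org.apache.hadoop.hive.ql.exec.FetchTask;
import org.apache.hadoop.hive.ql.plan.FetchWork;

public class StandaloneFetchSketch {
    // Hypothetical helper: run a FetchWork standalone and collect its rows.
    static List<String> fetchAll(QueryState queryState, FetchWork work) throws Exception {
        FetchTask task = new FetchTask();
        task.setWork(work);
        // No enclosing compilation supplies a context, so create a fresh one;
        // the fetch operators get their ids from it.
        task.initialize(queryState, null, null, new CompilationOpContext());
        List<String> rows = new ArrayList<String>();
        task.fetch(rows);
        return rows;
    }
}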

Example 25 with CompilationOpContext

use of org.apache.hadoop.hive.ql.CompilationOpContext in project hive by apache.

the class SerializationUtilities method cloneOperatorTree.

public static List<Operator<?>> cloneOperatorTree(List<Operator<?>> roots, int indexForTezUnion) {
    ByteArrayOutputStream baos = new ByteArrayOutputStream(4096);
    CompilationOpContext ctx = roots.isEmpty() ? null : roots.get(0).getCompilationOpContext();
    serializePlan(roots, baos, true);
    @SuppressWarnings("unchecked") List<Operator<?>> result = deserializePlan(new ByteArrayInputStream(baos.toByteArray()), roots.getClass(), true);
    // Restore the context.
    LinkedList<Operator<?>> newOps = new LinkedList<>(result);
    while (!newOps.isEmpty()) {
        Operator<?> newOp = newOps.poll();
        newOp.setIndexForTezUnion(indexForTezUnion);
        newOp.setCompilationOpContext(ctx);
        List<Operator<?>> children = newOp.getChildOperators();
        if (children != null) {
            newOps.addAll(children);
        }
    }
    return result;
}
Also used : VectorFileSinkOperator(org.apache.hadoop.hive.ql.exec.vector.VectorFileSinkOperator) ByteArrayInputStream(java.io.ByteArrayInputStream) CompilationOpContext(org.apache.hadoop.hive.ql.CompilationOpContext) ByteArrayOutputStream(java.io.ByteArrayOutputStream) LinkedList(java.util.LinkedList)
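
The breadth-first walk exists because only the roots are handed in: every operator reachable below them must receive the restored context as well. A hypothetical check that mirrors the traversal and could confirm, in a test, the invariant the clone methods maintain:

import java.util.LinkedList;
import java.util.List;
import java.util.Queue;

import org.apache.hadoop.hive.ql.CompilationOpContext;
import org.apache.hadoop.hive.ql.exec.Operator;

public class ContextInvariantSketch {
    // True if every operator reachable from roots shares the expected context.
    static boolean sharesSingleContext(List<Operator<?>> roots, CompilationOpContext expected) {
        Queue<Operator<?>> queue = new LinkedList<Operator<?>>(roots);
        while (!queue.isEmpty()) {
            Operator<?> op = queue.poll();
            if (op.getCompilationOpContext() != expected) {
                return false;
            }
            List<Operator<?>> children = op.getChildOperators();
            if (children != null) {
                // Operators with several parents are enqueued more than once;
                // harmless in an acyclic plan, just redundant checks.
                queue.addAll(children);
            }
        }
        return true;
    }
}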

Aggregations

CompilationOpContext (org.apache.hadoop.hive.ql.CompilationOpContext): 40 uses
ArrayList (java.util.ArrayList): 25 uses
JobConf (org.apache.hadoop.mapred.JobConf): 12 uses
FakeCaptureOutputOperator (org.apache.hadoop.hive.ql.exec.vector.util.FakeCaptureOutputOperator): 10 uses
GroupByDesc (org.apache.hadoop.hive.ql.plan.GroupByDesc): 10 uses
OperatorDesc (org.apache.hadoop.hive.ql.plan.OperatorDesc): 10 uses
VectorGroupByDesc (org.apache.hadoop.hive.ql.plan.VectorGroupByDesc): 10 uses
Operator (org.apache.hadoop.hive.ql.exec.Operator): 8 uses
HashMap (java.util.HashMap): 7 uses
LinkedHashMap (java.util.LinkedHashMap): 7 uses
ExprNodeDesc (org.apache.hadoop.hive.ql.plan.ExprNodeDesc): 7 uses
TableDesc (org.apache.hadoop.hive.ql.plan.TableDesc): 7 uses
ByteArrayOutputStream (java.io.ByteArrayOutputStream): 5 uses
HashSet (java.util.HashSet): 5 uses
Path (org.apache.hadoop.fs.Path): 5 uses
ByteArrayInputStream (java.io.ByteArrayInputStream): 4 uses
Set (java.util.Set): 4 uses
Configuration (org.apache.hadoop.conf.Configuration): 4 uses
MapWork (org.apache.hadoop.hive.ql.plan.MapWork): 4 uses
Test (org.junit.Test): 4 uses