Search in sources :

Example 1 with Cluster

use of org.apache.hadoop.hive.ql.optimizer.graph.OperatorGraph.Cluster in project hive by apache.

the class ParallelEdgeFixer method fixParallelEdges.

/**
 * Finds pairs of clusters that are connected by more than one operator edge and
 * hands the surplus edges to {@code fixParallelEdge}.
 *
 * @param og the operator graph whose clusters are inspected
 * @throws SemanticException declared by this method; presumably propagated from
 *         {@code fixParallelEdge} - confirm against that helper
 */
private void fixParallelEdges(OperatorGraph og) throws SemanticException {
    // Collect every (parent operator, child operator) pair that crosses a cluster
    // boundary, grouped by the (parent cluster, child cluster) edge it belongs to.
    ListValuedMap<Pair<Cluster, Cluster>, Pair<Operator<?>, Operator<?>>> edgesByClusterPair = new ArrayListValuedHashMap<>();
    for (Cluster childCluster : og.getClusters()) {
        for (Operator<?> child : childCluster.getMembers()) {
            for (Operator<? extends OperatorDesc> parent : child.getParentOperators()) {
                Cluster parentCluster = og.clusterOf(parent);
                // Edges inside a single cluster are not cluster-to-cluster edges.
                if (parentCluster != childCluster) {
                    edgesByClusterPair.put(new Pair<>(parentCluster, childCluster), new Pair<>(parent, child));
                }
            }
        }
    }
    // Any cluster pair with two or more operator pairs has parallel edges.
    for (Pair<Cluster, Cluster> clusterPair : edgesByClusterPair.keySet()) {
        List<Pair<Operator<?>, Operator<?>>> parallelEdges = edgesByClusterPair.get(clusterPair);
        if (parallelEdges.size() > 1) {
            // The processing order must be stable; otherwise plans become
            // non-deterministic and tests turn flaky.
            parallelEdges.sort(new OperatorPairComparator());
            // One edge (optionally an unsupported one) is taken out of the list and kept as is.
            removeOneEdge(parallelEdges);
            for (Pair<Operator<?>, Operator<?>> edge : parallelEdges) {
                fixParallelEdge(edge.left, edge.right);
            }
        }
    }
}
Also used : ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) MapJoinOperator(org.apache.hadoop.hive.ql.exec.MapJoinOperator) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) Operator(org.apache.hadoop.hive.ql.exec.Operator) Cluster(org.apache.hadoop.hive.ql.optimizer.graph.OperatorGraph.Cluster) ArrayListValuedHashMap(org.apache.commons.collections4.multimap.ArrayListValuedHashMap) Pair(org.apache.calcite.util.Pair)

Example 2 with Cluster

use of org.apache.hadoop.hive.ql.optimizer.graph.OperatorGraph.Cluster in project hive by apache.

the class DotExporter method write.

/**
 * Writes the operator graph to {@code outFile} as a Graphviz {@code digraph}.
 *
 * <p>Each cluster is emitted as a numbered {@code subgraph cluster_N} listing its
 * members, followed by one record-shaped node per operator and one edge per
 * successor. BROADCAST, DPP and SEMIJOIN edges get a colored label; every other
 * edge type is written without extra attributes.
 *
 * @param outFile the file to write to
 * @throws Exception if the file cannot be opened or a helper fails while the
 *         graph is being rendered
 */
public void write(File outFile) throws Exception {
    Map<Operator<?>, Cluster> nodeCluster = operatorGraph.nodeCluster;
    DagGraph<Operator<?>, OpEdge> g = operatorGraph.g;
    // try-with-resources: the writer is closed even if rendering throws midway,
    // so the file handle is never leaked.
    try (PrintWriter writer = new PrintWriter(outFile)) {
        writer.println("digraph G");
        writer.println("{\n");
        // De-duplicate: many operators map to the same cluster.
        HashSet<Cluster> clusters = new HashSet<>(nodeCluster.values());
        int idx = 0;
        for (Cluster cluster : clusters) {
            idx++;
            writer.printf("subgraph cluster_%d {\n", idx);
            for (Operator<?> member : cluster.members) {
                writer.printf("%s;\n", nodeName(member));
            }
            writer.printf("label = \"cluster %d\";\n", idx);
            writer.printf("}\n");
        }
        Set<Operator<?>> nodes = g.nodes();
        for (Operator<?> n : nodes) {
            writer.printf("%s[shape=record,label=\"%s\",%s];\n", nodeName(n), nodeLabel(n), style(n));
            Set<Operator<?>> succ = g.successors(n);
            for (Operator<?> s : succ) {
                // NOTE(review): assumes an edge exists for every successor reported by the graph.
                Optional<OpEdge> e = g.getEdge(n, s);
                String style = "";
                switch (e.get().getEdgeType()) {
                    case BROADCAST:
                        style = "[color=blue,label=\"BROADCAST\"]";
                        break;
                    case DPP:
                        style = "[color=green,label=\"DPP\"]";
                        break;
                    case SEMIJOIN:
                        style = "[color=red,label=\"SEMIJOIN\"]";
                        break;
                    default:
                        // Any other edge type is drawn as a plain, unlabeled edge.
                        break;
                }
                writer.printf("%s->%s%s;\n", nodeName(n), nodeName(s), style);
            }
        }
        writer.println("}\n");
    }
}
Also used : FilterOperator(org.apache.hadoop.hive.ql.exec.FilterOperator) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) Operator(org.apache.hadoop.hive.ql.exec.Operator) OpEdge(org.apache.hadoop.hive.ql.optimizer.graph.OperatorGraph.OpEdge) Cluster(org.apache.hadoop.hive.ql.optimizer.graph.OperatorGraph.Cluster) PrintWriter(java.io.PrintWriter) HashSet(java.util.HashSet)

Example 3 with Cluster

use of org.apache.hadoop.hive.ql.optimizer.graph.OperatorGraph.Cluster in project hive by apache.

the class SharedWorkOptimizer method validPreConditions.

/**
 * Checks whether the first retainable and the first discardable operator of
 * {@code sr} may be merged without breaking the resulting DAG.
 *
 * @param pctx parse context; supplies the configuration and is used to build the operator graph
 * @param optimizerCache cache passed through to the work-operator lookups
 * @param sr the candidate merge (data sizes, retainable/discardable operators, discardable input operators)
 * @return {@code true} if every precondition below holds, {@code false} as soon as one fails
 */
private static boolean validPreConditions(ParseContext pctx, SharedWorkOptimizerCache optimizerCache, SharedResult sr) {
    // TODO: Currently ignores GBY and PTF which may also buffer data in memory.
    if (sr.dataSize > sr.maxDataSize) {
        // Size surpasses limit, we cannot convert
        LOG.debug("accumulated data size: {} / max size: {}", sr.dataSize, sr.maxDataSize);
        return false;
    }
    Operator<?> op1 = sr.retainableOps.get(0);
    Operator<?> op2 = sr.discardableOps.get(0);
    // 1) The set of operators in the works that we are merging need to meet
    // some requirements. In particular:
    // 1.1. None of the works that we are merging can contain a Union
    // operator. This is not supported yet as we might end up with cycles in
    // the Tez DAG.
    // 1.2. There cannot be any DummyStore operator in the works being merged.
    // This is due to an assumption in MergeJoinProc that needs to be further explored.
    // This is also due to some assumption in task generation
    // If any of these conditions are not met, we cannot merge.
    // TODO: Extend rule so it can be applied for these cases.
    final Set<Operator<?>> workOps1 = findWorkOperators(optimizerCache, op1);
    final Set<Operator<?>> workOps2 = findWorkOperators(optimizerCache, op2);
    for (Operator<?> op : workOps1) {
        if (op instanceof UnionOperator || op instanceof DummyStoreOperator) {
            // We cannot merge (1.1 / 1.2)
            return false;
        }
    }
    for (Operator<?> op : workOps2) {
        if (op instanceof UnionOperator || op instanceof DummyStoreOperator) {
            // We cannot merge (1.1 / 1.2)
            return false;
        }
    }
    // 2) We check whether one of the operators is part of a work that is an input for
    // the work of the other operator.
    // 
    // Work1            (merge TS in W1 & W3)        Work1
    // |                        ->                   |        X
    // Work2                                         Work2
    // |                                             |
    // Work3                                         Work1
    // 
    // If we do, we cannot merge, as we would end up with a cycle in the DAG.
    final Set<Operator<?>> descendantWorksOps1 = findDescendantWorkOperators(pctx, optimizerCache, op1, sr.discardableInputOps);
    final Set<Operator<?>> descendantWorksOps2 = findDescendantWorkOperators(pctx, optimizerCache, op2, sr.discardableInputOps);
    if (!Collections.disjoint(descendantWorksOps1, workOps2) || !Collections.disjoint(workOps1, descendantWorksOps2)) {
        return false;
    }
    // 3) We check whether output works when we merge the operators will collide.
    // 
    // Work1   Work2    (merge TS in W1 & W2)        Work1
    // \   /                  ->                  | |       X
    // Work3                                     Work3
    // 
    // If we do, we cannot merge. The reason is that Tez currently does
    // not support parallel edges, i.e., multiple edges from same work x
    // into same work y.
    RelaxedVertexEdgePredicate edgePredicate;
    if (pctx.getConf().getBoolVar(ConfVars.HIVE_SHARED_WORK_PARALLEL_EDGE_SUPPORT)) {
        edgePredicate = new RelaxedVertexEdgePredicate(EnumSet.<EdgeType>of(EdgeType.DPP, EdgeType.SEMIJOIN, EdgeType.BROADCAST));
    } else {
        edgePredicate = new RelaxedVertexEdgePredicate(EnumSet.<EdgeType>of(EdgeType.DPP));
    }
    OperatorGraph og = new OperatorGraph(pctx);
    Set<OperatorGraph.Cluster> cc1 = og.clusterOf(op1).childClusters(edgePredicate);
    Set<OperatorGraph.Cluster> cc2 = og.clusterOf(op2).childClusters(edgePredicate);
    if (!Collections.disjoint(cc1, cc2)) {
        // Placeholders are required; without them the operator arguments are silently dropped.
        LOG.debug("merging {} and {} would create an unsupported parallel edge(CHILDS)", op1, op2);
        return false;
    }
    if (!og.mayMerge(op1, op2)) {
        LOG.debug("merging {} and {} would violate dag properties", op1, op2);
        return false;
    }
    // 4) We check whether we will end up with the same operators feeding into the same work.
    // 
    // Work1        (merge TS in W2 & W3)        Work1
    // /   \                  ->                  | |       X
    // Work2   Work3                                 Work2
    // 
    // If we do, we cannot merge. The reason is the same as above, currently
    // Tez does not support parallel edges.
    // 
    // In the check, we exclude the inputs to the root operator that we are trying
    // to merge (only useful for extended merging as TS do not have inputs).
    Set<OperatorGraph.Cluster> pc1 = og.clusterOf(op1).parentClusters(edgePredicate);
    Set<OperatorGraph.Cluster> pc2 = og.clusterOf(op2).parentClusters(edgePredicate);
    // Copy the intersection into a mutable set so the excluded clusters can be removed.
    Set<Cluster> pc = new HashSet<>(Sets.intersection(pc1, pc2));
    for (Operator<?> o : sr.discardableOps.get(0).getParentOperators()) {
        pc.remove(og.clusterOf(o));
    }
    for (Operator<?> o : sr.discardableInputOps) {
        pc.remove(og.clusterOf(o));
    }
    if (!pc.isEmpty()) {
        // Placeholders are required; without them the operator arguments are silently dropped.
        LOG.debug("merging {} and {} would create an unsupported parallel edge(PARENTS)", op1, op2);
        return false;
    }
    return true;
}
Also used : ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) MapJoinOperator(org.apache.hadoop.hive.ql.exec.MapJoinOperator) UnionOperator(org.apache.hadoop.hive.ql.exec.UnionOperator) FilterOperator(org.apache.hadoop.hive.ql.exec.FilterOperator) AppMasterEventOperator(org.apache.hadoop.hive.ql.exec.AppMasterEventOperator) JoinOperator(org.apache.hadoop.hive.ql.exec.JoinOperator) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) Operator(org.apache.hadoop.hive.ql.exec.Operator) DummyStoreOperator(org.apache.hadoop.hive.ql.exec.DummyStoreOperator) UnionOperator(org.apache.hadoop.hive.ql.exec.UnionOperator) DummyStoreOperator(org.apache.hadoop.hive.ql.exec.DummyStoreOperator) Cluster(org.apache.hadoop.hive.ql.optimizer.graph.OperatorGraph.Cluster) OperatorGraph(org.apache.hadoop.hive.ql.optimizer.graph.OperatorGraph) EdgeType(org.apache.hadoop.hive.ql.optimizer.graph.OperatorGraph.EdgeType) HashSet(java.util.HashSet) LinkedHashSet(java.util.LinkedHashSet)

Aggregations

Operator (org.apache.hadoop.hive.ql.exec.Operator)3 TableScanOperator (org.apache.hadoop.hive.ql.exec.TableScanOperator)3 Cluster (org.apache.hadoop.hive.ql.optimizer.graph.OperatorGraph.Cluster)3 HashSet (java.util.HashSet)2 FilterOperator (org.apache.hadoop.hive.ql.exec.FilterOperator)2 MapJoinOperator (org.apache.hadoop.hive.ql.exec.MapJoinOperator)2 ReduceSinkOperator (org.apache.hadoop.hive.ql.exec.ReduceSinkOperator)2 PrintWriter (java.io.PrintWriter)1 LinkedHashSet (java.util.LinkedHashSet)1 Pair (org.apache.calcite.util.Pair)1 ArrayListValuedHashMap (org.apache.commons.collections4.multimap.ArrayListValuedHashMap)1 AppMasterEventOperator (org.apache.hadoop.hive.ql.exec.AppMasterEventOperator)1 DummyStoreOperator (org.apache.hadoop.hive.ql.exec.DummyStoreOperator)1 JoinOperator (org.apache.hadoop.hive.ql.exec.JoinOperator)1 UnionOperator (org.apache.hadoop.hive.ql.exec.UnionOperator)1 OperatorGraph (org.apache.hadoop.hive.ql.optimizer.graph.OperatorGraph)1 EdgeType (org.apache.hadoop.hive.ql.optimizer.graph.OperatorGraph.EdgeType)1 OpEdge (org.apache.hadoop.hive.ql.optimizer.graph.OperatorGraph.OpEdge)1