Search in sources :

Example 1 with OperatorGraph

Use of org.apache.hadoop.hive.ql.optimizer.graph.OperatorGraph in the Apache Hive project.

Class SharedWorkOptimizer, method validPreConditions.

/**
 * Checks whether the first retainable operator and the first discardable operator of
 * {@code sr} can be merged without violating the constraints listed below.
 *
 * <p>The checks are performed in this order, and the first one that fails makes the
 * method return {@code false}:
 * <ol>
 *   <li>the accumulated data size must not exceed the maximum data size;</li>
 *   <li>neither work may contain a {@link UnionOperator} or a {@link DummyStoreOperator};</li>
 *   <li>neither work may be among the descendant works of the other (would create a cycle);</li>
 *   <li>the two works must not share a child cluster (would create a parallel edge);</li>
 *   <li>{@link OperatorGraph#mayMerge} must accept the pair;</li>
 *   <li>the two works must not share a parent cluster, other than the clusters of the
 *       discardable operator's own parents and of {@code sr.discardableInputOps}.</li>
 * </ol>
 *
 * @param pctx parse context; supplies the configuration and is used to build the operator graph
 * @param optimizerCache cache handed to the work-operator lookup helpers
 * @param sr candidate merge; its {@code retainableOps} and {@code discardableOps} lists must
 *           each have at least one element
 * @return {@code true} if every check passes, {@code false} otherwise
 */
private static boolean validPreConditions(ParseContext pctx, SharedWorkOptimizerCache optimizerCache, SharedResult sr) {
    // TODO: Currently ignores GBY and PTF which may also buffer data in memory.
    if (sr.dataSize > sr.maxDataSize) {
        // Size surpasses limit, we cannot convert
        LOG.debug("accumulated data size: {} / max size: {}", sr.dataSize, sr.maxDataSize);
        return false;
    }
    Operator<?> op1 = sr.retainableOps.get(0);
    Operator<?> op2 = sr.discardableOps.get(0);
    // 1) The set of operators in the works that we are merging need to meet
    // some requirements. In particular:
    // 1.1. None of the works that we are merging can contain a Union
    // operator. This is not supported yet as we might end up with cycles in
    // the Tez DAG.
    // 1.2. There cannot be any DummyStore operator in the works being merged.
    // This is due to an assumption in MergeJoinProc that needs to be further explored.
    // This is also due to some assumption in task generation
    // If any of these conditions are not met, we cannot merge.
    // TODO: Extend rule so it can be applied for these cases.
    final Set<Operator<?>> workOps1 = findWorkOperators(optimizerCache, op1);
    final Set<Operator<?>> workOps2 = findWorkOperators(optimizerCache, op2);
    for (Operator<?> op : workOps1) {
        if (op instanceof UnionOperator || op instanceof DummyStoreOperator) {
            // We cannot merge (1.1 / 1.2)
            return false;
        }
    }
    for (Operator<?> op : workOps2) {
        if (op instanceof UnionOperator || op instanceof DummyStoreOperator) {
            // We cannot merge (1.1 / 1.2)
            return false;
        }
    }
    // 2) We check whether one of the operators is part of a work that is an input for
    // the work of the other operator.
    //
    // Work1            (merge TS in W1 & W3)        Work1
    // |                        ->                   |        X
    // Work2                                         Work2
    // |                                             |
    // Work3                                         Work1
    //
    // If we do, we cannot merge, as we would end up with a cycle in the DAG.
    final Set<Operator<?>> descendantWorksOps1 = findDescendantWorkOperators(pctx, optimizerCache, op1, sr.discardableInputOps);
    final Set<Operator<?>> descendantWorksOps2 = findDescendantWorkOperators(pctx, optimizerCache, op2, sr.discardableInputOps);
    if (!Collections.disjoint(descendantWorksOps1, workOps2) || !Collections.disjoint(workOps1, descendantWorksOps2)) {
        return false;
    }
    // 3) We check whether output works when we merge the operators will collide.
    //
    // Work1   Work2    (merge TS in W1 & W2)        Work1
    // \   /                  ->                  | |       X
    // Work3                                     Work3
    //
    // If we do, we cannot merge. The reason is that Tez currently does
    // not support parallel edges, i.e., multiple edges from same work x
    // into same work y.
    final EnumSet<EdgeType> relaxedEdgeTypes =
        pctx.getConf().getBoolVar(ConfVars.HIVE_SHARED_WORK_PARALLEL_EDGE_SUPPORT)
            ? EnumSet.<EdgeType>of(EdgeType.DPP, EdgeType.SEMIJOIN, EdgeType.BROADCAST)
            : EnumSet.<EdgeType>of(EdgeType.DPP);
    final RelaxedVertexEdgePredicate edgePredicate = new RelaxedVertexEdgePredicate(relaxedEdgeTypes);
    OperatorGraph og = new OperatorGraph(pctx);
    Set<OperatorGraph.Cluster> cc1 = og.clusterOf(op1).childClusters(edgePredicate);
    Set<OperatorGraph.Cluster> cc2 = og.clusterOf(op2).childClusters(edgePredicate);
    if (!Collections.disjoint(cc1, cc2)) {
        // The message must carry placeholders, otherwise the operators are silently dropped.
        LOG.debug("merging {} and {} would create an unsupported parallel edge(CHILDS)", op1, op2);
        return false;
    }
    if (!og.mayMerge(op1, op2)) {
        LOG.debug("merging {} and {} would violate dag properties", op1, op2);
        return false;
    }
    // 4) We check whether we will end up with same operators inputting on same work.
    //
    // Work1        (merge TS in W2 & W3)        Work1
    // /   \                  ->                  | |       X
    // Work2   Work3                                 Work2
    //
    // If we do, we cannot merge. The reason is the same as above, currently
    // Tez does not support parallel edges.
    //
    // In the check, we exclude the inputs to the root operator that we are trying
    // to merge (only useful for extended merging as TS do not have inputs).
    Set<OperatorGraph.Cluster> pc1 = og.clusterOf(op1).parentClusters(edgePredicate);
    Set<OperatorGraph.Cluster> pc2 = og.clusterOf(op2).parentClusters(edgePredicate);
    // Copy the intersection so that entries can be removed from it below.
    Set<Cluster> pc = new HashSet<>(Sets.intersection(pc1, pc2));
    // op2 is sr.discardableOps.get(0), i.e. the root operator being discarded.
    for (Operator<?> o : op2.getParentOperators()) {
        pc.remove(og.clusterOf(o));
    }
    for (Operator<?> o : sr.discardableInputOps) {
        pc.remove(og.clusterOf(o));
    }
    if (!pc.isEmpty()) {
        LOG.debug("merging {} and {} would create an unsupported parallel edge(PARENTS)", op1, op2);
        return false;
    }
    return true;
}
Also used : ReduceSinkOperator(org.apache.hadoop.hive.ql.exec.ReduceSinkOperator) MapJoinOperator(org.apache.hadoop.hive.ql.exec.MapJoinOperator) UnionOperator(org.apache.hadoop.hive.ql.exec.UnionOperator) FilterOperator(org.apache.hadoop.hive.ql.exec.FilterOperator) AppMasterEventOperator(org.apache.hadoop.hive.ql.exec.AppMasterEventOperator) JoinOperator(org.apache.hadoop.hive.ql.exec.JoinOperator) TableScanOperator(org.apache.hadoop.hive.ql.exec.TableScanOperator) Operator(org.apache.hadoop.hive.ql.exec.Operator) DummyStoreOperator(org.apache.hadoop.hive.ql.exec.DummyStoreOperator) UnionOperator(org.apache.hadoop.hive.ql.exec.UnionOperator) DummyStoreOperator(org.apache.hadoop.hive.ql.exec.DummyStoreOperator) Cluster(org.apache.hadoop.hive.ql.optimizer.graph.OperatorGraph.Cluster) OperatorGraph(org.apache.hadoop.hive.ql.optimizer.graph.OperatorGraph) EdgeType(org.apache.hadoop.hive.ql.optimizer.graph.OperatorGraph.EdgeType) HashSet(java.util.HashSet) LinkedHashSet(java.util.LinkedHashSet)

Aggregations

HashSet (java.util.HashSet)1 LinkedHashSet (java.util.LinkedHashSet)1 AppMasterEventOperator (org.apache.hadoop.hive.ql.exec.AppMasterEventOperator)1 DummyStoreOperator (org.apache.hadoop.hive.ql.exec.DummyStoreOperator)1 FilterOperator (org.apache.hadoop.hive.ql.exec.FilterOperator)1 JoinOperator (org.apache.hadoop.hive.ql.exec.JoinOperator)1 MapJoinOperator (org.apache.hadoop.hive.ql.exec.MapJoinOperator)1 Operator (org.apache.hadoop.hive.ql.exec.Operator)1 ReduceSinkOperator (org.apache.hadoop.hive.ql.exec.ReduceSinkOperator)1 TableScanOperator (org.apache.hadoop.hive.ql.exec.TableScanOperator)1 UnionOperator (org.apache.hadoop.hive.ql.exec.UnionOperator)1 OperatorGraph (org.apache.hadoop.hive.ql.optimizer.graph.OperatorGraph)1 Cluster (org.apache.hadoop.hive.ql.optimizer.graph.OperatorGraph.Cluster)1 EdgeType (org.apache.hadoop.hive.ql.optimizer.graph.OperatorGraph.EdgeType)1