Use of org.apache.hadoop.hive.ql.optimizer.graph.OperatorGraph.Cluster in the Apache Hive project.
Class ParallelEdgeFixer, method fixParallelEdges.
/**
 * Finds cluster pairs that are connected by more than one operator link and rewrites the
 * surplus links.
 *
 * <p>Every parent/child operator link that crosses a cluster boundary is recorded under its
 * (parent cluster, child cluster) key. For each key that holds two or more links, the links are
 * sorted with {@code OperatorPairComparator}, passed to {@code removeOneEdge}, and then every
 * link still present in the list is passed to {@code fixParallelEdge}.
 *
 * @param og operator graph whose clusters and members are inspected
 * @throws SemanticException declared by this method; presumably raised while an edge is being
 *         rewritten (the helpers are not visible here)
 */
private void fixParallelEdges(OperatorGraph og) throws SemanticException {
  // Collect all operator links whose two ends live in different clusters,
  // grouped by the (parent cluster, child cluster) pair they connect.
  ListValuedMap<Pair<Cluster, Cluster>, Pair<Operator<?>, Operator<?>>> crossClusterLinks =
      new ArrayListValuedHashMap<>();
  for (Cluster childCluster : og.getClusters()) {
    for (Operator<?> child : childCluster.getMembers()) {
      for (Operator<? extends OperatorDesc> parent : child.getParentOperators()) {
        Cluster parentCluster = og.clusterOf(parent);
        if (parentCluster != childCluster) {
          crossClusterLinks.put(new Pair<>(parentCluster, childCluster), new Pair<>(parent, child));
        }
      }
    }
  }

  // Only cluster pairs joined by two or more links are "parallel" and need fixing.
  for (Pair<Cluster, Cluster> clusterPair : crossClusterLinks.keySet()) {
    List<Pair<Operator<?>, Operator<?>>> links = crossClusterLinks.get(clusterPair);
    if (links.size() > 1) {
      // The processing order must be stable; otherwise the generated plans
      // (and the tests comparing them) become flaky.
      links.sort(new OperatorPairComparator());
      // One edge may be taken out of the list here; it is then kept as is.
      removeOneEdge(links);
      for (Pair<Operator<?>, Operator<?>> link : links) {
        fixParallelEdge(link.left, link.right);
      }
    }
  }
}
Use of org.apache.hadoop.hive.ql.optimizer.graph.OperatorGraph.Cluster in the Apache Hive project.
Class DotExporter, method write.
/**
 * Writes the operator graph to {@code outFile} as a Graphviz {@code digraph}.
 *
 * <p>The output contains one {@code subgraph cluster_N} per distinct cluster found in
 * {@code operatorGraph.nodeCluster}, followed by one record-shaped node per operator and one
 * edge per successor relation. Edges of type BROADCAST, DPP and SEMIJOIN are coloured and
 * labelled; every other edge type is emitted without attributes.
 *
 * @param outFile destination file; it is opened with {@link PrintWriter#PrintWriter(File)}
 * @throws Exception if the file cannot be opened, or if any of the graph accessors or the
 *         naming/labelling helpers fail
 */
public void write(File outFile) throws Exception {
  Map<Operator<?>, Cluster> nodeCluster = operatorGraph.nodeCluster;
  DagGraph<Operator<?>, OpEdge> g = operatorGraph.g;
  // try-with-resources guarantees the file handle is released even when one of the
  // helper calls below throws part-way through the export.
  try (PrintWriter writer = new PrintWriter(outFile)) {
    writer.println("digraph G");
    writer.println("{\n");

    // Emit every distinct cluster as its own numbered subgraph.
    HashSet<Cluster> clusters = new HashSet<>(nodeCluster.values());
    int idx = 0;
    for (Cluster cluster : clusters) {
      idx++;
      writer.printf("subgraph cluster_%d {\n", idx);
      for (Operator<?> member : cluster.members) {
        writer.printf("%s;\n", nodeName(member));
      }
      writer.printf("label = \"cluster %d\";\n", idx);
      writer.printf("}\n");
    }

    // Emit each node followed by its outgoing edges.
    Set<Operator<?>> nodes = g.nodes();
    for (Operator<?> n : nodes) {
      writer.printf("%s[shape=record,label=\"%s\",%s];\n", nodeName(n), nodeLabel(n), style(n));
      Set<Operator<?>> succ = g.successors(n);
      for (Operator<?> s : succ) {
        // s was reported as a successor of n, so the edge is expected to be present.
        Optional<OpEdge> e = g.getEdge(n, s);
        String style = "";
        switch (e.get().getEdgeType()) {
        case BROADCAST:
          style = "[color=blue,label=\"BROADCAST\"]";
          break;
        case DPP:
          style = "[color=green,label=\"DPP\"]";
          break;
        case SEMIJOIN:
          style = "[color=red,label=\"SEMIJOIN\"]";
          break;
        default:
          // Any other edge type is drawn without extra attributes.
          break;
        }
        writer.printf("%s->%s%s;\n", nodeName(n), nodeName(s), style);
      }
    }
    writer.println("}\n");
  }
}
Use of org.apache.hadoop.hive.ql.optimizer.graph.OperatorGraph.Cluster in the Apache Hive project.
Class SharedWorkOptimizer, method validPreConditions.
/**
 * Decides whether the first retainable and first discardable operator of {@code sr} may be
 * merged without breaking the constraints checked below.
 *
 * <p>The merge is rejected when any of the following holds:
 * <ol>
 *   <li>the accumulated data size in {@code sr} exceeds its maximum;</li>
 *   <li>either work contains a {@code UnionOperator} or a {@code DummyStoreOperator};</li>
 *   <li>one work is among the descendant works of the other (would create a cycle);</li>
 *   <li>the two clusters share a child cluster, {@code og.mayMerge} refuses the merge, or they
 *       share a parent cluster that is not excluded (would create a parallel edge).</li>
 * </ol>
 *
 * @param pctx parse context; used for configuration lookup and to build the operator graph
 * @param optimizerCache cache passed through to the work-operator lookups
 * @param sr candidate merge; reads {@code dataSize}, {@code maxDataSize},
 *        {@code retainableOps}, {@code discardableOps} and {@code discardableInputOps}
 * @return {@code true} if none of the rejecting conditions applies, {@code false} otherwise
 */
private static boolean validPreConditions(ParseContext pctx, SharedWorkOptimizerCache optimizerCache, SharedResult sr) {
  // TODO: Currently ignores GBY and PTF which may also buffer data in memory.
  if (sr.dataSize > sr.maxDataSize) {
    // Size surpasses limit, we cannot convert
    LOG.debug("accumulated data size: {} / max size: {}", sr.dataSize, sr.maxDataSize);
    return false;
  }

  Operator<?> op1 = sr.retainableOps.get(0);
  Operator<?> op2 = sr.discardableOps.get(0);

  // 1) The set of operators in the works that we are merging need to meet
  // some requirements. In particular:
  // 1.1. None of the works that we are merging can contain a Union
  // operator. This is not supported yet as we might end up with cycles in
  // the Tez DAG.
  // 1.2. There cannot be any DummyStore operator in the works being merged.
  // This is due to an assumption in MergeJoinProc that needs to be further explored.
  // This is also due to some assumption in task generation
  // If any of these conditions are not met, we cannot merge.
  // TODO: Extend rule so it can be applied for these cases.
  final Set<Operator<?>> workOps1 = findWorkOperators(optimizerCache, op1);
  final Set<Operator<?>> workOps2 = findWorkOperators(optimizerCache, op2);
  if (containsUnionOrDummyStore(workOps1) || containsUnionOrDummyStore(workOps2)) {
    // We cannot merge (1.1 / 1.2)
    return false;
  }

  // 2) We check whether one of the operators is part of a work that is an input for
  // the work of the other operator.
  //
  //   Work1            (merge TS in W1 & W3)        Work1
  //     |                        ->                   |        X
  //   Work2                                         Work2
  //     |                                             |
  //   Work3                                         Work1
  //
  // If we do, we cannot merge, as we would end up with a cycle in the DAG.
  final Set<Operator<?>> descendantWorksOps1 = findDescendantWorkOperators(pctx, optimizerCache, op1, sr.discardableInputOps);
  final Set<Operator<?>> descendantWorksOps2 = findDescendantWorkOperators(pctx, optimizerCache, op2, sr.discardableInputOps);
  if (!Collections.disjoint(descendantWorksOps1, workOps2) || !Collections.disjoint(workOps1, descendantWorksOps2)) {
    return false;
  }

  // 3) We check whether output works when we merge the operators will collide.
  //
  //   Work1   Work2    (merge TS in W1 & W2)        Work1
  //       \   /                  ->                  | |       X
  //       Work3                                     Work3
  //
  // If we do, we cannot merge. The reason is that Tez currently does
  // not support parallel edges, i.e., multiple edges from same work x
  // into same work y.
  RelaxedVertexEdgePredicate edgePredicate;
  if (pctx.getConf().getBoolVar(ConfVars.HIVE_SHARED_WORK_PARALLEL_EDGE_SUPPORT)) {
    edgePredicate = new RelaxedVertexEdgePredicate(EnumSet.<EdgeType>of(EdgeType.DPP, EdgeType.SEMIJOIN, EdgeType.BROADCAST));
  } else {
    edgePredicate = new RelaxedVertexEdgePredicate(EnumSet.<EdgeType>of(EdgeType.DPP));
  }
  OperatorGraph og = new OperatorGraph(pctx);
  Set<OperatorGraph.Cluster> cc1 = og.clusterOf(op1).childClusters(edgePredicate);
  Set<OperatorGraph.Cluster> cc2 = og.clusterOf(op2).childClusters(edgePredicate);
  if (!Collections.disjoint(cc1, cc2)) {
    // Placeholders are required, otherwise SLF4J silently drops op1/op2 from the message.
    LOG.debug("merging {} and {} would create an unsupported parallel edge(CHILDS)", op1, op2);
    return false;
  }
  if (!og.mayMerge(op1, op2)) {
    LOG.debug("merging {} and {} would violate dag properties", op1, op2);
    return false;
  }

  // 4) We check whether we will end up with same operators inputing on same work.
  //
  //       Work1        (merge TS in W2 & W3)        Work1
  //       /   \                  ->                  | |       X
  //   Work2   Work3                                 Work2
  //
  // If we do, we cannot merge. The reason is the same as above, currently
  // Tez does not support parallel edges.
  //
  // In the check, we exclude the inputs to the root operator that we are trying
  // to merge (only useful for extended merging as TS do not have inputs).
  Set<OperatorGraph.Cluster> pc1 = og.clusterOf(op1).parentClusters(edgePredicate);
  Set<OperatorGraph.Cluster> pc2 = og.clusterOf(op2).parentClusters(edgePredicate);
  // Copy the intersection so that the exclusions below can be removed from it.
  Set<OperatorGraph.Cluster> pc = new HashSet<>(Sets.intersection(pc1, pc2));
  // op2 is sr.discardableOps.get(0), fetched at the top of the method.
  for (Operator<?> o : op2.getParentOperators()) {
    pc.remove(og.clusterOf(o));
  }
  for (Operator<?> o : sr.discardableInputOps) {
    pc.remove(og.clusterOf(o));
  }
  if (!pc.isEmpty()) {
    LOG.debug("merging {} and {} would create an unsupported parallel edge(PARENTS)", op1, op2);
    return false;
  }
  return true;
}

/** Returns true if the given work contains a Union or a DummyStore operator. */
private static boolean containsUnionOrDummyStore(Set<Operator<?>> workOps) {
  for (Operator<?> op : workOps) {
    if (op instanceof UnionOperator || op instanceof DummyStoreOperator) {
      return true;
    }
  }
  return false;
}
Aggregations