Use of org.apache.crunch.impl.mr.collect.PGroupedTableImpl in project crunch by cloudera.
The class MSCRPlanner, method plan.
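In this method, PGroupedTableImpl marks the groupByKey boundaries of a pipeline: plan() repeatedly peels off the GBKs that have no unprocessed GBK upstream, assigns each one a MapReduce JobPrototype, wires up inter-job dependencies, turns any leftover node paths into map-only jobs, and finally bundles every prototype into an MRExecutor.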
public MRExecutor plan(Class<?> jarClass, Configuration conf) throws IOException {
  // Constructs all of the node paths, which either start w/an input
  // or a GBK and terminate in an output collection of any type.
  NodeVisitor visitor = new NodeVisitor();
  for (PCollectionImpl<?> output : outputs.keySet()) {
    visitor.visitOutput(output);
  }

  // Pull out the node paths.
  Map<PCollectionImpl<?>, Set<NodePath>> nodePaths = visitor.getNodePaths();

  // Keeps track of the dependencies from collections -> jobs and then
  // between different jobs.
  Map<PCollectionImpl<?>, JobPrototype> assignments = Maps.newHashMap();
  Map<PCollectionImpl<?>, Set<JobPrototype>> jobDependencies =
      new HashMap<PCollectionImpl<?>, Set<JobPrototype>>();

  // Find the set of GBKs that DO NOT depend on any other GBK.
  Set<PGroupedTableImpl<?, ?>> workingGroupings = null;
  while (!(workingGroupings = getWorkingGroupings(nodePaths)).isEmpty()) {
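    // Each pass of this loop handles one "layer" of GBKs: the groupings
    // returned by getWorkingGroupings() have no unprocessed GBK upstream,
    // so each can be assigned its own MapReduce JobPrototype immediately.
    // Removing processed groupings from nodePaths (below) exposes the
    // next layer on the following iteration.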
    for (PGroupedTableImpl<?, ?> grouping : workingGroupings) {
      Set<NodePath> mapInputPaths = nodePaths.get(grouping);
      JobPrototype proto = JobPrototype.createMapReduceJob(grouping, mapInputPaths,
          pipeline.createTempPath());
      assignments.put(grouping, proto);
      if (jobDependencies.containsKey(grouping)) {
        for (JobPrototype dependency : jobDependencies.get(grouping)) {
          proto.addDependency(dependency);
        }
      }
    }

    Map<PGroupedTableImpl<?, ?>, Set<NodePath>> dependencyPaths =
        getDependencyPaths(workingGroupings, nodePaths);
    for (Map.Entry<PGroupedTableImpl<?, ?>, Set<NodePath>> entry : dependencyPaths.entrySet()) {
      PGroupedTableImpl<?, ?> grouping = entry.getKey();
      Set<NodePath> currentNodePaths = entry.getValue();

      JobPrototype proto = assignments.get(grouping);
      Set<NodePath> gbkPaths = Sets.newHashSet();
      for (NodePath nodePath : currentNodePaths) {
        PCollectionImpl<?> tail = nodePath.tail();
        if (tail instanceof PGroupedTableImpl) {
          gbkPaths.add(nodePath);
          if (!jobDependencies.containsKey(tail)) {
            jobDependencies.put(tail, Sets.<JobPrototype>newHashSet());
          }
          jobDependencies.get(tail).add(proto);
        }
      }
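      // Paths that terminate in another GBK are cross-job dependencies;
      // handleGroupingDependencies() (not shown here) presumably rewires
      // them so the downstream job reads the upstream job's materialized
      // output, which is what makes the file-outputs comment below hold.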
      if (!gbkPaths.isEmpty()) {
        handleGroupingDependencies(gbkPaths, currentNodePaths);
      }

      // At this point, all of the dependencies for the working groups will be
      // file outputs, and so we can add them all to the JobPrototype -- we now
      // have a complete job.
      HashMultimap<Target, NodePath> reduceOutputs = HashMultimap.create();
      for (NodePath nodePath : currentNodePaths) {
        assignments.put(nodePath.tail(), proto);
        for (Target target : outputs.get(nodePath.tail())) {
          reduceOutputs.put(target, nodePath);
        }
      }
      proto.addReducePaths(reduceOutputs);

      // We've processed this GBK -- remove it from the set of nodePaths we
      // need to process in the next step.
      nodePaths.remove(grouping);
    }
  }
  // Process any map-only jobs that are remaining.
  if (!nodePaths.isEmpty()) {
    for (Map.Entry<PCollectionImpl<?>, Set<NodePath>> entry : nodePaths.entrySet()) {
      PCollectionImpl<?> collect = entry.getKey();
      if (!assignments.containsKey(collect)) {
        HashMultimap<Target, NodePath> mapOutputs = HashMultimap.create();
        for (NodePath nodePath : entry.getValue()) {
          for (Target target : outputs.get(nodePath.tail())) {
            mapOutputs.put(target, nodePath);
          }
        }
        JobPrototype proto = JobPrototype.createMapOnlyJob(mapOutputs, pipeline.createTempPath());
        if (jobDependencies.containsKey(collect)) {
          for (JobPrototype dependency : jobDependencies.get(collect)) {
            proto.addDependency(dependency);
          }
        }
        assignments.put(collect, proto);
      }
    }
  }
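  // Several collections can map to the same JobPrototype, so dedupe the
  // prototypes with a HashSet before handing one CrunchJob per prototype
  // to the executor.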
  MRExecutor exec = new MRExecutor(jarClass);
  for (JobPrototype proto : Sets.newHashSet(assignments.values())) {
    exec.addJob(proto.getCrunchJob(jarClass, conf, pipeline));
  }
  return exec;
}
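For context, a minimal sketch of a user-level pipeline whose plan would exercise the method above; the driver class and paths are hypothetical, and the sketch assumes the standard Pipeline API (MRPipeline, readTextFile, count, writeTextFile, done). The count() call implies a groupByKey, so the planner maps the resulting PGroupedTableImpl to a single MapReduce JobPrototype.

import org.apache.crunch.PCollection;
import org.apache.crunch.PTable;
import org.apache.crunch.Pipeline;
import org.apache.crunch.impl.mr.MRPipeline;

public class LineCountDriver {
  public static void main(String[] args) throws Exception {
    Pipeline pipeline = new MRPipeline(LineCountDriver.class);
    PCollection<String> lines = pipeline.readTextFile("/tmp/in");  // hypothetical input path
    PTable<String, Long> counts = lines.count();                   // count() implies a groupByKey
    pipeline.writeTextFile(counts, "/tmp/out");                    // hypothetical output path
    pipeline.done();  // plans the job DAG (via MSCRPlanner.plan) and runs it
  }
}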