Search in sources :

Example 1 with MRExecutor

Use of org.apache.crunch.impl.mr.exec.MRExecutor in the crunch project by Cloudera.

From the class MSCRPlanner, the method plan:

/**
 * Plans the MapReduce execution of the pipeline. Walks backwards from each
 * output collection to build node paths, converts each independent
 * group-by-key (GBK) into a MapReduce job prototype, wires dependencies
 * between prototypes, assigns any remaining paths to map-only jobs, and
 * returns an executor holding one Crunch job per distinct prototype.
 *
 * @param jarClass class used to locate the job jar for the generated jobs
 * @param conf the Hadoop configuration applied to each generated job
 * @return an {@link MRExecutor} containing the planned jobs
 * @throws IOException if constructing one of the Crunch jobs fails
 */
public MRExecutor plan(Class<?> jarClass, Configuration conf) throws IOException {
    // Constructs all of the node paths, which either start w/an input
    // or a GBK and terminate in an output collection of any type.
    NodeVisitor visitor = new NodeVisitor();
    for (PCollectionImpl<?> output : outputs.keySet()) {
        visitor.visitOutput(output);
    }
    // Pull out the node paths.
    Map<PCollectionImpl<?>, Set<NodePath>> nodePaths = visitor.getNodePaths();
    // Keeps track of the dependencies from collections -> jobs and then
    // between different jobs. Both maps use the same Guava factory style
    // for consistency.
    Map<PCollectionImpl<?>, JobPrototype> assignments = Maps.newHashMap();
    Map<PCollectionImpl<?>, Set<JobPrototype>> jobDependencies = Maps.newHashMap();
    // Find the set of GBKs that DO NOT depend on any other GBK. Each pass of
    // the loop peels off one "layer" of independent groupings until none remain.
    Set<PGroupedTableImpl<?, ?>> workingGroupings = null;
    while (!(workingGroupings = getWorkingGroupings(nodePaths)).isEmpty()) {
        for (PGroupedTableImpl<?, ?> grouping : workingGroupings) {
            Set<NodePath> mapInputPaths = nodePaths.get(grouping);
            JobPrototype proto = JobPrototype.createMapReduceJob(grouping, mapInputPaths, pipeline.createTempPath());
            assignments.put(grouping, proto);
            addJobDependencies(proto, grouping, jobDependencies);
        }
        Map<PGroupedTableImpl<?, ?>, Set<NodePath>> dependencyPaths = getDependencyPaths(workingGroupings, nodePaths);
        for (Map.Entry<PGroupedTableImpl<?, ?>, Set<NodePath>> entry : dependencyPaths.entrySet()) {
            PGroupedTableImpl<?, ?> grouping = entry.getKey();
            Set<NodePath> currentNodePaths = entry.getValue();
            JobPrototype proto = assignments.get(grouping);
            Set<NodePath> gbkPaths = Sets.newHashSet();
            // Any path that terminates in another GBK means a downstream job
            // depends on this prototype's output; record that dependency.
            for (NodePath nodePath : currentNodePaths) {
                PCollectionImpl<?> tail = nodePath.tail();
                if (tail instanceof PGroupedTableImpl) {
                    gbkPaths.add(nodePath);
                    if (!jobDependencies.containsKey(tail)) {
                        jobDependencies.put(tail, Sets.<JobPrototype>newHashSet());
                    }
                    jobDependencies.get(tail).add(proto);
                }
            }
            if (!gbkPaths.isEmpty()) {
                handleGroupingDependencies(gbkPaths, currentNodePaths);
            }
            // At this point, all of the dependencies for the working groups will be
            // file outputs, and so we can add them all to the JobPrototype-- we now have
            // a complete job.
            HashMultimap<Target, NodePath> reduceOutputs = HashMultimap.create();
            for (NodePath nodePath : currentNodePaths) {
                assignments.put(nodePath.tail(), proto);
                for (Target target : outputs.get(nodePath.tail())) {
                    reduceOutputs.put(target, nodePath);
                }
            }
            proto.addReducePaths(reduceOutputs);
            // We've processed this GBK-- remove it from the set of nodePaths we
            // need to process in the next step.
            nodePaths.remove(grouping);
        }
    }
    // Process any map-only jobs that are remaining: paths that never pass
    // through a GBK and were not already claimed by a MapReduce prototype.
    if (!nodePaths.isEmpty()) {
        for (Map.Entry<PCollectionImpl<?>, Set<NodePath>> entry : nodePaths.entrySet()) {
            PCollectionImpl<?> collect = entry.getKey();
            if (!assignments.containsKey(collect)) {
                HashMultimap<Target, NodePath> mapOutputs = HashMultimap.create();
                for (NodePath nodePath : entry.getValue()) {
                    for (Target target : outputs.get(nodePath.tail())) {
                        mapOutputs.put(target, nodePath);
                    }
                }
                JobPrototype proto = JobPrototype.createMapOnlyJob(mapOutputs, pipeline.createTempPath());
                addJobDependencies(proto, collect, jobDependencies);
                assignments.put(collect, proto);
            }
        }
    }
    // A prototype may be assigned to several collections; de-dupe via a set
    // so each job is added to the executor exactly once.
    MRExecutor exec = new MRExecutor(jarClass);
    for (JobPrototype proto : Sets.newHashSet(assignments.values())) {
        exec.addJob(proto.getCrunchJob(jarClass, conf, pipeline));
    }
    return exec;
}

/**
 * Adds every prototype recorded as a dependency of {@code collect} (if any)
 * to {@code proto}. Centralizes the containsKey-then-loop wiring that was
 * previously duplicated for MapReduce and map-only prototypes.
 *
 * @param proto the prototype receiving the dependencies
 * @param collect the collection whose recorded dependencies are applied
 * @param jobDependencies map from collections to the prototypes they depend on
 */
private void addJobDependencies(JobPrototype proto, PCollectionImpl<?> collect,
        Map<PCollectionImpl<?>, Set<JobPrototype>> jobDependencies) {
    if (jobDependencies.containsKey(collect)) {
        for (JobPrototype dependency : jobDependencies.get(collect)) {
            proto.addDependency(dependency);
        }
    }
}
Also used : Set(java.util.Set) HashMap(java.util.HashMap) PCollectionImpl(org.apache.crunch.impl.mr.collect.PCollectionImpl) Target(org.apache.crunch.Target) SourceTarget(org.apache.crunch.SourceTarget) PGroupedTableImpl(org.apache.crunch.impl.mr.collect.PGroupedTableImpl) MRExecutor(org.apache.crunch.impl.mr.exec.MRExecutor) HashMap(java.util.HashMap) TreeMap(java.util.TreeMap) Map(java.util.Map)

Aggregations

HashMap (java.util.HashMap)1 Map (java.util.Map)1 Set (java.util.Set)1 TreeMap (java.util.TreeMap)1 SourceTarget (org.apache.crunch.SourceTarget)1 Target (org.apache.crunch.Target)1 PCollectionImpl (org.apache.crunch.impl.mr.collect.PCollectionImpl)1 PGroupedTableImpl (org.apache.crunch.impl.mr.collect.PGroupedTableImpl)1 MRExecutor (org.apache.crunch.impl.mr.exec.MRExecutor)1