Example 1 with PCollectionImpl

Use of org.apache.crunch.impl.mr.collect.PCollectionImpl in project crunch by cloudera.

The class MSCRPlanner, method handleGroupingDependencies. It materializes the collection at the split point to a SourceTarget (reusing an existing output target when one can serve as a source, otherwise creating an intermediate output), reads that target back as a new PCollectionImpl, and re-roots the node paths that feed the downstream group-by-key on the freshly read collection:

private void handleGroupingDependencies(Set<NodePath> gbkPaths, Set<NodePath> currentNodePaths) throws IOException {
    int splitIndex = getSplitIndex(currentNodePaths);
    PCollectionImpl<?> splitTarget = currentNodePaths.iterator().next().get(splitIndex);
    if (!outputs.containsKey(splitTarget)) {
        outputs.put(splitTarget, Sets.<Target>newHashSet());
    }
    SourceTarget srcTarget = null;
    Target targetToReplace = null;
    for (Target t : outputs.get(splitTarget)) {
        if (t instanceof SourceTarget) {
            srcTarget = (SourceTarget<?>) t;
            break;
        } else {
            srcTarget = t.asSourceTarget(splitTarget.getPType());
            if (srcTarget != null) {
                targetToReplace = t;
                break;
            }
        }
    }
    if (targetToReplace != null) {
        outputs.get(splitTarget).remove(targetToReplace);
    } else if (srcTarget == null) {
        srcTarget = pipeline.createIntermediateOutput(splitTarget.getPType());
    }
    outputs.get(splitTarget).add(srcTarget);
    splitTarget.materializeAt(srcTarget);
    PCollectionImpl<?> inputNode = (PCollectionImpl<?>) pipeline.read(srcTarget);
    Set<NodePath> nextNodePaths = Sets.newHashSet();
    for (NodePath nodePath : currentNodePaths) {
        if (gbkPaths.contains(nodePath)) {
            nextNodePaths.add(nodePath.splitAt(splitIndex, inputNode));
        } else {
            nextNodePaths.add(nodePath);
        }
    }
    currentNodePaths.clear();
    currentNodePaths.addAll(nextNodePaths);
}
Also used : Target(org.apache.crunch.Target) SourceTarget(org.apache.crunch.SourceTarget) PCollectionImpl(org.apache.crunch.impl.mr.collect.PCollectionImpl)
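
For context, a hedged, user-level sketch of the situation this method handles: two group-by-key operations where the second depends on the first. The class name, paths, and the value-extracting DoFn are placeholders, and the sketch assumes the standard Crunch user API (MRPipeline, Aggregate.count, Writables); when a flow like this is planned, the first GBK's output collection is the split target that gets materialized to a SourceTarget and read back before the second job runs.

import org.apache.crunch.DoFn;
import org.apache.crunch.Emitter;
import org.apache.crunch.PCollection;
import org.apache.crunch.PTable;
import org.apache.crunch.Pair;
import org.apache.crunch.Pipeline;
import org.apache.crunch.impl.mr.MRPipeline;
import org.apache.crunch.lib.Aggregate;
import org.apache.crunch.types.writable.Writables;

public class ChainedGbkSketch {
    public static void main(String[] args) throws Exception {
        Pipeline pipeline = new MRPipeline(ChainedGbkSketch.class); // placeholder jar class
        PCollection<String> words = pipeline.readTextFile("/tmp/words"); // placeholder path
        // First GBK: count each word.
        PTable<String, Long> wordCounts = Aggregate.count(words);
        // Extract just the per-word counts so they can be counted again.
        PCollection<Long> frequencies = wordCounts.parallelDo(new DoFn<Pair<String, Long>, Long>() {
            @Override
            public void process(Pair<String, Long> wordAndCount, Emitter<Long> emitter) {
                emitter.emit(wordAndCount.second());
            }
        }, Writables.longs());
        // Second GBK: a histogram of word frequencies. It depends on the first GBK's
        // output, which is the situation handleGroupingDependencies resolves by
        // materializing the intermediate collection between the two MapReduce jobs.
        PTable<Long, Long> histogram = Aggregate.count(frequencies);
        pipeline.writeTextFile(histogram, "/tmp/histogram"); // placeholder path
        pipeline.done();
    }
}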

Example 2 with PCollectionImpl

Use of org.apache.crunch.impl.mr.collect.PCollectionImpl in project crunch by cloudera.

The class JobPrototype, method build. It assembles the Hadoop Job for one stage: it creates and configures an output DoNode per target, wires in the reducer (and optional combiner) when a grouping is present, serializes the reduce- and map-side DoNode trees into the job configuration, configures the input sources, and wraps the result in a CrunchJob:

private CrunchJob build(Class<?> jarClass, Configuration conf, Pipeline pipeline) throws IOException {
    Job job = new Job(conf);
    conf = job.getConfiguration();
    conf.set(PlanningParameters.CRUNCH_WORKING_DIRECTORY, workingPath.toString());
    job.setJarByClass(jarClass);
    Set<DoNode> outputNodes = Sets.newHashSet();
    Set<Target> targets = targetsToNodePaths.keySet();
    Path outputPath = new Path(workingPath, "output");
    MSCROutputHandler outputHandler = new MSCROutputHandler(job, outputPath, group == null);
    for (Target target : targets) {
        DoNode node = null;
        for (NodePath nodePath : targetsToNodePaths.get(target)) {
            if (node == null) {
                PCollectionImpl<?> collect = nodePath.tail();
                node = DoNode.createOutputNode(target.toString(), collect.getPType());
                outputHandler.configureNode(node, target);
            }
            outputNodes.add(walkPath(nodePath.descendingIterator(), node));
        }
    }
    job.setMapperClass(CrunchMapper.class);
    List<DoNode> inputNodes;
    DoNode reduceNode = null;
    if (group != null) {
        job.setReducerClass(CrunchReducer.class);
        List<DoNode> reduceNodes = Lists.newArrayList(outputNodes);
        serialize(reduceNodes, conf, workingPath, NodeContext.REDUCE);
        reduceNode = reduceNodes.get(0);
        if (combineFnTable != null) {
            job.setCombinerClass(CrunchCombiner.class);
            DoNode combinerInputNode = group.createDoNode();
            DoNode combineNode = combineFnTable.createDoNode();
            combineNode.addChild(group.getGroupingNode());
            combinerInputNode.addChild(combineNode);
            serialize(ImmutableList.of(combinerInputNode), conf, workingPath, NodeContext.COMBINE);
        }
        group.configureShuffle(job);
        DoNode mapOutputNode = group.getGroupingNode();
        Set<DoNode> mapNodes = Sets.newHashSet();
        for (NodePath nodePath : mapNodePaths) {
            // Advance these one step, since we've already configured
            // the grouping node, and the PGroupedTableImpl is the tail
            // of the NodePath.
            Iterator<PCollectionImpl<?>> iter = nodePath.descendingIterator();
            iter.next();
            mapNodes.add(walkPath(iter, mapOutputNode));
        }
        inputNodes = Lists.newArrayList(mapNodes);
    } else {
        // No grouping
        job.setNumReduceTasks(0);
        inputNodes = Lists.newArrayList(outputNodes);
    }
    serialize(inputNodes, conf, workingPath, NodeContext.MAP);
    if (inputNodes.size() == 1) {
        DoNode inputNode = inputNodes.get(0);
        inputNode.getSource().configureSource(job, -1);
    } else {
        for (int i = 0; i < inputNodes.size(); i++) {
            DoNode inputNode = inputNodes.get(i);
            inputNode.getSource().configureSource(job, i);
        }
        job.setInputFormatClass(CrunchInputFormat.class);
    }
    job.setJobName(createJobName(pipeline.getName(), inputNodes, reduceNode));
    return new CrunchJob(job, outputPath, outputHandler);
}
Also used : Path(org.apache.hadoop.fs.Path) CrunchJob(org.apache.crunch.impl.mr.exec.CrunchJob) Target(org.apache.crunch.Target) PCollectionImpl(org.apache.crunch.impl.mr.collect.PCollectionImpl) Job(org.apache.hadoop.mapreduce.Job)
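
To make the group == null branch concrete, a hedged sketch of a pipeline that yields a map-only job. The class name, paths, and uppercasing DoFn are placeholders, and the sketch assumes the standard Crunch user API (MRPipeline, DoFn, Writables); with no groupByKey anywhere in the flow, build() skips the reducer, combiner, and shuffle configuration and sets the reduce task count to zero.

import org.apache.crunch.DoFn;
import org.apache.crunch.Emitter;
import org.apache.crunch.PCollection;
import org.apache.crunch.Pipeline;
import org.apache.crunch.impl.mr.MRPipeline;
import org.apache.crunch.types.writable.Writables;

public class MapOnlySketch {
    public static void main(String[] args) throws Exception {
        Pipeline pipeline = new MRPipeline(MapOnlySketch.class); // placeholder jar class
        PCollection<String> lines = pipeline.readTextFile("/tmp/input"); // placeholder path
        // A single parallelDo with no groupByKey anywhere in the flow, so the
        // resulting JobPrototype has group == null and build() configures a
        // map-only job via setNumReduceTasks(0).
        PCollection<String> upper = lines.parallelDo(new DoFn<String, String>() {
            @Override
            public void process(String line, Emitter<String> emitter) {
                emitter.emit(line.toUpperCase());
            }
        }, Writables.strings());
        pipeline.writeTextFile(upper, "/tmp/output"); // placeholder path
        pipeline.done();
    }
}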

Example 3 with PCollectionImpl

Use of org.apache.crunch.impl.mr.collect.PCollectionImpl in project crunch by cloudera.

The class MSCRPlanner, method plan. It builds the node paths from every output collection, then repeatedly takes the group-by-keys that do not depend on any other GBK, turns each into a MapReduce JobPrototype, splits any GBK-to-GBK dependencies via handleGroupingDependencies, assigns the reduce-side outputs, and finally creates map-only jobs for whatever paths remain before handing all of the prototypes to an MRExecutor:

public MRExecutor plan(Class<?> jarClass, Configuration conf) throws IOException {
    // Constructs all of the node paths, which either start w/an input
    // or a GBK and terminate in an output collection of any type.
    NodeVisitor visitor = new NodeVisitor();
    for (PCollectionImpl<?> output : outputs.keySet()) {
        visitor.visitOutput(output);
    }
    // Pull out the node paths.
    Map<PCollectionImpl<?>, Set<NodePath>> nodePaths = visitor.getNodePaths();
    // Keeps track of the dependencies from collections -> jobs and then
    // between different jobs.
    Map<PCollectionImpl<?>, JobPrototype> assignments = Maps.newHashMap();
    Map<PCollectionImpl<?>, Set<JobPrototype>> jobDependencies = new HashMap<PCollectionImpl<?>, Set<JobPrototype>>();
    // Find the set of GBKs that DO NOT depend on any other GBK.
    Set<PGroupedTableImpl<?, ?>> workingGroupings = null;
    while (!(workingGroupings = getWorkingGroupings(nodePaths)).isEmpty()) {
        for (PGroupedTableImpl<?, ?> grouping : workingGroupings) {
            Set<NodePath> mapInputPaths = nodePaths.get(grouping);
            JobPrototype proto = JobPrototype.createMapReduceJob(grouping, mapInputPaths, pipeline.createTempPath());
            assignments.put(grouping, proto);
            if (jobDependencies.containsKey(grouping)) {
                for (JobPrototype dependency : jobDependencies.get(grouping)) {
                    proto.addDependency(dependency);
                }
            }
        }
        Map<PGroupedTableImpl<?, ?>, Set<NodePath>> dependencyPaths = getDependencyPaths(workingGroupings, nodePaths);
        for (Map.Entry<PGroupedTableImpl<?, ?>, Set<NodePath>> entry : dependencyPaths.entrySet()) {
            PGroupedTableImpl<?, ?> grouping = entry.getKey();
            Set<NodePath> currentNodePaths = entry.getValue();
            JobPrototype proto = assignments.get(grouping);
            Set<NodePath> gbkPaths = Sets.newHashSet();
            for (NodePath nodePath : currentNodePaths) {
                PCollectionImpl<?> tail = nodePath.tail();
                if (tail instanceof PGroupedTableImpl) {
                    gbkPaths.add(nodePath);
                    if (!jobDependencies.containsKey(tail)) {
                        jobDependencies.put(tail, Sets.<JobPrototype>newHashSet());
                    }
                    jobDependencies.get(tail).add(proto);
                }
            }
            if (!gbkPaths.isEmpty()) {
                handleGroupingDependencies(gbkPaths, currentNodePaths);
            }
            // At this point, all of the dependencies for the working groups will be
            // file outputs, and so we can add them all to the JobPrototype-- we now have
            // a complete job.
            HashMultimap<Target, NodePath> reduceOutputs = HashMultimap.create();
            for (NodePath nodePath : currentNodePaths) {
                assignments.put(nodePath.tail(), proto);
                for (Target target : outputs.get(nodePath.tail())) {
                    reduceOutputs.put(target, nodePath);
                }
            }
            proto.addReducePaths(reduceOutputs);
            // We've processed this GBK-- remove it from the set of nodePaths we
            // need to process in the next step.
            nodePaths.remove(grouping);
        }
    }
    // Process any map-only jobs that are remaining.
    if (!nodePaths.isEmpty()) {
        for (Map.Entry<PCollectionImpl<?>, Set<NodePath>> entry : nodePaths.entrySet()) {
            PCollectionImpl<?> collect = entry.getKey();
            if (!assignments.containsKey(collect)) {
                HashMultimap<Target, NodePath> mapOutputs = HashMultimap.create();
                for (NodePath nodePath : entry.getValue()) {
                    for (Target target : outputs.get(nodePath.tail())) {
                        mapOutputs.put(target, nodePath);
                    }
                }
                JobPrototype proto = JobPrototype.createMapOnlyJob(mapOutputs, pipeline.createTempPath());
                if (jobDependencies.containsKey(collect)) {
                    for (JobPrototype dependency : jobDependencies.get(collect)) {
                        proto.addDependency(dependency);
                    }
                }
                assignments.put(collect, proto);
            }
        }
    }
    MRExecutor exec = new MRExecutor(jarClass);
    for (JobPrototype proto : Sets.newHashSet(assignments.values())) {
        exec.addJob(proto.getCrunchJob(jarClass, conf, pipeline));
    }
    return exec;
}
Also used : Set(java.util.Set) HashMap(java.util.HashMap) PCollectionImpl(org.apache.crunch.impl.mr.collect.PCollectionImpl) Target(org.apache.crunch.Target) SourceTarget(org.apache.crunch.SourceTarget) PGroupedTableImpl(org.apache.crunch.impl.mr.collect.PGroupedTableImpl) MRExecutor(org.apache.crunch.impl.mr.exec.MRExecutor) TreeMap(java.util.TreeMap) Map(java.util.Map)
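
For context on when plan() runs, a hedged end-to-end driver sketch. The class name and argument handling are placeholders, and it assumes the standard Crunch user API (MRPipeline, Aggregate.count, Writables); the single groupByKey behind Aggregate.count becomes one MapReduce JobPrototype, and for an MRPipeline the collection graph is handed to the planner when the pipeline is run.

import org.apache.crunch.DoFn;
import org.apache.crunch.Emitter;
import org.apache.crunch.PCollection;
import org.apache.crunch.PTable;
import org.apache.crunch.Pipeline;
import org.apache.crunch.impl.mr.MRPipeline;
import org.apache.crunch.lib.Aggregate;
import org.apache.crunch.types.writable.Writables;

public class WordCountSketch {
    public static void main(String[] args) throws Exception {
        Pipeline pipeline = new MRPipeline(WordCountSketch.class); // placeholder jar class
        PCollection<String> lines = pipeline.readTextFile(args[0]);
        PCollection<String> words = lines.parallelDo(new DoFn<String, String>() {
            @Override
            public void process(String line, Emitter<String> emitter) {
                for (String word : line.split("\\s+")) {
                    emitter.emit(word);
                }
            }
        }, Writables.strings());
        // Aggregate.count performs the only groupByKey in this flow, so plan()
        // produces a single MapReduce job whose reduce side writes the text output.
        PTable<String, Long> counts = Aggregate.count(words);
        pipeline.writeTextFile(counts, args[1]);
        // Running the pipeline is what triggers planning and job submission.
        pipeline.done();
    }
}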

Aggregations

Target (org.apache.crunch.Target): 3 examples
PCollectionImpl (org.apache.crunch.impl.mr.collect.PCollectionImpl): 3 examples
SourceTarget (org.apache.crunch.SourceTarget): 2 examples
HashMap (java.util.HashMap): 1 example
Map (java.util.Map): 1 example
Set (java.util.Set): 1 example
TreeMap (java.util.TreeMap): 1 example
PGroupedTableImpl (org.apache.crunch.impl.mr.collect.PGroupedTableImpl): 1 example
CrunchJob (org.apache.crunch.impl.mr.exec.CrunchJob): 1 example
MRExecutor (org.apache.crunch.impl.mr.exec.MRExecutor): 1 example
Path (org.apache.hadoop.fs.Path): 1 example
Job (org.apache.hadoop.mapreduce.Job): 1 example