Example 1 with Target

Use of org.apache.crunch.Target in project crunch by cloudera.

The class MSCRPlanner, method handleGroupingDependencies:

private void handleGroupingDependencies(Set<NodePath> gbkPaths, Set<NodePath> currentNodePaths) throws IOException {
    int splitIndex = getSplitIndex(currentNodePaths);
    PCollectionImpl<?> splitTarget = currentNodePaths.iterator().next().get(splitIndex);
    if (!outputs.containsKey(splitTarget)) {
        outputs.put(splitTarget, Sets.<Target>newHashSet());
    }
    // Reuse a SourceTarget already registered for the split point, or convert
    // an existing Target into one; only create a fresh intermediate output if
    // neither is possible.
    SourceTarget srcTarget = null;
    Target targetToReplace = null;
    for (Target t : outputs.get(splitTarget)) {
        if (t instanceof SourceTarget) {
            srcTarget = (SourceTarget<?>) t;
            break;
        } else {
            srcTarget = t.asSourceTarget(splitTarget.getPType());
            if (srcTarget != null) {
                targetToReplace = t;
                break;
            }
        }
    }
    if (targetToReplace != null) {
        outputs.get(splitTarget).remove(targetToReplace);
    } else if (srcTarget == null) {
        srcTarget = pipeline.createIntermediateOutput(splitTarget.getPType());
    }
    outputs.get(splitTarget).add(srcTarget);
    // Materialize the collection at the split point and re-read it so the
    // downstream GBK paths start from the intermediate output.
    splitTarget.materializeAt(srcTarget);
    PCollectionImpl<?> inputNode = (PCollectionImpl<?>) pipeline.read(srcTarget);
    Set<NodePath> nextNodePaths = Sets.newHashSet();
    for (NodePath nodePath : currentNodePaths) {
        if (gbkPaths.contains(nodePath)) {
            nextNodePaths.add(nodePath.splitAt(splitIndex, inputNode));
        } else {
            nextNodePaths.add(nodePath);
        }
    }
    currentNodePaths.clear();
    currentNodePaths.addAll(nextNodePaths);
}
Also used: Target (org.apache.crunch.Target), SourceTarget (org.apache.crunch.SourceTarget), PCollectionImpl (org.apache.crunch.impl.mr.collect.PCollectionImpl)
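
handleGroupingDependencies is what the planner runs when the output of one GBK feeds another GBK: it materializes the collection at the split point to an intermediate SourceTarget and rewires the downstream node paths to read from it. The following is a minimal, user-level sketch of a pipeline that triggers that situation; the class name, paths, and the Aggregate helper usage are illustrative assumptions, not taken from the example above.

import org.apache.crunch.PCollection;
import org.apache.crunch.PTable;
import org.apache.crunch.Pipeline;
import org.apache.crunch.impl.mr.MRPipeline;
import org.apache.crunch.lib.Aggregate;

public class ChainedGroupings {
    public static void main(String[] args) {
        Pipeline pipeline = new MRPipeline(ChainedGroupings.class);
        PCollection<String> words = pipeline.readTextFile(args[0]);
        // First GBK (inside Aggregate.count): word -> occurrence count.
        PTable<String, Long> wordCounts = Aggregate.count(words);
        // Second GBK, depending on the first: count -> number of words with
        // that count. The planner must materialize wordCounts in between.
        PTable<Long, Long> histogram = Aggregate.count(wordCounts.values());
        pipeline.writeTextFile(histogram, args[1]);
        pipeline.done();
    }
}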

Example 2 with Target

Use of org.apache.crunch.Target in project crunch by cloudera.

The class JobPrototype, method build:

private CrunchJob build(Class<?> jarClass, Configuration conf, Pipeline pipeline) throws IOException {
    Job job = new Job(conf);
    conf = job.getConfiguration();
    conf.set(PlanningParameters.CRUNCH_WORKING_DIRECTORY, workingPath.toString());
    job.setJarByClass(jarClass);
    Set<DoNode> outputNodes = Sets.newHashSet();
    Set<Target> targets = targetsToNodePaths.keySet();
    Path outputPath = new Path(workingPath, "output");
    MSCROutputHandler outputHandler = new MSCROutputHandler(job, outputPath, group == null);
    for (Target target : targets) {
        DoNode node = null;
        for (NodePath nodePath : targetsToNodePaths.get(target)) {
            if (node == null) {
                PCollectionImpl<?> collect = nodePath.tail();
                node = DoNode.createOutputNode(target.toString(), collect.getPType());
                outputHandler.configureNode(node, target);
            }
            outputNodes.add(walkPath(nodePath.descendingIterator(), node));
        }
    }
    job.setMapperClass(CrunchMapper.class);
    List<DoNode> inputNodes;
    DoNode reduceNode = null;
    if (group != null) {
        job.setReducerClass(CrunchReducer.class);
        List<DoNode> reduceNodes = Lists.newArrayList(outputNodes);
        serialize(reduceNodes, conf, workingPath, NodeContext.REDUCE);
        reduceNode = reduceNodes.get(0);
        if (combineFnTable != null) {
            job.setCombinerClass(CrunchCombiner.class);
            DoNode combinerInputNode = group.createDoNode();
            DoNode combineNode = combineFnTable.createDoNode();
            combineNode.addChild(group.getGroupingNode());
            combinerInputNode.addChild(combineNode);
            serialize(ImmutableList.of(combinerInputNode), conf, workingPath, NodeContext.COMBINE);
        }
        group.configureShuffle(job);
        DoNode mapOutputNode = group.getGroupingNode();
        Set<DoNode> mapNodes = Sets.newHashSet();
        for (NodePath nodePath : mapNodePaths) {
            // Advance these one step, since we've already configured
            // the grouping node, and the PGroupedTableImpl is the tail
            // of the NodePath.
            Iterator<PCollectionImpl<?>> iter = nodePath.descendingIterator();
            iter.next();
            mapNodes.add(walkPath(iter, mapOutputNode));
        }
        inputNodes = Lists.newArrayList(mapNodes);
    } else {
        // No grouping
        job.setNumReduceTasks(0);
        inputNodes = Lists.newArrayList(outputNodes);
    }
    serialize(inputNodes, conf, workingPath, NodeContext.MAP);
    if (inputNodes.size() == 1) {
        DoNode inputNode = inputNodes.get(0);
        inputNode.getSource().configureSource(job, -1);
    } else {
        for (int i = 0; i < inputNodes.size(); i++) {
            DoNode inputNode = inputNodes.get(i);
            inputNode.getSource().configureSource(job, i);
        }
        job.setInputFormatClass(CrunchInputFormat.class);
    }
    job.setJobName(createJobName(pipeline.getName(), inputNodes, reduceNode));
    return new CrunchJob(job, outputPath, outputHandler);
}
Also used: Path (org.apache.hadoop.fs.Path), CrunchJob (org.apache.crunch.impl.mr.exec.CrunchJob), Target (org.apache.crunch.Target), PCollectionImpl (org.apache.crunch.impl.mr.collect.PCollectionImpl), Job (org.apache.hadoop.mapreduce.Job)
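
The combiner branch above only fires when combineFnTable is non-null, which happens when user code calls combineValues on a grouped table. A hedged sketch of such a pipeline follows; the MapFn, paths, and class name are illustrative, and both CombineFn.SUM_LONGS and the org.apache.crunch.types.writable package path are version-dependent assumptions.

import org.apache.crunch.CombineFn;
import org.apache.crunch.MapFn;
import org.apache.crunch.PCollection;
import org.apache.crunch.PTable;
import org.apache.crunch.Pair;
import org.apache.crunch.Pipeline;
import org.apache.crunch.impl.mr.MRPipeline;
import org.apache.crunch.types.writable.Writables;

public class CombinerDemo {
    public static void main(String[] args) {
        Pipeline pipeline = new MRPipeline(CombinerDemo.class);
        PCollection<String> words = pipeline.readTextFile(args[0]);
        // Emit a (word, 1) pair for every word.
        PTable<String, Long> ones = words.parallelDo(new MapFn<String, Pair<String, Long>>() {
            @Override
            public Pair<String, Long> map(String word) {
                return Pair.of(word, 1L);
            }
        }, Writables.tableOf(Writables.strings(), Writables.longs()));
        // combineValues is what populates combineFnTable, so build() will
        // register CrunchCombiner for the resulting job.
        PTable<String, Long> counts = ones.groupByKey().combineValues(CombineFn.<String>SUM_LONGS());
        pipeline.writeTextFile(counts, args[1]);
        pipeline.done();
    }
}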

Example 3 with Target

Use of org.apache.crunch.Target in project crunch by cloudera.

The class MSCRPlanner, method plan:

public MRExecutor plan(Class<?> jarClass, Configuration conf) throws IOException {
    // Constructs all of the node paths, which either start with an input
    // or a GBK and terminate in an output collection of any type.
    NodeVisitor visitor = new NodeVisitor();
    for (PCollectionImpl<?> output : outputs.keySet()) {
        visitor.visitOutput(output);
    }
    // Pull out the node paths.
    Map<PCollectionImpl<?>, Set<NodePath>> nodePaths = visitor.getNodePaths();
    // Keeps track of the dependencies from collections -> jobs and then
    // between different jobs.
    Map<PCollectionImpl<?>, JobPrototype> assignments = Maps.newHashMap();
    Map<PCollectionImpl<?>, Set<JobPrototype>> jobDependencies = new HashMap<PCollectionImpl<?>, Set<JobPrototype>>();
    // Find the set of GBKs that DO NOT depend on any other GBK.
    Set<PGroupedTableImpl<?, ?>> workingGroupings = null;
    while (!(workingGroupings = getWorkingGroupings(nodePaths)).isEmpty()) {
        for (PGroupedTableImpl<?, ?> grouping : workingGroupings) {
            Set<NodePath> mapInputPaths = nodePaths.get(grouping);
            JobPrototype proto = JobPrototype.createMapReduceJob(grouping, mapInputPaths, pipeline.createTempPath());
            assignments.put(grouping, proto);
            if (jobDependencies.containsKey(grouping)) {
                for (JobPrototype dependency : jobDependencies.get(grouping)) {
                    proto.addDependency(dependency);
                }
            }
        }
        Map<PGroupedTableImpl<?, ?>, Set<NodePath>> dependencyPaths = getDependencyPaths(workingGroupings, nodePaths);
        for (Map.Entry<PGroupedTableImpl<?, ?>, Set<NodePath>> entry : dependencyPaths.entrySet()) {
            PGroupedTableImpl<?, ?> grouping = entry.getKey();
            Set<NodePath> currentNodePaths = entry.getValue();
            JobPrototype proto = assignments.get(grouping);
            Set<NodePath> gbkPaths = Sets.newHashSet();
            for (NodePath nodePath : currentNodePaths) {
                PCollectionImpl<?> tail = nodePath.tail();
                if (tail instanceof PGroupedTableImpl) {
                    gbkPaths.add(nodePath);
                    if (!jobDependencies.containsKey(tail)) {
                        jobDependencies.put(tail, Sets.<JobPrototype>newHashSet());
                    }
                    jobDependencies.get(tail).add(proto);
                }
            }
            if (!gbkPaths.isEmpty()) {
                handleGroupingDependencies(gbkPaths, currentNodePaths);
            }
            // At this point, all of the dependencies for the working groups will be
            // file outputs, and so we can add them all to the JobPrototype-- we now have
            // a complete job.
            HashMultimap<Target, NodePath> reduceOutputs = HashMultimap.create();
            for (NodePath nodePath : currentNodePaths) {
                assignments.put(nodePath.tail(), proto);
                for (Target target : outputs.get(nodePath.tail())) {
                    reduceOutputs.put(target, nodePath);
                }
            }
            proto.addReducePaths(reduceOutputs);
            // We've processed this GBK-- remove it from the set of nodePaths we
            // need to process in the next step.
            nodePaths.remove(grouping);
        }
    }
    // Process any map-only jobs that are remaining.
    if (!nodePaths.isEmpty()) {
        for (Map.Entry<PCollectionImpl<?>, Set<NodePath>> entry : nodePaths.entrySet()) {
            PCollectionImpl<?> collect = entry.getKey();
            if (!assignments.containsKey(collect)) {
                HashMultimap<Target, NodePath> mapOutputs = HashMultimap.create();
                for (NodePath nodePath : entry.getValue()) {
                    for (Target target : outputs.get(nodePath.tail())) {
                        mapOutputs.put(target, nodePath);
                    }
                }
                JobPrototype proto = JobPrototype.createMapOnlyJob(mapOutputs, pipeline.createTempPath());
                if (jobDependencies.containsKey(collect)) {
                    for (JobPrototype dependency : jobDependencies.get(collect)) {
                        proto.addDependency(dependency);
                    }
                }
                assignments.put(collect, proto);
            }
        }
    }
    MRExecutor exec = new MRExecutor(jarClass);
    for (JobPrototype proto : Sets.newHashSet(assignments.values())) {
        exec.addJob(proto.getCrunchJob(jarClass, conf, pipeline));
    }
    return exec;
}
Also used: Set (java.util.Set), HashMap (java.util.HashMap), Map (java.util.Map), TreeMap (java.util.TreeMap), PCollectionImpl (org.apache.crunch.impl.mr.collect.PCollectionImpl), PGroupedTableImpl (org.apache.crunch.impl.mr.collect.PGroupedTableImpl), Target (org.apache.crunch.Target), SourceTarget (org.apache.crunch.SourceTarget), MRExecutor (org.apache.crunch.impl.mr.exec.MRExecutor)
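
The trailing map-only branch of plan() covers outputs that never pass through a GBK. Below is a minimal sketch of a pipeline that should compile to a single map-only job under this planner; the class name, MapFn, paths, and the Writables package path are illustrative, version-dependent assumptions.

import org.apache.crunch.MapFn;
import org.apache.crunch.PCollection;
import org.apache.crunch.Pipeline;
import org.apache.crunch.impl.mr.MRPipeline;
import org.apache.crunch.types.writable.Writables;

public class MapOnlyDemo {
    public static void main(String[] args) {
        Pipeline pipeline = new MRPipeline(MapOnlyDemo.class);
        PCollection<String> lines = pipeline.readTextFile(args[0]);
        // No groupByKey anywhere, so getWorkingGroupings() comes back empty
        // and the planner falls through to JobPrototype.createMapOnlyJob.
        PCollection<String> upper = lines.parallelDo(new MapFn<String, String>() {
            @Override
            public String map(String line) {
                return line.toUpperCase();
            }
        }, Writables.strings());
        pipeline.writeTextFile(upper, args[1]);
        pipeline.done();
    }
}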

Example 4 with Target

Use of org.apache.crunch.Target in project crunch by cloudera.

The class MRPipeline, method run:

@Override
public PipelineResult run() {
    MSCRPlanner planner = new MSCRPlanner(this, outputTargets);
    PipelineResult res = null;
    try {
        res = planner.plan(jarClass, conf).execute();
    } catch (IOException e) {
        LOG.error(e);
        return PipelineResult.EMPTY;
    }
    for (PCollectionImpl<?> c : outputTargets.keySet()) {
        if (outputTargetsToMaterialize.containsKey(c)) {
            MaterializableIterable iter = outputTargetsToMaterialize.get(c);
            iter.materialize();
            c.materializeAt(iter.getSourceTarget());
            outputTargetsToMaterialize.remove(c);
        } else {
            boolean materialized = false;
            for (Target t : outputTargets.get(c)) {
                // In practice a Target that also implements Source is a
                // SourceTarget, which makes the cast below safe.
                if (!materialized && t instanceof Source) {
                    c.materializeAt((SourceTarget) t);
                    materialized = true;
                }
            }
        }
    }
    outputTargets.clear();
    return res;
}
Also used: SourceTarget (org.apache.crunch.SourceTarget), Target (org.apache.crunch.Target), ReadableSourceTarget (org.apache.crunch.io.ReadableSourceTarget), MSCRPlanner (org.apache.crunch.impl.mr.plan.MSCRPlanner), PipelineResult (org.apache.crunch.PipelineResult), IOException (java.io.IOException), Source (org.apache.crunch.Source), TableSource (org.apache.crunch.TableSource), MaterializableIterable (org.apache.crunch.materialize.MaterializableIterable)
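
From the caller's side, run() is invoked directly (or via done()) once all outputs are registered. A hedged usage sketch follows; PipelineResult.succeeded() is assumed to be available in this version, and the class name and paths are illustrative.

import org.apache.crunch.PCollection;
import org.apache.crunch.Pipeline;
import org.apache.crunch.PipelineResult;
import org.apache.crunch.impl.mr.MRPipeline;

public class RunDemo {
    public static void main(String[] args) {
        Pipeline pipeline = new MRPipeline(RunDemo.class);
        PCollection<String> lines = pipeline.readTextFile(args[0]);
        pipeline.writeTextFile(lines, args[1]);
        // run() plans and executes all pending jobs; an IOException during
        // planning surfaces as PipelineResult.EMPTY (see the catch block above).
        PipelineResult result = pipeline.run();
        if (!result.succeeded()) {
            System.err.println("pipeline failed");
        }
    }
}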

Example 5 with Target

Use of org.apache.crunch.Target in project crunch by cloudera.

The class MRPipeline, method getMaterializeSourceTarget:

/**
 * Retrieve a ReadableSourceTarget that provides access to the contents of a
 * {@link PCollection}. This is primarily intended as a helper method to
 * {@link #materialize(PCollection)}. The underlying data of the
 * ReadableSourceTarget may not be actually present until the pipeline is run.
 *
 * @param pcollection
 *          The collection for which the ReadableSourceTarget is to be
 *          retrieved
 * @return The ReadableSourceTarget
 * @throws IllegalArgumentException
 *           If no ReadableSourceTarget can be retrieved for the given
 *           PCollection
 */
public <T> ReadableSourceTarget<T> getMaterializeSourceTarget(PCollection<T> pcollection) {
    PCollectionImpl<T> impl = toPcollectionImpl(pcollection);
    SourceTarget<T> matTarget = impl.getMaterializedAt();
    if (matTarget instanceof ReadableSourceTarget) {
        return (ReadableSourceTarget<T>) matTarget;
    }
    ReadableSourceTarget<T> srcTarget = null;
    if (outputTargets.containsKey(impl)) {
        for (Target target : outputTargets.get(impl)) {
            if (target instanceof ReadableSourceTarget) {
                srcTarget = (ReadableSourceTarget<T>) target;
                break;
            }
        }
    }
    if (srcTarget == null) {
        SourceTarget<T> st = createIntermediateOutput(pcollection.getPType());
        if (!(st instanceof ReadableSourceTarget)) {
            throw new IllegalArgumentException("The PType for the given PCollection is not readable" + " and cannot be materialized");
        } else {
            srcTarget = (ReadableSourceTarget<T>) st;
            addOutput(impl, srcTarget);
        }
    }
    return srcTarget;
}
Also used: SourceTarget (org.apache.crunch.SourceTarget), Target (org.apache.crunch.Target), ReadableSourceTarget (org.apache.crunch.io.ReadableSourceTarget)
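
getMaterializeSourceTarget backs PCollection.materialize(): if the collection is not already materialized at a readable target, an intermediate output is registered so the contents can be read back after the run. A minimal usage sketch; the class name and path are illustrative assumptions.

import org.apache.crunch.PCollection;
import org.apache.crunch.Pipeline;
import org.apache.crunch.impl.mr.MRPipeline;

public class MaterializeDemo {
    public static void main(String[] args) {
        Pipeline pipeline = new MRPipeline(MaterializeDemo.class);
        PCollection<String> lines = pipeline.readTextFile(args[0]);
        // materialize() obtains a ReadableSourceTarget through the method
        // above; iterating over the result triggers the pipeline run.
        Iterable<String> contents = lines.materialize();
        for (String line : contents) {
            System.out.println(line);
        }
        pipeline.done();
    }
}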

Aggregations

Target (org.apache.crunch.Target): 5
SourceTarget (org.apache.crunch.SourceTarget): 4
PCollectionImpl (org.apache.crunch.impl.mr.collect.PCollectionImpl): 3
ReadableSourceTarget (org.apache.crunch.io.ReadableSourceTarget): 2
IOException (java.io.IOException): 1
HashMap (java.util.HashMap): 1
Map (java.util.Map): 1
Set (java.util.Set): 1
TreeMap (java.util.TreeMap): 1
PipelineResult (org.apache.crunch.PipelineResult): 1
Source (org.apache.crunch.Source): 1
TableSource (org.apache.crunch.TableSource): 1
PGroupedTableImpl (org.apache.crunch.impl.mr.collect.PGroupedTableImpl): 1
CrunchJob (org.apache.crunch.impl.mr.exec.CrunchJob): 1
MRExecutor (org.apache.crunch.impl.mr.exec.MRExecutor): 1
MSCRPlanner (org.apache.crunch.impl.mr.plan.MSCRPlanner): 1
MaterializableIterable (org.apache.crunch.materialize.MaterializableIterable): 1
Path (org.apache.hadoop.fs.Path): 1
Job (org.apache.hadoop.mapreduce.Job): 1