
Example 1 with CrunchJob

Use of org.apache.crunch.impl.mr.exec.CrunchJob in Cloudera's crunch project.

The build method of the JobPrototype class:

private CrunchJob build(Class<?> jarClass, Configuration conf, Pipeline pipeline) throws IOException {
    Job job = new Job(conf);
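    // new Job(conf) clones the Configuration, so grab the Job's own copy before setting Crunch parameters on it.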
    conf = job.getConfiguration();
    conf.set(PlanningParameters.CRUNCH_WORKING_DIRECTORY, workingPath.toString());
    job.setJarByClass(jarClass);
    Set<DoNode> outputNodes = Sets.newHashSet();
    Set<Target> targets = targetsToNodePaths.keySet();
    Path outputPath = new Path(workingPath, "output");
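    // The output handler binds each Target to the job's output; the group == null flag marks a job with no reduce phase.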
    MSCROutputHandler outputHandler = new MSCROutputHandler(job, outputPath, group == null);
    for (Target target : targets) {
        DoNode node = null;
        for (NodePath nodePath : targetsToNodePaths.get(target)) {
            if (node == null) {
                PCollectionImpl<?> collect = nodePath.tail();
                node = DoNode.createOutputNode(target.toString(), collect.getPType());
                outputHandler.configureNode(node, target);
            }
            outputNodes.add(walkPath(nodePath.descendingIterator(), node));
        }
    }
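    // Every MSCR job runs CrunchMapper; the DoNode trees serialized below tell it which functions to apply.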
    job.setMapperClass(CrunchMapper.class);
    List<DoNode> inputNodes;
    DoNode reduceNode = null;
    if (group != null) {
        job.setReducerClass(CrunchReducer.class);
        List<DoNode> reduceNodes = Lists.newArrayList(outputNodes);
        serialize(reduceNodes, conf, workingPath, NodeContext.REDUCE);
        reduceNode = reduceNodes.get(0);
        if (combineFnTable != null) {
            job.setCombinerClass(CrunchCombiner.class);
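            // Chain the combine-phase nodes: the group's input node feeds the combine function, whose output is re-grouped.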
            DoNode combinerInputNode = group.createDoNode();
            DoNode combineNode = combineFnTable.createDoNode();
            combineNode.addChild(group.getGroupingNode());
            combinerInputNode.addChild(combineNode);
            serialize(ImmutableList.of(combinerInputNode), conf, workingPath, NodeContext.COMBINE);
        }
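        // Let the grouping configure the shuffle (partitioner, sort and grouping comparators).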
        group.configureShuffle(job);
        DoNode mapOutputNode = group.getGroupingNode();
        Set<DoNode> mapNodes = Sets.newHashSet();
        for (NodePath nodePath : mapNodePaths) {
            // Advance these one step, since we've already configured
            // the grouping node, and the PGroupedTableImpl is the tail
            // of the NodePath.
            Iterator<PCollectionImpl<?>> iter = nodePath.descendingIterator();
            iter.next();
            mapNodes.add(walkPath(iter, mapOutputNode));
        }
        inputNodes = Lists.newArrayList(mapNodes);
    } else {
        // No grouping
        job.setNumReduceTasks(0);
        inputNodes = Lists.newArrayList(outputNodes);
    }
    serialize(inputNodes, conf, workingPath, NodeContext.MAP);
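    // A single source can configure the job's input directly (index -1); multiple sources are multiplexed through CrunchInputFormat.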
    if (inputNodes.size() == 1) {
        DoNode inputNode = inputNodes.get(0);
        inputNode.getSource().configureSource(job, -1);
    } else {
        for (int i = 0; i < inputNodes.size(); i++) {
            DoNode inputNode = inputNodes.get(i);
            inputNode.getSource().configureSource(job, i);
        }
        job.setInputFormatClass(CrunchInputFormat.class);
    }
    job.setJobName(createJobName(pipeline.getName(), inputNodes, reduceNode));
    return new CrunchJob(job, outputPath, outputHandler);
}
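
For orientation, here is a minimal sketch of the map-only job shape the "No grouping" branch above produces, written against plain Hadoop APIs. The class name MapOnlySketch, the identity Mapper, and the /tmp paths are illustrative assumptions, not part of the Crunch source; in the real pipeline, CrunchMapper runs the serialized DoNode trees instead.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class MapOnlySketch {
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = new Job(conf); // same constructor the Crunch code above uses
        job.setJarByClass(MapOnlySketch.class);
        job.setMapperClass(Mapper.class); // identity mapper; Crunch would use CrunchMapper here
        job.setNumReduceTasks(0);         // mirrors the "No grouping" branch above
        FileInputFormat.addInputPath(job, new Path("/tmp/in"));    // illustrative input path
        FileOutputFormat.setOutputPath(job, new Path("/tmp/out")); // illustrative output path
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }
}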
Also used : Path(org.apache.hadoop.fs.Path) CrunchJob(org.apache.crunch.impl.mr.exec.CrunchJob) Target(org.apache.crunch.Target) PCollectionImpl(org.apache.crunch.impl.mr.collect.PCollectionImpl) Job(org.apache.hadoop.mapreduce.Job)

Aggregations

Target (org.apache.crunch.Target) 1
PCollectionImpl (org.apache.crunch.impl.mr.collect.PCollectionImpl) 1
CrunchJob (org.apache.crunch.impl.mr.exec.CrunchJob) 1
Path (org.apache.hadoop.fs.Path) 1
Job (org.apache.hadoop.mapreduce.Job) 1