Use of org.apache.crunch.impl.mr.collect.PCollectionImpl in project crunch by cloudera.
From the class MSCRPlanner, the handleGroupingDependencies method:
private void handleGroupingDependencies(Set<NodePath> gbkPaths, Set<NodePath> currentNodePaths) throws IOException {
  int splitIndex = getSplitIndex(currentNodePaths);
  PCollectionImpl<?> splitTarget = currentNodePaths.iterator().next().get(splitIndex);
  if (!outputs.containsKey(splitTarget)) {
    outputs.put(splitTarget, Sets.<Target>newHashSet());
  }

  // Reuse an existing SourceTarget registered for the split point if there is one;
  // otherwise try to convert an existing Target, or fall back to a new intermediate output.
  SourceTarget srcTarget = null;
  Target targetToReplace = null;
  for (Target t : outputs.get(splitTarget)) {
    if (t instanceof SourceTarget) {
      srcTarget = (SourceTarget<?>) t;
      break;
    } else {
      srcTarget = t.asSourceTarget(splitTarget.getPType());
      if (srcTarget != null) {
        targetToReplace = t;
        break;
      }
    }
  }
  if (targetToReplace != null) {
    outputs.get(splitTarget).remove(targetToReplace);
  } else if (srcTarget == null) {
    srcTarget = pipeline.createIntermediateOutput(splitTarget.getPType());
  }
  outputs.get(splitTarget).add(srcTarget);
  splitTarget.materializeAt(srcTarget);

  // Split the GBK-dependent node paths at the materialized collection so that the
  // downstream portion reads from the intermediate source in a separate job.
  PCollectionImpl<?> inputNode = (PCollectionImpl<?>) pipeline.read(srcTarget);
  Set<NodePath> nextNodePaths = Sets.newHashSet();
  for (NodePath nodePath : currentNodePaths) {
    if (gbkPaths.contains(nodePath)) {
      nextNodePaths.add(nodePath.splitAt(splitIndex, inputNode));
    } else {
      nextNodePaths.add(nodePath);
    }
  }
  currentNodePaths.clear();
  currentNodePaths.addAll(nextNodePaths);
}
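For context, a minimal user-level sketch of the situation this method handles: two chained groupings, where the second group-by depends on the output of the first. The planner splits such paths at the boundary collection and materializes it to an intermediate SourceTarget so that the two shuffles run as separate MapReduce jobs. The class name, argument handling, and word-count logic below are illustrative assumptions, not part of the Crunch sources.

import org.apache.crunch.PCollection;
import org.apache.crunch.PTable;
import org.apache.crunch.Pipeline;
import org.apache.crunch.impl.mr.MRPipeline;
import org.apache.crunch.lib.Aggregate;
import org.apache.hadoop.conf.Configuration;

public class ChainedGroupingExample {
  public static void main(String[] args) {
    Pipeline pipeline = new MRPipeline(ChainedGroupingExample.class, new Configuration());
    PCollection<String> words = pipeline.readTextFile(args[0]); // placeholder input path

    // First GBK: count occurrences of each word.
    PTable<String, Long> wordCounts = Aggregate.count(words);

    // Second GBK, downstream of the first: a histogram of the counts themselves.
    // The planner has to materialize an intermediate output between the two jobs,
    // which is what handleGroupingDependencies arranges.
    PTable<Long, Long> histogram = Aggregate.count(wordCounts.values());

    pipeline.writeTextFile(histogram, args[1]); // placeholder output path
    pipeline.done();
  }
}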
Use of org.apache.crunch.impl.mr.collect.PCollectionImpl in project crunch by cloudera.
From the class JobPrototype, the build method:
private CrunchJob build(Class<?> jarClass, Configuration conf, Pipeline pipeline) throws IOException {
  Job job = new Job(conf);
  conf = job.getConfiguration();
  conf.set(PlanningParameters.CRUNCH_WORKING_DIRECTORY, workingPath.toString());
  job.setJarByClass(jarClass);

  Set<DoNode> outputNodes = Sets.newHashSet();
  Set<Target> targets = targetsToNodePaths.keySet();
  Path outputPath = new Path(workingPath, "output");
  MSCROutputHandler outputHandler = new MSCROutputHandler(job, outputPath, group == null);
  for (Target target : targets) {
    DoNode node = null;
    for (NodePath nodePath : targetsToNodePaths.get(target)) {
      if (node == null) {
        PCollectionImpl<?> collect = nodePath.tail();
        node = DoNode.createOutputNode(target.toString(), collect.getPType());
        outputHandler.configureNode(node, target);
      }
      outputNodes.add(walkPath(nodePath.descendingIterator(), node));
    }
  }

  job.setMapperClass(CrunchMapper.class);
  List<DoNode> inputNodes;
  DoNode reduceNode = null;
  if (group != null) {
    job.setReducerClass(CrunchReducer.class);
    List<DoNode> reduceNodes = Lists.newArrayList(outputNodes);
    serialize(reduceNodes, conf, workingPath, NodeContext.REDUCE);
    reduceNode = reduceNodes.get(0);

    if (combineFnTable != null) {
      job.setCombinerClass(CrunchCombiner.class);
      DoNode combinerInputNode = group.createDoNode();
      DoNode combineNode = combineFnTable.createDoNode();
      combineNode.addChild(group.getGroupingNode());
      combinerInputNode.addChild(combineNode);
      serialize(ImmutableList.of(combinerInputNode), conf, workingPath, NodeContext.COMBINE);
    }

    group.configureShuffle(job);

    DoNode mapOutputNode = group.getGroupingNode();
    Set<DoNode> mapNodes = Sets.newHashSet();
    for (NodePath nodePath : mapNodePaths) {
      // Advance these one step, since we've already configured
      // the grouping node, and the PGroupedTableImpl is the tail
      // of the NodePath.
      Iterator<PCollectionImpl<?>> iter = nodePath.descendingIterator();
      iter.next();
      mapNodes.add(walkPath(iter, mapOutputNode));
    }
    inputNodes = Lists.newArrayList(mapNodes);
  } else {
    // No grouping
    job.setNumReduceTasks(0);
    inputNodes = Lists.newArrayList(outputNodes);
  }

  serialize(inputNodes, conf, workingPath, NodeContext.MAP);

  if (inputNodes.size() == 1) {
    DoNode inputNode = inputNodes.get(0);
    inputNode.getSource().configureSource(job, -1);
  } else {
    for (int i = 0; i < inputNodes.size(); i++) {
      DoNode inputNode = inputNodes.get(i);
      inputNode.getSource().configureSource(job, i);
    }
    job.setInputFormatClass(CrunchInputFormat.class);
  }

  job.setJobName(createJobName(pipeline.getName(), inputNodes, reduceNode));
  return new CrunchJob(job, outputPath, outputHandler);
}
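For readers less familiar with the Hadoop side of this, build() is essentially assembling by hand the same kind of Job that a standalone MapReduce program configures: a mapper, an optional combiner and reducer, and input/output paths. The self-contained word-count sketch below (unrelated to Crunch; the class names and counting logic are assumptions for illustration) shows the plain-Hadoop wiring that CrunchMapper, CrunchCombiner, and CrunchReducer plug into.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class PlainMapReduceSketch {

  public static class TokenMapper extends Mapper<LongWritable, Text, Text, LongWritable> {
    private final LongWritable one = new LongWritable(1);
    private final Text word = new Text();

    @Override
    protected void map(LongWritable key, Text value, Context context)
        throws IOException, InterruptedException {
      for (String token : value.toString().split("\\s+")) {
        if (!token.isEmpty()) {
          word.set(token);
          context.write(word, one);
        }
      }
    }
  }

  public static class SumReducer extends Reducer<Text, LongWritable, Text, LongWritable> {
    @Override
    protected void reduce(Text key, Iterable<LongWritable> values, Context context)
        throws IOException, InterruptedException {
      long sum = 0;
      for (LongWritable value : values) {
        sum += value.get();
      }
      context.write(key, new LongWritable(sum));
    }
  }

  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Job job = new Job(conf); // same constructor JobPrototype.build uses
    job.setJarByClass(PlainMapReduceSketch.class);
    job.setJobName("plain-mapreduce-sketch");
    job.setMapperClass(TokenMapper.class);   // CrunchMapper in build()
    job.setCombinerClass(SumReducer.class);  // CrunchCombiner, only set when a combine fn exists
    job.setReducerClass(SumReducer.class);   // CrunchReducer, only set when a grouping exists
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(LongWritable.class);
    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
  }
}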
Use of org.apache.crunch.impl.mr.collect.PCollectionImpl in project crunch by cloudera.
From the class MSCRPlanner, the plan method:
public MRExecutor plan(Class<?> jarClass, Configuration conf) throws IOException {
  // Constructs all of the node paths, which either start w/an input
  // or a GBK and terminate in an output collection of any type.
  NodeVisitor visitor = new NodeVisitor();
  for (PCollectionImpl<?> output : outputs.keySet()) {
    visitor.visitOutput(output);
  }

  // Pull out the node paths.
  Map<PCollectionImpl<?>, Set<NodePath>> nodePaths = visitor.getNodePaths();

  // Keeps track of the dependencies from collections -> jobs and then
  // between different jobs.
  Map<PCollectionImpl<?>, JobPrototype> assignments = Maps.newHashMap();
  Map<PCollectionImpl<?>, Set<JobPrototype>> jobDependencies =
      new HashMap<PCollectionImpl<?>, Set<JobPrototype>>();

  // Find the set of GBKs that DO NOT depend on any other GBK.
  Set<PGroupedTableImpl<?, ?>> workingGroupings = null;
  while (!(workingGroupings = getWorkingGroupings(nodePaths)).isEmpty()) {

    for (PGroupedTableImpl<?, ?> grouping : workingGroupings) {
      Set<NodePath> mapInputPaths = nodePaths.get(grouping);
      JobPrototype proto = JobPrototype.createMapReduceJob(grouping, mapInputPaths, pipeline.createTempPath());
      assignments.put(grouping, proto);
      if (jobDependencies.containsKey(grouping)) {
        for (JobPrototype dependency : jobDependencies.get(grouping)) {
          proto.addDependency(dependency);
        }
      }
    }

    Map<PGroupedTableImpl<?, ?>, Set<NodePath>> dependencyPaths = getDependencyPaths(workingGroupings, nodePaths);
    for (Map.Entry<PGroupedTableImpl<?, ?>, Set<NodePath>> entry : dependencyPaths.entrySet()) {
      PGroupedTableImpl<?, ?> grouping = entry.getKey();
      Set<NodePath> currentNodePaths = entry.getValue();

      JobPrototype proto = assignments.get(grouping);
      Set<NodePath> gbkPaths = Sets.newHashSet();
      for (NodePath nodePath : currentNodePaths) {
        PCollectionImpl<?> tail = nodePath.tail();
        if (tail instanceof PGroupedTableImpl) {
          gbkPaths.add(nodePath);
          if (!jobDependencies.containsKey(tail)) {
            jobDependencies.put(tail, Sets.<JobPrototype>newHashSet());
          }
          jobDependencies.get(tail).add(proto);
        }
      }
      if (!gbkPaths.isEmpty()) {
        handleGroupingDependencies(gbkPaths, currentNodePaths);
      }

      // At this point, all of the dependencies for the working groups will be
      // file outputs, and so we can add them all to the JobPrototype-- we now have
      // a complete job.
      HashMultimap<Target, NodePath> reduceOutputs = HashMultimap.create();
      for (NodePath nodePath : currentNodePaths) {
        assignments.put(nodePath.tail(), proto);
        for (Target target : outputs.get(nodePath.tail())) {
          reduceOutputs.put(target, nodePath);
        }
      }
      proto.addReducePaths(reduceOutputs);

      // We've processed this GBK-- remove it from the set of nodePaths we
      // need to process in the next step.
      nodePaths.remove(grouping);
    }
  }

  // Process any map-only jobs that are remaining.
  if (!nodePaths.isEmpty()) {
    for (Map.Entry<PCollectionImpl<?>, Set<NodePath>> entry : nodePaths.entrySet()) {
      PCollectionImpl<?> collect = entry.getKey();
      if (!assignments.containsKey(collect)) {
        HashMultimap<Target, NodePath> mapOutputs = HashMultimap.create();
        for (NodePath nodePath : entry.getValue()) {
          for (Target target : outputs.get(nodePath.tail())) {
            mapOutputs.put(target, nodePath);
          }
        }
        JobPrototype proto = JobPrototype.createMapOnlyJob(mapOutputs, pipeline.createTempPath());
        if (jobDependencies.containsKey(collect)) {
          for (JobPrototype dependency : jobDependencies.get(collect)) {
            proto.addDependency(dependency);
          }
        }
        assignments.put(collect, proto);
      }
    }
  }

  MRExecutor exec = new MRExecutor(jarClass);
  for (JobPrototype proto : Sets.newHashSet(assignments.values())) {
    exec.addJob(proto.getCrunchJob(jarClass, conf, pipeline));
  }
  return exec;
}
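User code does not call plan() directly; it runs when the pipeline is executed. Below is a minimal sketch (the class name and argument handling are assumptions) of a program whose execution would exercise both branches of the planner: an output with no group-by on its path, which becomes a map-only job, and a grouped output, which becomes a full MapReduce job.

import org.apache.crunch.PCollection;
import org.apache.crunch.PTable;
import org.apache.crunch.Pipeline;
import org.apache.crunch.impl.mr.MRPipeline;
import org.apache.crunch.lib.Aggregate;
import org.apache.hadoop.conf.Configuration;

public class PlanTriggerExample {
  public static void main(String[] args) {
    Pipeline pipeline = new MRPipeline(PlanTriggerExample.class, new Configuration());
    PCollection<String> lines = pipeline.readTextFile(args[0]);

    // Output with no grouping on its path: handled by the map-only branch of plan().
    pipeline.writeTextFile(lines, args[1]);

    // Output with one GBK: handled by the working-groupings loop of plan().
    PTable<String, Long> counts = Aggregate.count(lines);
    pipeline.writeTextFile(counts, args[2]);

    // Running the pipeline invokes the MSCR planner and executes the resulting jobs.
    pipeline.done();
  }
}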