Use of org.apache.crunch.SourceTarget in project crunch by cloudera.
From the class MSCRPlanner, method handleGroupingDependencies.
/**
 * Rewrites the node paths in {@code currentNodePaths} that also appear in
 * {@code gbkPaths} so that they start from a re-read of the collection found
 * at the split index, instead of from the original upstream nodes.
 *
 * <p>The collection at the split index is given a {@link SourceTarget} to be
 * written to: an existing registered target is reused when one either is a
 * {@code SourceTarget} or can be converted to one; otherwise an intermediate
 * output is created via the pipeline. {@code currentNodePaths} is modified in
 * place.
 *
 * @param gbkPaths
 *          the paths that must be split at the split index
 * @param currentNodePaths
 *          the working set of paths; replaced with the rewritten set
 * @throws IOException
 *           declared by the method; which of the calls below raises it is not
 *           visible from this block
 */
private void handleGroupingDependencies(Set<NodePath> gbkPaths, Set<NodePath> currentNodePaths) throws IOException {
  int splitIndex = getSplitIndex(currentNodePaths);
  // The split collection is read from an arbitrary (first-iterated) path at the split index.
  PCollectionImpl<?> splitPoint = currentNodePaths.iterator().next().get(splitIndex);
  if (!outputs.containsKey(splitPoint)) {
    outputs.put(splitPoint, Sets.<Target>newHashSet());
  }

  // Look for a registered target that can double as a source.
  SourceTarget reusable = null;
  Target converted = null;
  for (Target candidate : outputs.get(splitPoint)) {
    if (candidate instanceof SourceTarget) {
      reusable = (SourceTarget<?>) candidate;
      break;
    }
    reusable = candidate.asSourceTarget(splitPoint.getPType());
    if (reusable != null) {
      // The plain target is superseded by its SourceTarget form.
      converted = candidate;
      break;
    }
  }

  if (converted == null) {
    if (reusable == null) {
      // Nothing suitable was registered: fall back to an intermediate output.
      reusable = pipeline.createIntermediateOutput(splitPoint.getPType());
    }
  } else {
    outputs.get(splitPoint).remove(converted);
  }
  outputs.get(splitPoint).add(reusable);
  splitPoint.materializeAt(reusable);

  PCollectionImpl<?> reread = (PCollectionImpl<?>) pipeline.read(reusable);
  Set<NodePath> rewritten = Sets.newHashSet();
  for (NodePath path : currentNodePaths) {
    rewritten.add(gbkPaths.contains(path) ? path.splitAt(splitIndex, reread) : path);
  }
  currentNodePaths.clear();
  currentNodePaths.addAll(rewritten);
}
Use of org.apache.crunch.SourceTarget in project crunch by cloudera.
From the class MRPipeline, method getMaterializeSourceTarget.
/**
 * Retrieve a ReadableSourceTarget that provides access to the contents of a
 * {@link PCollection}. This is primarily intended as a helper method to
 * {@link #materialize(PCollection)}. The underlying data of the
 * ReadableSourceTarget may not be actually present until the pipeline is run.
 *
 * <p>Resolution order: the collection's existing materialization target, then
 * the first readable target already registered as an output for the
 * collection, then a newly created intermediate output (which is registered
 * as an output of the collection).
 *
 * @param pcollection
 *          The collection for which the ReadableSourceTarget is to be
 *          retrieved
 * @return The ReadableSourceTarget
 * @throws IllegalArgumentException
 *           If no ReadableSourceTarget can be retrieved for the given
 *           PCollection
 */
public <T> ReadableSourceTarget<T> getMaterializeSourceTarget(PCollection<T> pcollection) {
  PCollectionImpl<T> impl = toPcollectionImpl(pcollection);

  // instanceof is false for null, so no separate null check is needed.
  SourceTarget<T> matTarget = impl.getMaterializedAt();
  if (matTarget instanceof ReadableSourceTarget) {
    return (ReadableSourceTarget<T>) matTarget;
  }

  // Use the same key (impl) for both the membership test and the lookup so
  // the two can never disagree and hand the loop a null collection.
  if (outputTargets.containsKey(impl)) {
    for (Target target : outputTargets.get(impl)) {
      if (target instanceof ReadableSourceTarget) {
        return (ReadableSourceTarget<T>) target;
      }
    }
  }

  SourceTarget<T> st = createIntermediateOutput(pcollection.getPType());
  if (!(st instanceof ReadableSourceTarget)) {
    throw new IllegalArgumentException("The PType for the given PCollection is not readable" + " and cannot be materialized");
  }
  ReadableSourceTarget<T> srcTarget = (ReadableSourceTarget<T>) st;
  addOutput(impl, srcTarget);
  return srcTarget;
}
Use of org.apache.crunch.SourceTarget in project crunch by cloudera.
From the class MRPipeline, method run.
/**
 * Plans and executes this pipeline, then records on each output collection
 * where it was materialized and clears the registered output targets.
 *
 * <p>For a collection with a pending {@code MaterializableIterable}, the
 * iterable is materialized and its source target is used. Otherwise the first
 * registered target that is a {@link SourceTarget} is used, if any.
 *
 * @return the result of executing the plan, or {@code PipelineResult.EMPTY}
 *         if planning or execution threw an {@link IOException} (in which
 *         case the registered output targets are left untouched)
 */
@Override
public PipelineResult run() {
  MSCRPlanner planner = new MSCRPlanner(this, outputTargets);
  PipelineResult res;
  try {
    res = planner.plan(jarClass, conf).execute();
  } catch (IOException e) {
    // Pass the exception as the throwable argument so the stack trace is logged.
    LOG.error("Pipeline planning or execution failed", e);
    return PipelineResult.EMPTY;
  }
  for (PCollectionImpl<?> c : outputTargets.keySet()) {
    if (outputTargetsToMaterialize.containsKey(c)) {
      MaterializableIterable iter = outputTargetsToMaterialize.get(c);
      iter.materialize();
      c.materializeAt(iter.getSourceTarget());
      outputTargetsToMaterialize.remove(c);
    } else {
      for (Target t : outputTargets.get(c)) {
        // Test for the type that is actually cast to: a Target that is a
        // Source but not a SourceTarget would otherwise fail the cast.
        if (t instanceof SourceTarget) {
          c.materializeAt((SourceTarget) t);
          break; // only the first matching target is used
        }
      }
    }
  }
  outputTargets.clear();
  return res;
}
Aggregations