Use of com.tdunning.plume.PCollection in project Plume by tdunning.
The class MapRedExecutor, method getMapRed:
/**
 * This method returns a Job instance out of a {@link MSCR} entity. It puts the class of
 * the {@link PlumeWorkflow} argument and the MSCR id in the Hadoop configuration.
 *
 * @param mscr The MSCR to convert
 * @param workFlow The workflow whose class will be instantiated by Hadoop mappers/reducers
 * @param workFlowOutputPath The temporary output path of the workflow
 * @param outputPath The output path of the MapRed job
 * @return A Hadoop-executable MapRed Job
 *
 * @throws IOException
 */
static Job getMapRed(final MSCR mscr, PlumeWorkflow workFlow, String workFlowOutputPath, String outputPath) throws IOException {
  Configuration conf = new Configuration();
  conf.set(WORKFLOW_NAME, workFlow.getClass().getName());
  conf.setInt(MSCR_ID, mscr.getId());
  conf.set(TEMP_OUTPUT_PATH, workFlowOutputPath);
  // TODO deprecation
  Job job = new Job(conf, "MSCR");
  job.setMapOutputKeyClass(PlumeObject.class);
  job.setMapOutputValueClass(PlumeObject.class);
  job.setJarByClass(MapRedExecutor.class);
  /**
   * Define multiple inputs
   */
  for (PCollection<?> input : mscr.getInputs()) {
    if (!(input instanceof LazyCollection)) {
      throw new IllegalArgumentException("Can't create MapRed from MSCR whose inputs are not LazyCollection");
    }
    LazyCollection<Text> l = (LazyCollection<Text>) input;
    if (!(l.isMaterialized() && l.getFile() != null)) {
      // Collections have a plume ID only if they are intermediate results - TODO better naming for this
      if (l.getPlumeId().length() < 1) {
        throw new IllegalArgumentException("Can't create MapRed from MSCR inputs that are not materialized to a file");
      }
    }
    PCollectionType<?> rType = l.getType();
    Class<? extends InputFormat> format = SequenceFileInputFormat.class;
    if (rType instanceof PTableType) {
      PTableType<?, ?> tType = (PTableType<?, ?>) rType;
      if (tType.valueType() instanceof StringType && tType.keyType() instanceof StringType) {
        format = KeyValueTextInputFormat.class;
      }
      MultipleInputs.addInputPath(job, new Path(l.getFile()), format, MSCRMapper.class);
    } else {
      if (rType.elementType() instanceof StringType) {
        format = TextInputFormat.class;
      }
      MultipleInputs.addInputPath(job, new Path(l.getFile()), format, MSCRMapper.class);
    }
  }
  /**
   * Define multiple outputs
   */
  FileOutputFormat.setOutputPath(job, new Path(outputPath));
  for (Map.Entry<PCollection<?>, Integer> entry : mscr.getNumberedChannels().entrySet()) {
    PCollectionType<?> rType = ((LazyCollection<?>) mscr.getOutputChannels().get(entry.getKey()).output).getType();
    if (rType instanceof PTableType) {
      PTableType<?, ?> tType = (PTableType<?, ?>) rType;
      Class<? extends OutputFormat> outputFormat = SequenceFileOutputFormat.class;
      if (tType.keyType() instanceof StringType && tType.valueType() instanceof StringType) {
        outputFormat = TextOutputFormat.class;
      }
      MultipleOutputs.addNamedOutput(job, entry.getValue() + "", outputFormat, getHadoopType(tType.keyType()), getHadoopType(tType.valueType()));
    } else {
      Class<? extends OutputFormat> outputFormat = SequenceFileOutputFormat.class;
      if (rType.elementType() instanceof StringType) {
        outputFormat = TextOutputFormat.class;
      }
      MultipleOutputs.addNamedOutput(job, entry.getValue() + "", outputFormat, NullWritable.class, getHadoopType(rType.elementType()));
    }
  }
  /**
   * Define Reducer & Combiner
   */
  job.setCombinerClass(MSCRCombiner.class);
  job.setReducerClass(MSCRReducer.class);
  job.setNumReduceTasks(1);
  return job;
}
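
For context, a minimal sketch of how the returned Job could be submitted (assuming same-package access, since getMapRed is package-private; MyWorkflow, buildOptimizedMscr, and both path strings are hypothetical stand-ins, not part of the listing above):

import org.apache.hadoop.mapreduce.Job;

public class SubmitSketch {
  public static void main(String[] args) throws Exception {
    PlumeWorkflow workFlow = new MyWorkflow(); // hypothetical PlumeWorkflow subclass
    MSCR mscr = buildOptimizedMscr(workFlow); // hypothetical helper, see below
    Job job = MapRedExecutor.getMapRed(mscr, workFlow, "/tmp/plume-tmp", "/tmp/plume-out");
    // waitForCompletion(true) is the standard Hadoop submission call
    System.exit(job.waitForCompletion(true) ? 0 : 1);
  }

  // Placeholder: in a real run the MSCR would come out of the Optimizer
  static MSCR buildOptimizedMscr(PlumeWorkflow workFlow) {
    throw new UnsupportedOperationException("placeholder for illustration");
  }
}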
Use of com.tdunning.plume.PCollection in project Plume by tdunning.
The class Optimizer, method removeUnnecessaryOps:
/**
 * Removes unnecessary operations that are not removed by the Optimizer. It goes top-down (receives an input).
 * Returns true if the passed node doesn't lead to an output.
 */
boolean removeUnnecessaryOps(PCollection arg, List<PCollection> outputs) {
  LazyCollection<?> input = (LazyCollection) arg;
  if (input.getDownOps() == null || input.getDownOps().size() == 0) {
    // Leaf node
    return !outputs.contains(input);
  }
  // create a new list of deferred ops that are 'useful'
  List<DeferredOp> finalDOps = new ArrayList<DeferredOp>();
  for (DeferredOp op : input.getDownOps()) {
    boolean remove = false;
    if (op instanceof OneToOneOp) {
      remove = removeUnnecessaryOps(((OneToOneOp<?, ?>) op).getDest(), outputs);
    } else if (op instanceof ParallelDo) {
      remove = removeUnnecessaryOps(((ParallelDo<?, ?>) op).getDest(), outputs);
    } else if (op instanceof Flatten) {
      remove = removeUnnecessaryOps(((Flatten<?>) op).getDest(), outputs);
    } else if (op instanceof MultipleParallelDo) {
      MultipleParallelDo<?> mPDo = (MultipleParallelDo<?>) op;
      // start with true because we AND the result across all children
      remove = true;
      for (Object entry : mPDo.getDests().entrySet()) {
        PCollection<?> pCol = (PCollection<?>) ((Map.Entry) entry).getKey();
        remove = remove & removeUnnecessaryOps(pCol, outputs);
      }
    }
    if (!remove) {
      finalDOps.add(op);
    }
  }
  input.downOps = finalDOps;
  // if true, this node can also be removed
  return finalDOps.size() == 0;
}
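
The invariant here - a node is removable only when every path below it fails to reach an output - is easy to see in isolation. A self-contained sketch of the same post-order AND-gate pruning over a plain tree (illustrative names, not Plume classes):

import java.util.*;

class PruneSketch {
  static class Node {
    List<Node> children = new ArrayList<Node>();
  }

  // Returns true if 'node' leads to no output and can therefore be pruned,
  // mirroring removeUnnecessaryOps above.
  static boolean prune(Node node, Set<Node> outputs) {
    if (node.children.isEmpty()) {
      return !outputs.contains(node); // leaf: removable unless it is an output
    }
    List<Node> kept = new ArrayList<Node>();
    for (Node child : node.children) {
      // Visit every child (no short-circuit), keeping those that reach an output
      if (!prune(child, outputs)) {
        kept.add(child);
      }
    }
    node.children = kept;
    return kept.isEmpty(); // removable only if ALL children were removable
  }
}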
Use of com.tdunning.plume.PCollection in project Plume by tdunning.
The class Optimizer, method sinkFlattens:
/**
 * Sinks Flatten operations, pushing them down the tree to create opportunities for ParallelDo fusion.
 * @param arg The collection that may contain flatten operations that we need to sink.
 */
@SuppressWarnings({ "unchecked", "rawtypes" })
<T> void sinkFlattens(PCollection<T> arg) {
  LazyCollection<T> output = (LazyCollection<T>) arg;
  if (output.isMaterialized()) {
    // stop condition for the recursive algorithm
    return;
  }
  DeferredOp dOp = output.getDeferredOp();
  if (!(dOp instanceof Flatten)) {
    if (dOp instanceof OneToOneOp) {
      // Recursively apply this function to the parent
      sinkFlattens(((OneToOneOp) dOp).getOrigin());
    } else if (dOp instanceof ParallelDo) {
      // Recursively apply this function to the parent
      sinkFlattens(((ParallelDo) dOp).getOrigin());
    }
    // Nothing to sink here; returning also guards the Flatten casts below
    return;
  }
  if (output.getDownOps() == null || output.getDownOps().size() != 1) {
    // Recursively apply this function to the parents
    for (Object col : ((Flatten) dOp).getOrigins()) {
      sinkFlattens((PCollection) col);
    }
    return;
  }
  DeferredOp downOp = output.getDownOps().get(0);
  if (!(downOp instanceof ParallelDo)) {
    return;
  }
  // PDo below the current node
  ParallelDo<T, ?> op = (ParallelDo<T, ?>) downOp;
  // Flatten above the current node
  Flatten<T> flatten = (Flatten<T>) dOp;
  List<PCollection<?>> newOrigins = new ArrayList<PCollection<?>>();
  // Iterate over all of the Flatten's inputs
  for (PCollection<T> col : flatten.getOrigins()) {
    // Recursively apply this function to this flatten's origin
    LazyCollection<T> fInput = (LazyCollection<T>) col;
    sinkFlattens(fInput);
    // Sink
    LazyCollection<?> newInput = new LazyCollection();
    newInput.deferredOp = new ParallelDo(op.getFunction(), fInput, newInput);
    newInput.type = ((LazyCollection) flatten.getDest()).getType();
    fInput.downOps.remove(0);
    fInput.addDownOp(newInput.deferredOp);
    newOrigins.add(newInput);
  }
  Flatten<?> newFlatten = new Flatten(newOrigins, op.getDest());
  ((LazyCollection<?>) op.getDest()).deferredOp = newFlatten;
  for (PCollection<?> newOp : newOrigins) {
    ((LazyCollection<?>) newOp).addDownOp(newFlatten);
  }
}
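
The rewrite performed here is semantics-preserving: Flatten(a, b) followed by ParallelDo(f) yields the same elements as ParallelDo(f) applied to each input followed by a Flatten of the results. A self-contained sketch of that equivalence over plain lists (illustrative names, not Plume classes):

import java.util.*;
import java.util.function.Function;

class SinkSketch {
  // Shape before sinking: Flatten first, then the function
  static <T, U> List<U> flattenThenMap(List<List<T>> inputs, Function<T, U> f) {
    List<T> flat = new ArrayList<T>();
    for (List<T> in : inputs) flat.addAll(in); // Flatten
    List<U> out = new ArrayList<U>();
    for (T t : flat) out.add(f.apply(t)); // ParallelDo(f)
    return out;
  }

  // Shape after sinking: the function per input, then Flatten
  static <T, U> List<U> mapThenFlatten(List<List<T>> inputs, Function<T, U> f) {
    List<U> out = new ArrayList<U>();
    for (List<T> in : inputs) {
      for (T t : in) out.add(f.apply(t)); // ParallelDo(f) sunk into each branch
    }
    return out; // the concatenation plays the role of the final Flatten
  }
}

Both shapes produce the same list, which is what lets the optimizer sink the Flatten below the ParallelDo and open up fusion opportunities.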
Use of com.tdunning.plume.PCollection in project Plume by tdunning.
The class Optimizer, method optimize:
/**
* Optimizes an execution tree
*
* @param inputs A list of the inputs.
* @param outputs A list of the outputs.
* @return An optimized dataflow that consists of MSCR operations decorated with functional
* compositions.
*/
@SuppressWarnings({ "unchecked", "rawtypes" })
public ExecutionStep optimize(List<PCollection> inputs, List<PCollection> outputs) {
  if (outputs == null || outputs.size() == 0) {
    throw new IllegalArgumentException("Empty output list");
  }
  if (inputs == null || inputs.size() == 0) {
    throw new IllegalArgumentException("Empty input list");
  }
  for (PCollection output : outputs) {
    sinkFlattens(output);
  }
  for (PCollection output : outputs) {
    fuseParallelDos(output);
  }
  for (PCollection output : outputs) {
    fuseSiblingParallelDos(output);
  }
  // Clean the optimized tree
  for (PCollection input : inputs) {
    removeUnnecessaryOps(input, outputs);
  }
  Set<MSCR> mscrs = OptimizerTools.getMSCRBlocks(outputs);
  // Build a map of output -> MSCR step
  Map<PCollection<?>, MSCR> outputMap = new HashMap<PCollection<?>, MSCR>();
  for (MSCR mscr : mscrs) {
    for (Map.Entry<PCollection<?>, MSCR.OutputChannel<?, ?, ?>> entry : mscr.getOutputChannels().entrySet()) {
      MSCR.OutputChannel<?, ?, ?> oC = entry.getValue();
      outputMap.put(oC.output, mscr);
    }
  }
  // Calculate dependencies between MSCRs
  Map<MSCR, Set<MSCR>> dependencyMap = new HashMap<MSCR, Set<MSCR>>();
  Set<MSCR> beginningMscrs = new HashSet<MSCR>();
  for (MSCR mscr : mscrs) {
    for (PCollection<?> input : mscr.getInputs()) {
      if (inputs.contains(input)) {
        beginningMscrs.add(mscr);
      }
      MSCR dependsOn = outputMap.get(input);
      if (dependsOn == null) {
        continue;
      }
      Set<MSCR> dependencies = dependencyMap.get(mscr);
      if (dependencies == null) {
        dependencies = new HashSet<MSCR>();
      }
      dependencies.add(dependsOn);
      dependencyMap.put(mscr, dependencies);
    }
  }
  ExecutionStep firstStep = new ExecutionStep();
  for (MSCR step : beginningMscrs) {
    if (dependencyMap.get(step) == null) {
      firstStep.mscrSteps.add(step);
    }
  }
  // Calculate the execution plan
  Set<MSCR> solvedSteps = new HashSet<MSCR>();
  solvedSteps.addAll(firstStep.mscrSteps);
  ExecutionStep previousStep = firstStep;
  while (!solvedSteps.containsAll(mscrs)) {
    ExecutionStep nextStep = new ExecutionStep();
    for (MSCR mscr : mscrs) {
      if (solvedSteps.contains(mscr)) {
        continue;
      }
      Set<MSCR> dependencies = dependencyMap.get(mscr);
      if (solvedSteps.containsAll(dependencies)) {
        nextStep.mscrSteps.add(mscr);
        solvedSteps.add(mscr);
      }
    }
    previousStep.nextStep = nextStep;
    previousStep = nextStep; // advance the chain so later waves don't overwrite this link
  }
  return firstStep;
}
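
The plan-building loop at the end is a level-by-level topological sort: each ExecutionStep holds every MSCR whose dependencies were solved by earlier steps. The same scheduling idea as a standalone sketch, with plain strings in place of MSCR objects (illustrative names):

import java.util.*;

class LevelScheduleSketch {
  // Groups tasks into waves; each wave depends only on earlier waves.
  static List<Set<String>> schedule(Set<String> tasks, Map<String, Set<String>> deps) {
    List<Set<String>> levels = new ArrayList<Set<String>>();
    Set<String> solved = new HashSet<String>();
    while (!solved.containsAll(tasks)) {
      Set<String> level = new HashSet<String>();
      for (String t : tasks) {
        if (solved.contains(t)) continue;
        Set<String> d = deps.containsKey(t) ? deps.get(t) : Collections.<String>emptySet();
        if (solved.containsAll(d)) level.add(t);
      }
      if (level.isEmpty()) {
        throw new IllegalStateException("cyclic or unsatisfiable dependencies");
      }
      solved.addAll(level); // commit the wave before scanning again
      levels.add(level);
    }
    return levels;
  }
}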
Use of com.tdunning.plume.PCollection in project Plume by tdunning.
The class OptimizerTools, method getMSCRBlocks:
/**
 * This utility returns all the different MSCR blocks that can be created from this plan.
 *
 * (pere) As of Oct/2010, I think this code can be simplified to be more like addRemainingTrivialMSCRs(), so a possible TODO would be
 * to refactor it and make it more understandable. An open question is whether there is an easy way of finding all possible
 * MSCRs (including trivial ones, not related to GroupByKey operations) in a single and elegant loop.
 */
@SuppressWarnings({ "rawtypes", "unchecked" })
static Set<MSCR> getMSCRBlocks(List<PCollection> outputs) {
  // Get all GroupByKeys from the tree
  List<DeferredOp> groupBys = OptimizerTools.getAll(outputs, GroupByKey.class);
  int mscrId = 1;
  Set<MSCR> mscrs = new HashSet<MSCR>();
  // For all found GroupByKey blocks
  for (DeferredOp gBK : groupBys) {
    GroupByKey groupBy = (GroupByKey<?, ?>) gBK;
    // Gather all the information needed for the MSCR from this GBK
    Set<PCollection<?>> inputs = new HashSet<PCollection<?>>();
    Set<GroupByKey<?, ?>> outputChannels = new HashSet<GroupByKey<?, ?>>();
    Set<Flatten<?>> unGroupedOutputChannels = new HashSet<Flatten<?>>();
    Set<PCollection<?>> bypassChannels = new HashSet<PCollection<?>>();
    Stack<LazyCollection<?>> toVisit = new Stack<LazyCollection<?>>();
    Set<LazyCollection<?>> visited = new HashSet<LazyCollection<?>>();
    LazyCollection<?> origin = (LazyCollection<?>) groupBy.getOrigin();
    toVisit.push(origin);
    outputChannels.add(groupBy);
    while (!toVisit.isEmpty()) {
      LazyCollection<?> current = toVisit.pop();
      visited.add(current);
      if (current.isMaterialized()) {
        // first condition for being a materialized input. This may change.
        inputs.add(current);
        continue;
      }
      DeferredOp op = current.getDeferredOp();
      if (op instanceof MultipleParallelDo) {
        // second condition for being an input
        MultipleParallelDo<?> mPDo = (MultipleParallelDo) current.getDeferredOp();
        if (((LazyCollection<?>) mPDo.getOrigin()).isMaterialized()) {
          // will be done in the Mapper
          inputs.add(mPDo.getOrigin());
        } else if (op instanceof ParallelDo) {
          // will be done in the Reducer
          inputs.add(current);
        } else {
          // will be done in the Mapper
          inputs.add(mPDo.getOrigin());
        }
        // Check for bypass channels & output channels with no group-by
        for (Map.Entry entry : mPDo.getDests().entrySet()) {
          LazyCollection coll = (LazyCollection) entry.getKey();
          if (coll.getDownOps() == null || coll.getDownOps().size() == 0) {
            // leaf node
            bypassChannels.add(coll);
          } else if (coll.getDownOps().get(0) instanceof MultipleParallelDo) {
            bypassChannels.add(coll);
          } else if (coll.getDownOps().get(0) instanceof Flatten) {
            /*
             * Case of an output channel that Flattens with no GroupBy
             */
            Flatten<?> thisFlatten = (Flatten<?>) coll.getDownOps().get(0);
            LazyCollection ldest = (LazyCollection) thisFlatten.getDest();
            if (ldest.getDownOps() == null || ldest.getDownOps().size() == 0 || ldest.getDownOps().get(0) instanceof MultipleParallelDo) {
              unGroupedOutputChannels.add(thisFlatten);
              // Add the rest of this flatten's origins to the stack in order to possibly discover more output channels
              for (PCollection<?> col : thisFlatten.getOrigins()) {
                if (!visited.contains(col)) {
                  toVisit.push((LazyCollection<?>) col);
                }
              }
            }
          }
        }
        continue;
      }
      if (op instanceof GroupByKey) {
        // third condition for being an input - the rare case when one GBK follows another
        inputs.add(current);
        continue;
      }
      if (op instanceof Flatten) {
        Flatten<?> flatten = (Flatten<?>) op;
        for (PCollection<?> input : flatten.getOrigins()) {
          LazyCollection<?> in = (LazyCollection<?>) input;
          if (!visited.contains(in)) {
            toVisit.push(in);
          }
        }
        continue;
      }
      if (op instanceof OneToOneOp) {
        LazyCollection<?> input = (LazyCollection<?>) ((OneToOneOp<?, ?>) op).getOrigin();
        if (!visited.contains(input)) {
          toVisit.push(input);
        }
        continue;
      }
    }
    MSCR mscrToAdd = null;
    // Check if there is already an MSCR with at least one of these inputs
    for (MSCR mscr : mscrs) {
      for (PCollection<?> input : inputs) {
        if (mscr.hasInput(input)) {
          mscrToAdd = mscr;
          break;
        }
      }
    }
    if (mscrToAdd == null) {
      // otherwise create a new MSCR
      mscrToAdd = new MSCR(mscrId);
      mscrId++;
    }
    // Add all missing input channels to the current MSCR
    for (PCollection<?> input : inputs) {
      if (!mscrToAdd.hasInput(input)) {
        mscrToAdd.addInput(input);
      }
    }
    // Add all missing bypass outputs to the current MSCR
    for (PCollection<?> col : bypassChannels) {
      if (!mscrToAdd.hasOutputChannel(col)) {
        // Create a new bypass channel
        MSCR.OutputChannel oC = new MSCR.OutputChannel(col);
        mscrToAdd.addOutputChannel(oC);
      }
    }
    // Add all missing flatten-with-no-groupby outputs to the current MSCR
    for (Flatten flatten : unGroupedOutputChannels) {
      if (!mscrToAdd.hasOutputChannel(flatten.getDest())) {
        // Create a new channel with the flatten and nothing else
        MSCR.OutputChannel oC = new MSCR.OutputChannel(flatten.getDest());
        oC.output = flatten.getDest();
        oC.flatten = flatten;
        mscrToAdd.addOutputChannel(oC);
      }
    }
    // Add all missing output channels to the current MSCR
    for (GroupByKey groupByKey : outputChannels) {
      if (!mscrToAdd.hasOutputChannel(groupByKey.getOrigin())) {
        // Create a new channel with the group by key. It might have a combiner and a reducer as well.
        MSCR.OutputChannel oC = new MSCR.OutputChannel(groupByKey);
        oC.output = groupByKey.getDest();
        if (groupByKey.getOrigin().getDeferredOp() instanceof Flatten) {
          oC.flatten = (Flatten) groupByKey.getOrigin().getDeferredOp();
        }
        if (groupByKey.getDest().getDownOps() != null && groupByKey.getDest().getDownOps().size() == 1) {
          DeferredOp op = (DeferredOp) groupByKey.getDest().getDownOps().get(0);
          if (op instanceof CombineValues) {
            oC.combiner = (CombineValues) op;
            oC.output = oC.combiner.getDest();
            LazyCollection dest = (LazyCollection) oC.combiner.getDest();
            if (dest.getDownOps() != null && dest.getDownOps().size() == 1) {
              op = (DeferredOp) dest.getDownOps().get(0);
            }
          }
          if (op instanceof ParallelDo) {
            oC.reducer = (ParallelDo) op;
            oC.output = oC.reducer.getDest();
          }
        }
        mscrToAdd.addOutputChannel(oC);
      }
    }
    // Add if needed
    mscrs.add(mscrToAdd);
  }
  return addRemainingTrivialMSCRs(outputs, mscrId, mscrs);
}
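
Downstream, each MSCR returned here becomes one Hadoop job via MapRedExecutor.getMapRed above. A small sketch of inspecting the result, using only accessors that appear in these listings (visibility permitting; the printing is illustrative):

static void printMscrSummary(List<PCollection> outputs) {
  Set<MSCR> mscrs = OptimizerTools.getMSCRBlocks(outputs);
  for (MSCR mscr : mscrs) {
    System.out.println("MSCR " + mscr.getId()
        + ": " + mscr.getInputs().size() + " input(s), "
        + mscr.getOutputChannels().size() + " output channel(s)");
  }
}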