Use of com.tdunning.plume.PCollection in project Plume by tdunning.
The class OptimizerTools, method addRemainingTrivialMSCRs.
/**
 * This utility returns all the MSCRs that are not related to a GroupByKey -
 * the remaining trivial cases as described in the FlumeJava paper.
 *
 * These cases will be either:
 * - Flattens that are followed by either a) a MultipleParallelDo or b) nothing
 *   (these can have correlated inputs and be parallelized just like the ones with a GroupByKey)
 * - The trivial Input -> ParallelDo|MultipleParallelDo -> Output case
 *
 * @param outputs the output PCollections of the execution plan
 * @param currentMscrId the next id to assign to a newly created MSCR
 * @param currentMSCRs the MSCRs already derived from GroupByKey operations
 * @return the given set of MSCRs, extended with the remaining trivial ones
 */
@SuppressWarnings({ "unchecked", "rawtypes" })
static Set<MSCR> addRemainingTrivialMSCRs(List<PCollection> outputs, int currentMscrId, Set<MSCR> currentMSCRs) {
// Get all Flatten from the tree
List<DeferredOp> flattens = OptimizerTools.getAll(outputs, Flatten.class);
List<MSCR> trivialMSCRS = new LinkedList<MSCR>();
Iterator<DeferredOp> it = flattens.iterator();
mainLoop: while (it.hasNext()) {
Flatten<?> flatten = (Flatten<?>) it.next();
// Process only remaining flattens that are not in any other MSCR
for (MSCR mscr : currentMSCRs) {
for (Map.Entry<PCollection<?>, MSCR.OutputChannel<?, ?, ?>> entry : mscr.getOutputChannels().entrySet()) {
if (entry.getValue().flatten != null && entry.getValue().flatten == flatten) {
// skip this flatten
continue mainLoop;
}
}
}
// Create new trivial MSCR
MSCR mscr = new MSCR(currentMscrId);
currentMscrId++;
// add single output channel
MSCR.OutputChannel oC = new MSCR.OutputChannel(flatten.getDest());
oC.output = flatten.getDest();
oC.flatten = flatten;
mscr.addOutputChannel(oC);
// add inputs
for (PCollection coll : flatten.getOrigins()) {
LazyCollection lCol = (LazyCollection) coll;
if (lCol.isMaterialized()) {
mscr.addInput(coll);
} else if (lCol.deferredOp instanceof ParallelDo) {
ParallelDo pDo = (ParallelDo) lCol.deferredOp;
if (((LazyCollection) pDo.getOrigin()).isMaterialized()) {
mscr.addInput(pDo.getOrigin());
} else if (pDo instanceof MultipleParallelDo) {
mscr.addInput(pDo.getOrigin());
} else {
mscr.addInput(coll);
}
} else {
mscr.addInput(coll);
}
}
Iterator<MSCR> tIt = trivialMSCRS.iterator();
// Now we'll see if this trivial MSCR can be fused to another previous trivial MSCR
boolean canBeFused = false;
while (tIt.hasNext() && !canBeFused) {
MSCR trivialMSCR = tIt.next();
for (PCollection input : trivialMSCR.getInputs()) {
if (mscr.getInputs().contains(input)) {
canBeFused = true;
break;
}
}
if (canBeFused) {
// add current output channel
trivialMSCR.addOutputChannel(oC);
for (PCollection input : mscr.getInputs()) {
if (!trivialMSCR.getInputs().contains(input)) {
// add each input that is not already contained
trivialMSCR.addInput(input);
}
}
}
}
// We have a new trivial MSCR only if it could not be fused with previous ones
if (!canBeFused) {
trivialMSCRS.add(mscr);
}
}
currentMSCRs.addAll(trivialMSCRS);
return currentMSCRs;
}
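A minimal sketch of how this utility could be driven, built only from API that appears elsewhere on this page (tableOf/integers are the same static imports used in the tests below; calling the package-private method directly is illustrative, not how user code would normally reach it):

// Hedged usage sketch, assuming fromJava collections count as materialized inputs.
LazyPlume plume = new LazyPlume();
PCollection in1 = plume.fromJava(Lists.newArrayList(Pair.create(1, 1)));
PCollection in2 = plume.fromJava(Lists.newArrayList(Pair.create(2, 2)));
// A Flatten followed by nothing: one of the "trivial" shapes this utility picks up.
PCollection flat = plume.flatten(tableOf(integers(), integers()), in1, in2);
Set<MSCR> mscrs = OptimizerTools.addRemainingTrivialMSCRs(
    Lists.<PCollection>newArrayList(flat), 1, new HashSet<MSCR>());
// Expected: one MSCR with in1 and in2 as inputs and a single output channel for the Flatten.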
Use of com.tdunning.plume.PCollection in project Plume by tdunning.
The class BasicOptimizerTest, method testParallelDoSiblingFusion.
@SuppressWarnings("unchecked")
@Test
public void testParallelDoSiblingFusion() {
// Get Plume runtime
LazyPlume plume = new LazyPlume();
// Create simple data
PCollection<Integer> input = plume.fromJava(Lists.newArrayList(1, 2, 3));
PCollection<Integer> output1 = input.map(plusOne, null);
PCollection<Integer> output2 = input.map(timesTwo, null);
LazyCollection<Integer> lInput = (LazyCollection<Integer>) input;
LazyCollection<Integer> lOutput1 = (LazyCollection<Integer>) output1;
LazyCollection<Integer> lOutput2 = (LazyCollection<Integer>) output2;
assertEquals(lInput.downOps.size(), 2);
// Execute and assert the result before optimizing
executeAndAssert(lOutput1, new Integer[] { 2, 3, 4 });
executeAndAssert(lOutput2, new Integer[] { 2, 4, 6 });
// Get an Optimizer
Optimizer optimizer = new Optimizer();
// one output is enough to fuse both because they share the parent
optimizer.fuseSiblingParallelDos(output1);
// Check that input child ops has shrinked to 1
assertEquals(lInput.downOps.size(), 1);
DeferredOp op = lInput.downOps.get(0);
// Check that there is only one op pointing to both outputs
assertEquals(op, lOutput1.deferredOp);
assertEquals(op, lOutput2.deferredOp);
assertTrue(op instanceof MultipleParallelDo);
MultipleParallelDo<Integer> mPDo = (MultipleParallelDo<Integer>) op;
Map<PCollection<?>, DoFn<Integer, ?>> mapOfPDos = mPDo.getDests();
// Check that the map of functions in MultipleParallelDo is correct
assertEquals(mapOfPDos.get(output1), plusOne);
assertEquals(mapOfPDos.get(output2), timesTwo);
// Execute and assert the result afer optimizing
executeAndAssert(lOutput1, new Integer[] { 2, 3, 4 });
executeAndAssert(lOutput2, new Integer[] { 2, 4, 6 });
}
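The plusOne and timesTwo functions referenced above are DoFns defined elsewhere in BasicOptimizerTest; a minimal sketch of what such a function looks like, assuming the DoFn/EmitFn signatures that MSCRMapper relies on below:

// Hedged sketch, not the test's actual definition: add one to each element and emit it.
DoFn<Integer, Integer> plusOne = new DoFn<Integer, Integer>() {
  @Override
  public void process(Integer v, EmitFn<Integer> emitter) {
    emitter.emit(v + 1);
  }
};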
Use of com.tdunning.plume.PCollection in project Plume by tdunning.
The class MSCR, method toString.
@Override
public String toString() {
  String str = "MSCR Id " + id + " #inputs " + inputs.size() + " #outputs " + outputChannels.size() + "\n" + " -Inputs- ";
  for (PCollection input : inputs) {
    str += input + " with down ops " + ((LazyCollection) input).downOps + " | ";
  }
  str += "\n -Outputs- " + outputChannels;
  return str;
}
Use of com.tdunning.plume.PCollection in project Plume by tdunning.
The class MSCRMapper, method map.
@SuppressWarnings("unchecked")
protected void map(WritableComparable key, WritableComparable value, final Mapper<WritableComparable, WritableComparable, PlumeObject, PlumeObject>.Context<WritableComparable, WritableComparable, PlumeObject, PlumeObject> context) throws IOException, InterruptedException {
LazyCollection<?> l = null;
FileSplit fS = FileInputSplitWrapper.getFileInputSplit(context);
// Get LazyCollection for this input (according to FileSplit)
for (PCollection<?> input : mscr.getInputs()) {
LazyCollection<?> thisL = (LazyCollection<?>) input;
if (thisL.getFile() == null) {
// Convention for intermediate results
thisL.setFile(tmpFolder + "/" + thisL.getPlumeId());
}
if (fS.getPath().toString().startsWith(thisL.getFile()) || fS.getPath().toString().startsWith("file:" + thisL.getFile())) {
l = thisL;
break;
}
}
if (l == null) {
throw new RuntimeException("Unable to match input split with any MSCR input");
}
// If this collection is a table -> process Pair, otherwise process value
PCollectionType type = l.getType();
Object toProcess = value;
if (type instanceof PTableType) {
toProcess = Pair.create(key, value);
}
for (DeferredOp op : l.getDownOps()) {
if (op instanceof MultipleParallelDo) {
MultipleParallelDo mPDo = ((MultipleParallelDo) op);
for (Object entry : mPDo.getDests().entrySet()) {
Map.Entry<PCollection, DoFn> en = (Map.Entry<PCollection, DoFn>) entry;
LazyCollection<?> lCol = (LazyCollection<?>) en.getKey();
DeferredOp childOp = null;
if (lCol.getDownOps() != null && lCol.getDownOps().size() > 0) {
childOp = lCol.getDownOps().get(0);
}
final Integer channel;
if (childOp != null && childOp instanceof Flatten) {
channel = mscr.getNumberedChannels().get(((Flatten) childOp).getDest());
} else if (childOp != null && childOp instanceof GroupByKey) {
channel = mscr.getNumberedChannels().get(((GroupByKey) childOp).getOrigin());
} else {
// bypass channel?
channel = mscr.getNumberedChannels().get(en.getKey());
}
if (channel == null) {
// This is not for this MSCR - just skip it
return;
}
// Call parallelDo function
en.getValue().process(toProcess, new EmitFn() {
@Override
public void emit(Object v) {
try {
if (v instanceof Pair) {
Pair p = (Pair) v;
context.write(new PlumeObject((WritableComparable) p.getKey(), channel), new PlumeObject((WritableComparable) p.getValue(), channel));
} else {
context.write(new PlumeObject((WritableComparable) v, channel), new PlumeObject((WritableComparable) v, channel));
}
} catch (Exception e) {
// TODO How to report this
e.printStackTrace();
}
}
});
}
} else {
if (op instanceof Flatten) {
l = (LazyCollection) ((Flatten) op).getDest();
}
int channel = mscr.getNumberedChannels().get(l);
if (toProcess instanceof Pair) {
context.write(new PlumeObject(key, channel), new PlumeObject(value, channel));
} else {
context.write(new PlumeObject(value, channel), new PlumeObject(value, channel));
}
}
}
}
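When the input collection is a PTable, the installed DoFns receive and emit Pairs, which the v instanceof Pair branch above splits into key and value before tagging both with the output channel. A sketch of such a function, under the same assumed DoFn/EmitFn shape (identifiers are illustrative):

// Hedged sketch: an identity DoFn over (key, value) pairs, the kind of function
// that would feed a GroupByKey or Flatten channel in the mapper above.
DoFn<Pair, Pair> identityPairs = new DoFn<Pair, Pair>() {
  @Override
  public void process(Pair v, EmitFn<Pair> emitter) {
    emitter.emit(Pair.create(v.getKey(), v.getValue()));
  }
};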
Use of com.tdunning.plume.PCollection in project Plume by tdunning.
The class TestOptimizer, method testFigure5.
@SuppressWarnings({ "rawtypes", "unchecked" })
public void testFigure5() {
// Get Plume runtime
LazyPlume plume = new LazyPlume();
// Create simple data
PCollection input1 = plume.fromJava(Lists.newArrayList(Pair.create(1, 1)));
PCollection input2 = plume.fromJava(Lists.newArrayList(Pair.create(2, 2)));
PCollection input3 = plume.fromJava(Lists.newArrayList(Pair.create(3, 3)));
PCollection input4 = plume.fromJava(Lists.newArrayList(Pair.create(4, 4)));
PCollection partial1 = input1.map(identity, tableOf(integers(), integers()));
PCollection partial2 = plume.flatten(tableOf(integers(), integers()), input2.map(identity, tableOf(integers(), integers())), input3.map(identity, tableOf(integers(), integers())).map(identity, null).map(identity, null));
PCollection partial3 = input4.map(identity, tableOf(integers(), integers())).groupByKey().combine(dummyCombiner).map(identity, null);
PCollection output = plume.flatten(tableOf(integers(), integers()), partial1, partial2, partial3).groupByKey().map(identity, null);
Optimizer optimizer = new Optimizer();
ExecutionStep step = optimizer.optimize(Lists.newArrayList(input1, input2, input3, input4), Lists.newArrayList(output, partial1));
assertEquals(step.mscrSteps.size(), 1);
assertNotNull(step.nextStep);
assertEquals(step.nextStep.mscrSteps.size(), 1);
assertNull(step.nextStep.nextStep);
}