Use of com.tdunning.plume.PCollection in project Plume by tdunning.
The class OptimizerTools, method addRemainingTrivialMSCRs.
/**
 * This utility returns all the MSCRs that are not related to a GroupByKey -
 * the remaining trivial cases as described in the FlumeJava paper.
 *
 * These cases will be either:
 * - Flattens that are followed by either a) a MultipleParallelDo or b) nothing
 *   (these can have correlated inputs and be parallelized just like the ones with a GroupByKey)
 * - The trivial Input -> ParallelDo|MultipleParallelDo -> Output case
 *
 * @param outputs the output PCollections of the execution plan
 * @param currentMscrId the next id to assign to a newly created MSCR
 * @param currentMSCRs the MSCRs already derived from GroupByKey operations
 * @return the given set of MSCRs, extended with the remaining trivial ones
 */
@SuppressWarnings({ "unchecked", "rawtypes" })
static Set<MSCR> addRemainingTrivialMSCRs(List<PCollection> outputs, int currentMscrId, Set<MSCR> currentMSCRs) {
// Get all Flatten from the tree
List<DeferredOp> flattens = OptimizerTools.getAll(outputs, Flatten.class);
List<MSCR> trivialMSCRS = new LinkedList<MSCR>();
Iterator<DeferredOp> it = flattens.iterator();
mainLoop: while (it.hasNext()) {
Flatten<?> flatten = (Flatten<?>) it.next();
// Process only remaining flattens that are not in any other MSCR
for (MSCR mscr : currentMSCRs) {
for (Map.Entry<PCollection<?>, MSCR.OutputChannel<?, ?, ?>> entry : mscr.getOutputChannels().entrySet()) {
if (entry.getValue().flatten != null && entry.getValue().flatten == flatten) {
// skip this flatten
continue mainLoop;
}
}
}
// Create new trivial MSCR
MSCR mscr = new MSCR(currentMscrId);
currentMscrId++;
// add single output channel
MSCR.OutputChannel oC = new MSCR.OutputChannel(flatten.getDest());
oC.output = flatten.getDest();
oC.flatten = flatten;
mscr.addOutputChannel(oC);
// add inputs
for (PCollection coll : flatten.getOrigins()) {
LazyCollection lCol = (LazyCollection) coll;
if (lCol.isMaterialized()) {
mscr.addInput(coll);
} else if (lCol.deferredOp instanceof ParallelDo) {
ParallelDo pDo = (ParallelDo) lCol.deferredOp;
if (((LazyCollection) pDo.getOrigin()).isMaterialized()) {
mscr.addInput(pDo.getOrigin());
} else if (pDo instanceof MultipleParallelDo) {
mscr.addInput(pDo.getOrigin());
} else {
mscr.addInput(coll);
}
} else {
mscr.addInput(coll);
}
}
Iterator<MSCR> tIt = trivialMSCRS.iterator();
// Now we'll see if this trivial MSCR can be fused to another previous trivial MSCR
boolean canBeFused = false;
while (tIt.hasNext() && !canBeFused) {
MSCR trivialMSCR = tIt.next();
for (PCollection input : trivialMSCR.getInputs()) {
if (mscr.getInputs().contains(input)) {
canBeFused = true;
break;
}
}
if (canBeFused) {
// add current output channel
trivialMSCR.addOutputChannel(oC);
for (PCollection input : mscr.getInputs()) {
if (!trivialMSCR.getInputs().contains(input)) {
// add each input that is not already contained
trivialMSCR.addInput(input);
}
}
}
}
// We have a new trivial MSCR only if it could not be fused with previous ones
if (!canBeFused) {
trivialMSCRS.add(mscr);
}
}
currentMSCRs.addAll(trivialMSCRS);
return currentMSCRs;
}
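A minimal sketch of how this utility could be driven, built only from API that appears elsewhere on this page (tableOf/integers are the same static imports used in the tests below; calling the package-private method directly is illustrative, not how user code would normally reach it):

// Hedged usage sketch, assuming fromJava collections count as materialized inputs.
LazyPlume plume = new LazyPlume();
PCollection in1 = plume.fromJava(Lists.newArrayList(Pair.create(1, 1)));
PCollection in2 = plume.fromJava(Lists.newArrayList(Pair.create(2, 2)));
// A Flatten followed by nothing: one of the "trivial" shapes this utility picks up.
PCollection flat = plume.flatten(tableOf(integers(), integers()), in1, in2);
Set<MSCR> mscrs = OptimizerTools.addRemainingTrivialMSCRs(
    Lists.<PCollection>newArrayList(flat), 1, new HashSet<MSCR>());
// Expected: one MSCR with in1 and in2 as inputs and a single output channel for the Flatten.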
Use of com.tdunning.plume.PCollection in project Plume by tdunning.
The class BasicOptimizerTest, method testParallelDoSiblingFusion.
@SuppressWarnings("unchecked")
@Test
public void testParallelDoSiblingFusion() {
// Get Plume runtime
LazyPlume plume = new LazyPlume();
// Create simple data
PCollection<Integer> input = plume.fromJava(Lists.newArrayList(1, 2, 3));
PCollection<Integer> output1 = input.map(plusOne, null);
PCollection<Integer> output2 = input.map(timesTwo, null);
LazyCollection<Integer> lInput = (LazyCollection<Integer>) input;
LazyCollection<Integer> lOutput1 = (LazyCollection<Integer>) output1;
LazyCollection<Integer> lOutput2 = (LazyCollection<Integer>) output2;
assertEquals(lInput.downOps.size(), 2);
// Execute and assert the result before optimizing
executeAndAssert(lOutput1, new Integer[] { 2, 3, 4 });
executeAndAssert(lOutput2, new Integer[] { 2, 4, 6 });
// Get an Optimizer
Optimizer optimizer = new Optimizer();
// one output is enough to fuse both because they share the parent
optimizer.fuseSiblingParallelDos(output1);
// Check that input child ops has shrinked to 1
assertEquals(lInput.downOps.size(), 1);
DeferredOp op = lInput.downOps.get(0);
// Check that there is only one op pointing to both outputs
assertEquals(op, lOutput1.deferredOp);
assertEquals(op, lOutput2.deferredOp);
assertTrue(op instanceof MultipleParallelDo);
MultipleParallelDo<Integer> mPDo = (MultipleParallelDo<Integer>) op;
Map<PCollection<?>, DoFn<Integer, ?>> mapOfPDos = mPDo.getDests();
// Check that the map of functions in MultipleParallelDo is correct
assertEquals(mapOfPDos.get(output1), plusOne);
assertEquals(mapOfPDos.get(output2), timesTwo);
// Execute and assert the result afer optimizing
executeAndAssert(lOutput1, new Integer[] { 2, 3, 4 });
executeAndAssert(lOutput2, new Integer[] { 2, 4, 6 });
}
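The plusOne and timesTwo functions referenced above are DoFns defined elsewhere in BasicOptimizerTest; a minimal sketch of what such a function looks like, assuming the DoFn/EmitFn signatures that MSCRMapper relies on below:

// Hedged sketch, not the test's actual definition: add one to each element and emit it.
DoFn<Integer, Integer> plusOne = new DoFn<Integer, Integer>() {
  @Override
  public void process(Integer v, EmitFn<Integer> emitter) {
    emitter.emit(v + 1);
  }
};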
Use of com.tdunning.plume.PCollection in project Plume by tdunning.
The class MSCR, method toString.
@Override
public String toString() {
  String str = "MSCR Id " + id + " #inputs " + inputs.size() + " #outputs " + outputChannels.size() + "\n" + " -Inputs- ";
  for (PCollection input : inputs) {
    str += input + " with down ops " + ((LazyCollection) input).downOps + " | ";
  }
  str += "\n -Outputs- " + outputChannels;
  return str;
}
Use of com.tdunning.plume.PCollection in project Plume by tdunning.
The class MSCRMapper, method map.
@SuppressWarnings("unchecked")
protected void map(WritableComparable key, WritableComparable value, final Mapper<WritableComparable, WritableComparable, PlumeObject, PlumeObject>.Context<WritableComparable, WritableComparable, PlumeObject, PlumeObject> context) throws IOException, InterruptedException {
LazyCollection<?> l = null;
FileSplit fS = FileInputSplitWrapper.getFileInputSplit(context);
// Get LazyCollection for this input (according to FileSplit)
for (PCollection<?> input : mscr.getInputs()) {
LazyCollection<?> thisL = (LazyCollection<?>) input;
if (thisL.getFile() == null) {
// Convention for intermediate results
thisL.setFile(tmpFolder + "/" + thisL.getPlumeId());
}
if (fS.getPath().toString().startsWith(thisL.getFile()) || fS.getPath().toString().startsWith("file:" + thisL.getFile())) {
l = thisL;
break;
}
}
if (l == null) {
throw new RuntimeException("Unable to match input split with any MSCR input");
}
// If this collection is a table -> process Pair, otherwise process value
PCollectionType type = l.getType();
Object toProcess = value;
if (type instanceof PTableType) {
toProcess = Pair.create(key, value);
}
for (DeferredOp op : l.getDownOps()) {
if (op instanceof MultipleParallelDo) {
MultipleParallelDo mPDo = ((MultipleParallelDo) op);
for (Object entry : mPDo.getDests().entrySet()) {
Map.Entry<PCollection, DoFn> en = (Map.Entry<PCollection, DoFn>) entry;
LazyCollection<?> lCol = (LazyCollection<?>) en.getKey();
DeferredOp childOp = null;
if (lCol.getDownOps() != null && lCol.getDownOps().size() > 0) {
childOp = lCol.getDownOps().get(0);
}
final Integer channel;
if (childOp != null && childOp instanceof Flatten) {
channel = mscr.getNumberedChannels().get(((Flatten) childOp).getDest());
} else if (childOp != null && childOp instanceof GroupByKey) {
channel = mscr.getNumberedChannels().get(((GroupByKey) childOp).getOrigin());
} else {
// bypass channel?
channel = mscr.getNumberedChannels().get(en.getKey());
}
if (channel == null) {
// This is not for this MSCR - just skip it
return;
}
// Call parallelDo function
en.getValue().process(toProcess, new EmitFn() {
@Override
public void emit(Object v) {
try {
if (v instanceof Pair) {
Pair p = (Pair) v;
context.write(new PlumeObject((WritableComparable) p.getKey(), channel), new PlumeObject((WritableComparable) p.getValue(), channel));
} else {
context.write(new PlumeObject((WritableComparable) v, channel), new PlumeObject((WritableComparable) v, channel));
}
} catch (Exception e) {
// TODO How to report this
e.printStackTrace();
}
}
});
}
} else {
if (op instanceof Flatten) {
l = (LazyCollection) ((Flatten) op).getDest();
}
int channel = mscr.getNumberedChannels().get(l);
if (toProcess instanceof Pair) {
context.write(new PlumeObject(key, channel), new PlumeObject(value, channel));
} else {
context.write(new PlumeObject(value, channel), new PlumeObject(value, channel));
}
}
}
}
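When the input collection is a PTable, the installed DoFns receive and emit Pairs, which the v instanceof Pair branch above splits into key and value before tagging both with the output channel. A sketch of such a function, under the same assumed DoFn/EmitFn shape (identifiers are illustrative):

// Hedged sketch: an identity DoFn over (key, value) pairs, the kind of function
// that would feed a GroupByKey or Flatten channel in the mapper above.
DoFn<Pair, Pair> identityPairs = new DoFn<Pair, Pair>() {
  @Override
  public void process(Pair v, EmitFn<Pair> emitter) {
    emitter.emit(Pair.create(v.getKey(), v.getValue()));
  }
};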
Use of com.tdunning.plume.PCollection in project Plume by tdunning.
The class TestOptimizer, method testFigure5.
@SuppressWarnings({ "rawtypes", "unchecked" })
public void testFigure5() {
// Get Plume runtime
LazyPlume plume = new LazyPlume();
// Create simple data
PCollection input1 = plume.fromJava(Lists.newArrayList(Pair.create(1, 1)));
PCollection input2 = plume.fromJava(Lists.newArrayList(Pair.create(2, 2)));
PCollection input3 = plume.fromJava(Lists.newArrayList(Pair.create(3, 3)));
PCollection input4 = plume.fromJava(Lists.newArrayList(Pair.create(4, 4)));
PCollection partial1 = input1.map(identity, tableOf(integers(), integers()));
PCollection partial2 = plume.flatten(tableOf(integers(), integers()), input2.map(identity, tableOf(integers(), integers())), input3.map(identity, tableOf(integers(), integers())).map(identity, null).map(identity, null));
PCollection partial3 = input4.map(identity, tableOf(integers(), integers())).groupByKey().combine(dummyCombiner).map(identity, null);
PCollection output = plume.flatten(tableOf(integers(), integers()), partial1, partial2, partial3).groupByKey().map(identity, null);
Optimizer optimizer = new Optimizer();
ExecutionStep step = optimizer.optimize(Lists.newArrayList(input1, input2, input3, input4), Lists.newArrayList(output, partial1));
assertEquals(step.mscrSteps.size(), 1);
assertNotNull(step.nextStep);
assertEquals(step.nextStep.mscrSteps.size(), 1);
assertNull(step.nextStep.nextStep);
}