Search in sources :

Example 1 with ParallelDo

use of com.tdunning.plume.local.lazy.op.ParallelDo in project Plume by tdunning.

the class Optimizer method removeUnnecessaryOps.

/**
 * Prunes deferred operations that cannot reach any of the given outputs.
 * It is applied top-down, starting from an input collection, and is meant to
 * clean up what the Optimizer itself does not remove.
 *
 * @param arg     collection whose downstream operations are inspected
 * @param outputs the collections that must be preserved
 * @return true if {@code arg} does not lead to any output, meaning the caller
 *         may discard this node as well
 */
boolean removeUnnecessaryOps(PCollection arg, List<PCollection> outputs) {
    LazyCollection<?> node = (LazyCollection) arg;
    if (node.getDownOps() == null || node.getDownOps().size() == 0) {
        // Leaf: prunable unless it is one of the requested outputs.
        return !outputs.contains(node);
    }
    // Keep only the operations that eventually reach an output.
    List<DeferredOp> keptOps = new ArrayList<DeferredOp>();
    for (DeferredOp op : node.getDownOps()) {
        if (!isPrunable(op, outputs)) {
            keptOps.add(op);
        }
    }
    node.downOps = keptOps;
    // With no surviving operations, this node itself is removable.
    return keptOps.isEmpty();
}

/**
 * Returns true if the given operation leads to none of the outputs,
 * recursively pruning its destination subtree as a side effect.
 */
private boolean isPrunable(DeferredOp op, List<PCollection> outputs) {
    if (op instanceof OneToOneOp) {
        return removeUnnecessaryOps(((OneToOneOp<?, ?>) op).getDest(), outputs);
    }
    if (op instanceof ParallelDo) {
        return removeUnnecessaryOps(((ParallelDo<?, ?>) op).getDest(), outputs);
    }
    if (op instanceof Flatten) {
        return removeUnnecessaryOps(((Flatten<?>) op).getDest(), outputs);
    }
    if (op instanceof MultipleParallelDo) {
        MultipleParallelDo<?> mPDo = (MultipleParallelDo<?>) op;
        // AND the result over every destination. The non-short-circuiting '&'
        // is deliberate: each child must be visited (and pruned) even after
        // one of them turns out to be non-removable.
        boolean prunable = true;
        for (Object entry : mPDo.getDests().entrySet()) {
            PCollection<?> dest = (PCollection<?>) ((Map.Entry) entry).getKey();
            prunable = prunable & removeUnnecessaryOps(dest, outputs);
        }
        return prunable;
    }
    // Unknown operation kinds are conservatively kept.
    return false;
}
Also used : ParallelDo(com.tdunning.plume.local.lazy.op.ParallelDo) MultipleParallelDo(com.tdunning.plume.local.lazy.op.MultipleParallelDo) MultipleParallelDo(com.tdunning.plume.local.lazy.op.MultipleParallelDo) ArrayList(java.util.ArrayList) Flatten(com.tdunning.plume.local.lazy.op.Flatten) OneToOneOp(com.tdunning.plume.local.lazy.op.OneToOneOp) DeferredOp(com.tdunning.plume.local.lazy.op.DeferredOp) PCollection(com.tdunning.plume.PCollection)

Example 2 with ParallelDo

use of com.tdunning.plume.local.lazy.op.ParallelDo in project Plume by tdunning.

the class Optimizer method sinkFlattens.

/**
   * Sink flattens pushing them down to create opportunities for ParallelDo fusion:
   * Flatten(A, B) followed by ParallelDo(f) becomes
   * Flatten(ParallelDo(f)(A), ParallelDo(f)(B)), recursing towards the inputs first.
   *
   * @param arg  The collection that may contain flatten operations that we need to sink.
   */
@SuppressWarnings({ "unchecked", "rawtypes" })
<T> void sinkFlattens(PCollection<T> arg) {
    LazyCollection<T> output = (LazyCollection<T>) arg;
    if (output.isMaterialized()) {
        // stop condition for recursive algorithm
        return;
    }
    DeferredOp dOp = output.getDeferredOp();
    if (!(dOp instanceof Flatten)) {
        if (dOp instanceof OneToOneOp) {
            // Recursively apply this function to parent
            sinkFlattens(((OneToOneOp) dOp).getOrigin());
        } else if (dOp instanceof ParallelDo) {
            // Recursively apply this function to parent
            sinkFlattens(((ParallelDo) dOp).getOrigin());
        }
        // Any other op (e.g. GroupByKey, CombineValues) has nothing to sink here.
        // Returning for every non-Flatten op also guards the (Flatten) casts
        // below, which previously could throw ClassCastException when control
        // fell through with a non-Flatten deferred op.
        return;
    }
    if (output.getDownOps() == null || output.getDownOps().size() != 1) {
        // Not exactly one op below this flatten: nothing to fuse at this level,
        // so recursively apply this function to each of the flatten's parents.
        for (Object col : ((Flatten) dOp).getOrigins()) {
            sinkFlattens((PCollection) col);
        }
        return;
    }
    DeferredOp downOp = output.getDownOps().get(0);
    if (!(downOp instanceof ParallelDo)) {
        // Only a single ParallelDo directly below the flatten can be swapped with it.
        return;
    }
    // PDo below current node
    ParallelDo<T, ?> op = (ParallelDo<T, ?>) downOp;
    // Flatten above current node
    Flatten<T> flatten = (Flatten<T>) dOp;
    List<PCollection<?>> newOrigins = new ArrayList<PCollection<?>>();
    // Iterate over all Flatten inputs
    for (PCollection<T> col : flatten.getOrigins()) {
        // Recursively apply this function to this flatten's origin
        LazyCollection<T> fInput = (LazyCollection<T>) col;
        sinkFlattens(fInput);
        // Sink: insert a copy of the ParallelDo between this origin and the new flatten
        LazyCollection<?> newInput = new LazyCollection();
        newInput.deferredOp = new ParallelDo(op.getFunction(), fInput, newInput);
        newInput.type = ((LazyCollection) flatten.getDest()).getType();
        // NOTE(review): assumes the old flatten is this origin's first down-op —
        // confirm that flatten origins always reference it at index 0.
        fInput.downOps.remove(0);
        fInput.addDownOp(newInput.deferredOp);
        newOrigins.add(newInput);
    }
    // Re-wire: the new flatten now produces what the ParallelDo used to produce.
    Flatten<?> newFlatten = new Flatten(newOrigins, op.getDest());
    ((LazyCollection<?>) op.getDest()).deferredOp = newFlatten;
    for (PCollection<?> newOp : newOrigins) {
        ((LazyCollection<?>) newOp).addDownOp(newFlatten);
    }
}
Also used : ParallelDo(com.tdunning.plume.local.lazy.op.ParallelDo) MultipleParallelDo(com.tdunning.plume.local.lazy.op.MultipleParallelDo) Flatten(com.tdunning.plume.local.lazy.op.Flatten) ArrayList(java.util.ArrayList) OneToOneOp(com.tdunning.plume.local.lazy.op.OneToOneOp) DeferredOp(com.tdunning.plume.local.lazy.op.DeferredOp) PCollection(com.tdunning.plume.PCollection)

Example 3 with ParallelDo

use of com.tdunning.plume.local.lazy.op.ParallelDo in project Plume by tdunning.

the class OptimizerTools method getMSCRBlocks.

/**
   * This utility returns all the different MSCR blocks that can be created from this plan.
   *
   * It walks the plan upwards (towards the inputs) from every GroupByKey, collecting the
   * materialized inputs, by-pass channels, flatten-only output channels and GroupByKey
   * output channels that form one MSCR. A GroupByKey whose inputs overlap an already-built
   * MSCR is merged into that MSCR instead of creating a new one.
   * 
   * (pere) As of Oct/2010, I think this code can be simplified to be more like addRemainingTrivialMSCRs(), so a possible TODO would be
   *  to refactor it and make it more understandable. An opened question is whether there is an easy way of coding finding all possible
   *  MSCRs (including trivial, not related to GroupByKey operations ones) in a single and elegant loop.
   *
   * @param outputs the final outputs of the execution plan
   * @return all MSCR blocks of the plan, including the trivial (non-GroupByKey) ones
   *         appended by addRemainingTrivialMSCRs()
   */
@SuppressWarnings({ "rawtypes", "unchecked" })
static Set<MSCR> getMSCRBlocks(List<PCollection> outputs) {
    // Get all GroupByKeys from the tree
    List<DeferredOp> groupBys = OptimizerTools.getAll(outputs, GroupByKey.class);
    int mscrId = 1;
    Set<MSCR> mscrs = new HashSet<MSCR>();
    // For all found GroupByKey blocks
    for (DeferredOp gBK : groupBys) {
        GroupByKey groupBy = (GroupByKey<?, ?>) gBK;
        // Gather all information needed for MSCR from this GBK
        Set<PCollection<?>> inputs = new HashSet<PCollection<?>>();
        Set<GroupByKey<?, ?>> outputChannels = new HashSet<GroupByKey<?, ?>>();
        Set<Flatten<?>> unGroupedOutputChannels = new HashSet<Flatten<?>>();
        Set<PCollection<?>> bypassChannels = new HashSet<PCollection<?>>();
        // Depth-first traversal upwards from this GBK's origin; 'visited' guards
        // against re-pushing nodes reachable through more than one path.
        Stack<LazyCollection<?>> toVisit = new Stack<LazyCollection<?>>();
        Set<LazyCollection<?>> visited = new HashSet<LazyCollection<?>>();
        LazyCollection<?> origin = (LazyCollection<?>) groupBy.getOrigin();
        toVisit.push(origin);
        outputChannels.add(groupBy);
        while (!toVisit.isEmpty()) {
            LazyCollection<?> current = toVisit.pop();
            visited.add(current);
            if (current.isMaterialized()) {
                // condition for being a materialized input. This may change.
                inputs.add(current);
                continue;
            }
            DeferredOp op = current.getDeferredOp();
            if (op instanceof MultipleParallelDo) {
                // second condition for being an input
                MultipleParallelDo<?> mPDo = (MultipleParallelDo) current.getDeferredOp();
                if (((LazyCollection<?>) mPDo.getOrigin()).isMaterialized()) {
                    // will be done in Mapper
                    inputs.add(mPDo.getOrigin());
                } else if (op instanceof ParallelDo) {
                    // will be done in Reducer
                    // NOTE(review): 'op' is already known to be a MultipleParallelDo here, so
                    // this branch fires only if MultipleParallelDo is a subtype of ParallelDo —
                    // and then it fires for EVERY non-materialized origin. Confirm intent.
                    inputs.add(current);
                } else {
                    // will be done in Mapper
                    inputs.add(mPDo.getOrigin());
                }
                // Check for bypass channels & output channels with no group-by
                for (Map.Entry entry : mPDo.getDests().entrySet()) {
                    LazyCollection coll = (LazyCollection) entry.getKey();
                    if (coll.getDownOps() == null || coll.getDownOps().size() == 0) {
                        // leaf node
                        bypassChannels.add(coll);
                    } else if (coll.getDownOps().get(0) instanceof MultipleParallelDo) {
                        bypassChannels.add(coll);
                    /*
             * Case of an output channel that Flattens with no Group By
             */
                    } else if (coll.getDownOps().get(0) instanceof Flatten) {
                        Flatten<?> thisFlatten = (Flatten<?>) coll.getDownOps().get(0);
                        LazyCollection ldest = (LazyCollection) thisFlatten.getDest();
                        if (ldest.getDownOps() == null || ldest.getDownOps().size() == 0 || ldest.getDownOps().get(0) instanceof MultipleParallelDo) {
                            unGroupedOutputChannels.add(thisFlatten);
                            // Add the rest of this flatten's origins to the stack in order to possibly discover more output channels
                            for (PCollection<?> col : thisFlatten.getOrigins()) {
                                if (!visited.contains(col)) {
                                    toVisit.push((LazyCollection<?>) col);
                                }
                            }
                        }
                    }
                }
                continue;
            }
            if (op instanceof GroupByKey) {
                // third condition for being an input - rare case when one GBK follows another
                inputs.add(current);
                continue;
            }
            if (op instanceof Flatten) {
                // Keep walking upwards through every origin of the flatten.
                Flatten<?> flatten = (Flatten<?>) op;
                for (PCollection<?> input : flatten.getOrigins()) {
                    LazyCollection<?> in = (LazyCollection<?>) input;
                    if (!visited.contains(in)) {
                        toVisit.push(in);
                    }
                }
                continue;
            }
            if (op instanceof OneToOneOp) {
                // Keep walking upwards through the single origin.
                LazyCollection<?> input = (LazyCollection<?>) ((OneToOneOp<?, ?>) op).getOrigin();
                if (!visited.contains(input)) {
                    toVisit.push(input);
                }
                continue;
            }
        }
        MSCR mscrToAdd = null;
        // Check if there is already one MSCR with at least one of this inputs
        // NOTE(review): the inner 'break' exits only the input loop; the outer loop keeps
        // iterating, so if several MSCRs share an input the LAST one iterated wins and no
        // merge between them is performed — confirm this is intended.
        for (MSCR mscr : mscrs) {
            for (PCollection<?> input : inputs) {
                if (mscr.hasInput(input)) {
                    mscrToAdd = mscr;
                    break;
                }
            }
        }
        if (mscrToAdd == null) {
            // otherwise create new MSCR
            mscrToAdd = new MSCR(mscrId);
            mscrId++;
        }
        // Add all missing input channels to current MSCR
        for (PCollection<?> input : inputs) {
            if (!mscrToAdd.hasInput(input)) {
                mscrToAdd.addInput(input);
            }
        }
        // Add all missing bypass outputs to current MSCR
        for (PCollection<?> col : bypassChannels) {
            if (!mscrToAdd.hasOutputChannel(col)) {
                // Create new by-pass channel
                MSCR.OutputChannel oC = new MSCR.OutputChannel(col);
                mscrToAdd.addOutputChannel(oC);
            }
        }
        // Add all missing flatten-with-no-groupby outputs to current MSCR
        for (Flatten flatten : unGroupedOutputChannels) {
            if (!mscrToAdd.hasOutputChannel(flatten.getDest())) {
                // Create new channel with flatten and nothing else
                MSCR.OutputChannel oC = new MSCR.OutputChannel(flatten.getDest());
                oC.output = flatten.getDest();
                oC.flatten = flatten;
                mscrToAdd.addOutputChannel(oC);
            }
        }
        // Add all missing output channels to current MSCR
        for (GroupByKey groupByKey : outputChannels) {
            if (!mscrToAdd.hasOutputChannel(groupByKey.getOrigin())) {
                // Create new channel with group by key. It might have combiner and reducer as well.
                MSCR.OutputChannel oC = new MSCR.OutputChannel(groupByKey);
                oC.output = groupByKey.getDest();
                if (groupByKey.getOrigin().getDeferredOp() instanceof Flatten) {
                    oC.flatten = (Flatten) groupByKey.getOrigin().getDeferredOp();
                }
                // If exactly one op follows the GBK, it may be a combiner and/or a reducer;
                // the channel's output is pushed forward past each one found.
                if (groupByKey.getDest().getDownOps() != null && groupByKey.getDest().getDownOps().size() == 1) {
                    DeferredOp op = (DeferredOp) groupByKey.getDest().getDownOps().get(0);
                    if (op instanceof CombineValues) {
                        oC.combiner = (CombineValues) op;
                        oC.output = oC.combiner.getDest();
                        LazyCollection dest = (LazyCollection) oC.combiner.getDest();
                        if (dest.getDownOps() != null && dest.getDownOps().size() == 1) {
                            // Look one step further for a reducer after the combiner.
                            op = (DeferredOp) dest.getDownOps().get(0);
                        }
                    }
                    if (op instanceof ParallelDo) {
                        oC.reducer = (ParallelDo) op;
                        oC.output = oC.reducer.getDest();
                    }
                }
                mscrToAdd.addOutputChannel(oC);
            }
        }
        // Add if needed
        mscrs.add(mscrToAdd);
    }
    return addRemainingTrivialMSCRs(outputs, mscrId, mscrs);
}
Also used : MultipleParallelDo(com.tdunning.plume.local.lazy.op.MultipleParallelDo) ParallelDo(com.tdunning.plume.local.lazy.op.ParallelDo) MultipleParallelDo(com.tdunning.plume.local.lazy.op.MultipleParallelDo) GroupByKey(com.tdunning.plume.local.lazy.op.GroupByKey) DeferredOp(com.tdunning.plume.local.lazy.op.DeferredOp) HashSet(java.util.HashSet) CombineValues(com.tdunning.plume.local.lazy.op.CombineValues) Flatten(com.tdunning.plume.local.lazy.op.Flatten) OneToOneOp(com.tdunning.plume.local.lazy.op.OneToOneOp) Stack(java.util.Stack) PCollection(com.tdunning.plume.PCollection) Map(java.util.Map)

Example 4 with ParallelDo

use of com.tdunning.plume.local.lazy.op.ParallelDo in project Plume by tdunning.

the class OptimizerTools method addRemainingTrivialMSCRs.

/**
   * This utility returns all the MSCRs that are not related to a GroupByKey - 
   *  the remaining trivial cases as described in FlumeJava paper
   *  
   *  These cases will be either:
   *  - Flattens that are followed by either a)MultipleParallelDo or b)nothing
   *  
   *    (These ones can have correlated inputs and be parallelized just like the ones with GroupByKey)
   *    
   *  - The trivial Input->ParallelDo|MultipleParallelDo->Output case
   *  
   * @param outputs       the final outputs of the execution plan
   * @param currentMscrId the next free MSCR id
   * @param currentMSCRs  the MSCRs already built from GroupByKey blocks
   * @return {@code currentMSCRs} with the trivial MSCRs added to it
   */
@SuppressWarnings({ "unchecked", "rawtypes" })
static Set<MSCR> addRemainingTrivialMSCRs(List<PCollection> outputs, int currentMscrId, Set<MSCR> currentMSCRs) {
    // Get all Flatten from the tree
    List<DeferredOp> allFlattens = OptimizerTools.getAll(outputs, Flatten.class);
    List<MSCR> trivialMscrs = new LinkedList<MSCR>();
    for (DeferredOp dOp : allFlattens) {
        Flatten<?> flatten = (Flatten<?>) dOp;
        // Process only remaining flattens that are not in any other MSCR
        if (belongsToExistingMscr(flatten, currentMSCRs)) {
            continue;
        }
        // Create new trivial MSCR
        MSCR candidate = new MSCR(currentMscrId);
        currentMscrId++;
        // add single output channel
        MSCR.OutputChannel oC = new MSCR.OutputChannel(flatten.getDest());
        oC.output = flatten.getDest();
        oC.flatten = flatten;
        candidate.addOutputChannel(oC);
        // add inputs: resolve each of the flatten's origins
        for (PCollection coll : flatten.getOrigins()) {
            LazyCollection lCol = (LazyCollection) coll;
            if (lCol.isMaterialized()) {
                candidate.addInput(coll);
            } else if (lCol.deferredOp instanceof ParallelDo) {
                ParallelDo pDo = (ParallelDo) lCol.deferredOp;
                if (((LazyCollection) pDo.getOrigin()).isMaterialized()) {
                    candidate.addInput(pDo.getOrigin());
                } else if (pDo instanceof MultipleParallelDo) {
                    candidate.addInput(pDo.getOrigin());
                } else {
                    candidate.addInput(coll);
                }
            } else {
                candidate.addInput(coll);
            }
        }
        // Fuse into the first previously-built trivial MSCR that shares an input,
        // otherwise keep the candidate as a new trivial MSCR.
        MSCR fuseTarget = null;
        for (MSCR trivialMscr : trivialMscrs) {
            if (shareAnyInput(trivialMscr, candidate)) {
                fuseTarget = trivialMscr;
                break;
            }
        }
        if (fuseTarget == null) {
            trivialMscrs.add(candidate);
        } else {
            // add current output channel
            fuseTarget.addOutputChannel(oC);
            for (PCollection input : candidate.getInputs()) {
                if (!fuseTarget.getInputs().contains(input)) {
                    // add each input that is not already contained
                    fuseTarget.addInput(input);
                }
            }
        }
    }
    currentMSCRs.addAll(trivialMscrs);
    return currentMSCRs;
}

/** Returns true if any of the given MSCRs already uses this flatten in an output channel. */
private static boolean belongsToExistingMscr(Flatten<?> flatten, Set<MSCR> mscrs) {
    for (MSCR mscr : mscrs) {
        for (Map.Entry<PCollection<?>, MSCR.OutputChannel<?, ?, ?>> entry : mscr.getOutputChannels().entrySet()) {
            MSCR.OutputChannel<?, ?, ?> channel = entry.getValue();
            if (channel.flatten != null && channel.flatten == flatten) {
                return true;
            }
        }
    }
    return false;
}

/** Returns true if the two MSCRs have at least one input in common. */
@SuppressWarnings("rawtypes")
private static boolean shareAnyInput(MSCR a, MSCR b) {
    for (PCollection input : a.getInputs()) {
        if (b.getInputs().contains(input)) {
            return true;
        }
    }
    return false;
}
Also used : ParallelDo(com.tdunning.plume.local.lazy.op.ParallelDo) MultipleParallelDo(com.tdunning.plume.local.lazy.op.MultipleParallelDo) MultipleParallelDo(com.tdunning.plume.local.lazy.op.MultipleParallelDo) Flatten(com.tdunning.plume.local.lazy.op.Flatten) DeferredOp(com.tdunning.plume.local.lazy.op.DeferredOp) LinkedList(java.util.LinkedList) PCollection(com.tdunning.plume.PCollection) Iterator(java.util.Iterator)

Example 5 with ParallelDo

use of com.tdunning.plume.local.lazy.op.ParallelDo in project Plume by tdunning.

the class BasicOptimizerTest method testSinkFlattens.

/**
   * This test has two inputs, one flatten and then one ParallelDo.
   * After sinking flattens, the tree should be as: two inputs, one ParallelDo after each input and one final Flatten.
   * The produced data must be identical before and after the optimization.
   */
@Test
public void testSinkFlattens() {
    // Get Plume runtime
    LazyPlume plume = new LazyPlume();
    // Create simple data 
    PCollection<Integer> input1 = plume.fromJava(Lists.newArrayList(1, 2, 3));
    PCollection<Integer> input2 = plume.fromJava(Lists.newArrayList(4, 5, 6));
    PCollection<Integer> output = plume.flatten(input1, input2).map(plusOne, null);
    LazyCollection<Integer> lOutput = (LazyCollection<Integer>) output;
    assertTrue(lOutput.getDeferredOp() instanceof ParallelDo);
    // Execute and assert the result before optimizing
    executeAndAssert((LazyCollection<Integer>) output, new Integer[] { 2, 3, 4, 5, 6, 7 });
    // Get an Optimizer
    Optimizer optimizer = new Optimizer();
    optimizer.sinkFlattens(output);
    // Execute and assert the result after optimizing (behavior must be unchanged)
    executeAndAssert((LazyCollection<Integer>) output, new Integer[] { 2, 3, 4, 5, 6, 7 });
    // Check that optimizer did what it's supposed to do:
    // the flatten is now the final op, fed by one ParallelDo per input.
    assertTrue(lOutput.getDeferredOp() instanceof Flatten);
    Flatten flatten = (Flatten) lOutput.getDeferredOp();
    // JUnit convention: the expected value is the first argument, so failure
    // messages read "expected:<2> but was:<...>" (the original had them swapped).
    assertEquals(2, flatten.getOrigins().size());
    for (int i = 0; i < 2; i++) {
        LazyCollection<Integer> origin = (LazyCollection<Integer>) flatten.getOrigins().get(i);
        ParallelDo newPDo = (ParallelDo) origin.getDeferredOp();
        assertEquals(plusOne, newPDo.getFunction());
        assertTrue(newPDo.getOrigin() == input1 || newPDo.getOrigin() == input2);
    }
}
Also used : ParallelDo(com.tdunning.plume.local.lazy.op.ParallelDo) MultipleParallelDo(com.tdunning.plume.local.lazy.op.MultipleParallelDo) Flatten(com.tdunning.plume.local.lazy.op.Flatten) Test(org.junit.Test)

Aggregations

ParallelDo (com.tdunning.plume.local.lazy.op.ParallelDo)9 Flatten (com.tdunning.plume.local.lazy.op.Flatten)8 MultipleParallelDo (com.tdunning.plume.local.lazy.op.MultipleParallelDo)8 DeferredOp (com.tdunning.plume.local.lazy.op.DeferredOp)7 PCollection (com.tdunning.plume.PCollection)5 OneToOneOp (com.tdunning.plume.local.lazy.op.OneToOneOp)5 ArrayList (java.util.ArrayList)4 DoFn (com.tdunning.plume.DoFn)3 EmitFn (com.tdunning.plume.EmitFn)3 GroupByKey (com.tdunning.plume.local.lazy.op.GroupByKey)3 Pair (com.tdunning.plume.Pair)2 CombineValues (com.tdunning.plume.local.lazy.op.CombineValues)2 Map (java.util.Map)2 PTable (com.tdunning.plume.PTable)1 OutputChannel (com.tdunning.plume.local.lazy.MSCR.OutputChannel)1 PlumeObject (com.tdunning.plume.local.lazy.MapRedExecutor.PlumeObject)1 IOException (java.io.IOException)1 HashSet (java.util.HashSet)1 Iterator (java.util.Iterator)1 LinkedList (java.util.LinkedList)1