Search in sources :

Example 6 with Flatten

use of com.tdunning.plume.local.lazy.op.Flatten in project Plume by tdunning.

the class Optimizer method sinkFlattens.

/**
 * Sink flattens pushing them down to create opportunities for ParallelDo fusion.
 * <p>
 * Walks the graph bottom-up starting at {@code arg}. Whenever a Flatten's destination is
 * consumed by exactly one operation and that operation is a ParallelDo, the pattern
 * {@code Flatten(a, b) => ParallelDo(f)} is rewritten into
 * {@code Flatten(ParallelDo(f, a), ParallelDo(f, b))}: one new ParallelDo is created per
 * Flatten input and a new Flatten becomes the deferred op of the ParallelDo's destination.
 * In every other situation the method only keeps walking towards the parents.
 *
 * @param arg  The collection that may contain flatten operations that we need to sink.
 */
@SuppressWarnings({ "unchecked", "rawtypes" })
<T> void sinkFlattens(PCollection<T> arg) {
    LazyCollection<T> output = (LazyCollection<T>) arg;
    if (output.isMaterialized()) {
        // stop condition for recursive algorithm
        return;
    }
    DeferredOp dOp = output.getDeferredOp();
    if (!(dOp instanceof Flatten)) {
        if (dOp instanceof OneToOneOp) {
            // Recursively apply this function to parent
            sinkFlattens(((OneToOneOp) dOp).getOrigin());
        } else if (dOp instanceof ParallelDo) {
            // Recursively apply this function to parent
            sinkFlattens(((ParallelDo) dOp).getOrigin());
        }
        // Any other kind of op offers nothing to sink. Returning here (instead of falling
        // through) avoids a ClassCastException on the Flatten casts below.
        return;
    }
    // Flatten above current node
    Flatten<T> flatten = (Flatten<T>) dOp;
    // The flatten can only be sunk when its result feeds exactly one op and that op is a ParallelDo
    DeferredOp downOp = null;
    if (output.getDownOps() != null && output.getDownOps().size() == 1) {
        downOp = output.getDownOps().get(0);
    }
    if (!(downOp instanceof ParallelDo)) {
        // Nothing to sink at this node, but flattens further up may still qualify:
        // recursively apply this function to every parent
        for (PCollection<T> col : flatten.getOrigins()) {
            sinkFlattens(col);
        }
        return;
    }
    // PDo below current node
    ParallelDo<T, ?> op = (ParallelDo<T, ?>) downOp;
    List<PCollection<?>> newOrigins = new ArrayList<PCollection<?>>();
    // Iterate over all Flatten inputs
    for (PCollection<T> col : flatten.getOrigins()) {
        // Recursively apply this function to this flatten's origin
        LazyCollection<T> fInput = (LazyCollection<T>) col;
        sinkFlattens(fInput);
        // Sink: apply the ParallelDo's function directly to this flatten input
        LazyCollection<?> newInput = new LazyCollection();
        newInput.deferredOp = new ParallelDo(op.getFunction(), fInput, newInput);
        // NOTE(review): the type is copied from the Flatten's destination (the ParallelDo's
        // input) although newInput holds the ParallelDo's output - confirm this is intended.
        newInput.type = ((LazyCollection) flatten.getDest()).getType();
        // Unlink the old flatten itself; removing by index could drop an unrelated
        // consumer when this input feeds more than one op
        fInput.downOps.remove(flatten);
        fInput.addDownOp(newInput.deferredOp);
        newOrigins.add(newInput);
    }
    // The ParallelDo's old destination is now produced by a Flatten over the new ParallelDos
    Flatten<?> newFlatten = new Flatten(newOrigins, op.getDest());
    ((LazyCollection<?>) op.getDest()).deferredOp = newFlatten;
    for (PCollection<?> newOp : newOrigins) {
        ((LazyCollection<?>) newOp).addDownOp(newFlatten);
    }
}
Also used : ParallelDo(com.tdunning.plume.local.lazy.op.ParallelDo) MultipleParallelDo(com.tdunning.plume.local.lazy.op.MultipleParallelDo) Flatten(com.tdunning.plume.local.lazy.op.Flatten) ArrayList(java.util.ArrayList) OneToOneOp(com.tdunning.plume.local.lazy.op.OneToOneOp) DeferredOp(com.tdunning.plume.local.lazy.op.DeferredOp) PCollection(com.tdunning.plume.PCollection)

Example 7 with Flatten

use of com.tdunning.plume.local.lazy.op.Flatten in project Plume by tdunning.

the class Optimizer method removeUnnecessaryOps.

/**
 * Prunes operations that do not contribute to any requested output. Works top-down:
 * it receives an input collection, recurses into the destinations of its down ops and
 * keeps only the down ops for which the recursion reports a path to an output.
 *
 * @param arg      the collection whose down ops are inspected; its down-op list is replaced
 * @param outputs  the collections that have to stay reachable
 * @return true if the passed node doesn't lead to an output
 */
boolean removeUnnecessaryOps(PCollection arg, List<PCollection> outputs) {
    LazyCollection<?> node = (LazyCollection) arg;
    boolean isLeaf = node.getDownOps() == null || node.getDownOps().size() == 0;
    if (isLeaf) {
        // A leaf is only worth keeping when it is itself one of the outputs
        return !outputs.contains(node);
    }
    // Down ops that survive the pruning
    List<DeferredOp> kept = new ArrayList<DeferredOp>();
    for (DeferredOp child : node.getDownOps()) {
        boolean deadEnd;
        if (child instanceof OneToOneOp) {
            deadEnd = removeUnnecessaryOps(((OneToOneOp<?, ?>) child).getDest(), outputs);
        } else if (child instanceof ParallelDo) {
            deadEnd = removeUnnecessaryOps(((ParallelDo<?, ?>) child).getDest(), outputs);
        } else if (child instanceof Flatten) {
            deadEnd = removeUnnecessaryOps(((Flatten<?>) child).getDest(), outputs);
        } else if (child instanceof MultipleParallelDo) {
            MultipleParallelDo<?> multi = (MultipleParallelDo<?>) child;
            // Removable only if every destination is a dead end. Each destination is
            // always visited (no short-circuit) because the visit prunes it as a side effect.
            deadEnd = true;
            for (Object dest : multi.getDests().keySet()) {
                boolean destIsDeadEnd = removeUnnecessaryOps((PCollection<?>) dest, outputs);
                if (!destIsDeadEnd) {
                    deadEnd = false;
                }
            }
        } else {
            // Op kinds not listed above are always kept
            deadEnd = false;
        }
        if (!deadEnd) {
            kept.add(child);
        }
    }
    node.downOps = kept;
    // With no surviving down op this node can be removed by the caller as well
    return kept.isEmpty();
}
Also used : ParallelDo(com.tdunning.plume.local.lazy.op.ParallelDo) MultipleParallelDo(com.tdunning.plume.local.lazy.op.MultipleParallelDo) MultipleParallelDo(com.tdunning.plume.local.lazy.op.MultipleParallelDo) ArrayList(java.util.ArrayList) Flatten(com.tdunning.plume.local.lazy.op.Flatten) OneToOneOp(com.tdunning.plume.local.lazy.op.OneToOneOp) DeferredOp(com.tdunning.plume.local.lazy.op.DeferredOp) PCollection(com.tdunning.plume.PCollection)

Example 8 with Flatten

use of com.tdunning.plume.local.lazy.op.Flatten in project Plume by tdunning.

the class LocalExecutor method execute.

/**
 * Execute one-output flow: computes the contents of a single collection in memory by
 * recursively executing the collection(s) its deferred operation reads from.
 *
 * @param <T>     element type of the collection
 * @param output  the collection to compute
 * @return the stored data when the collection is materialized; otherwise a freshly built
 *         list, which stays empty when the deferred op is none of Flatten, ParallelDo,
 *         MultipleParallelDo or GroupByKey
 */
@SuppressWarnings({ "unchecked", "rawtypes" })
public <T> Iterable<T> execute(LazyCollection<T> output) {
    if (output.isMaterialized()) {
        // nothing else to execute
        return output.getData();
    }
    DeferredOp op = output.getDeferredOp();
    final List<T> result = Lists.newArrayList();

    // Flatten: concatenate the results of all origins, in order
    if (op instanceof Flatten) {
        for (PCollection<T> origin : ((Flatten<T>) op).getOrigins()) {
            for (T element : execute((LazyCollection<T>) origin)) {
                result.add(element);
            }
        }
        return result;
    }

    // Everything a DoFn emits is appended to the result list
    EmitFn<T> collector = new EmitFn<T>() {

        @Override
        public void emit(T v) {
            result.add(v);
        }
    };

    if (op instanceof ParallelDo) {
        ParallelDo pDo = (ParallelDo) op;
        Iterable source = execute((LazyCollection) pDo.getOrigin());
        for (Object element : source) {
            pDo.getFunction().process(element, collector);
        }
    } else if (op instanceof MultipleParallelDo) {
        // MultipleParallelDo -> parallel operations that read the same collection.
        // In this version of executor, we will only compute the current collection, not its neighbors
        MultipleParallelDo mPDo = (MultipleParallelDo) op;
        Iterable source = execute((LazyCollection) mPDo.getOrigin());
        // pick the function registered for this particular destination
        DoFn ownFunction = (DoFn) mPDo.getDests().get(output);
        for (Object element : source) {
            ownFunction.process(element, collector);
        }
    } else if (op instanceof GroupByKey) {
        GroupByKey gBK = (GroupByKey) op;
        Iterable source = execute(gBK.getOrigin());
        // Perform in-memory group by operation
        Map<Object, List> groups = Maps.newHashMap();
        for (Object element : source) {
            Pair keyValue = (Pair) element;
            Object key = keyValue.getKey();
            List values = groups.get(key);
            if (values == null) {
                values = new ArrayList();
                groups.put(key, values);
            }
            values.add(keyValue.getValue());
        }
        // One (key, values) pair per distinct key
        for (Map.Entry<Object, List> group : groups.entrySet()) {
            result.add((T) new Pair(group.getKey(), group.getValue()));
        }
    }
    return result;
}
Also used : ParallelDo(com.tdunning.plume.local.lazy.op.ParallelDo) MultipleParallelDo(com.tdunning.plume.local.lazy.op.MultipleParallelDo) MultipleParallelDo(com.tdunning.plume.local.lazy.op.MultipleParallelDo) GroupByKey(com.tdunning.plume.local.lazy.op.GroupByKey) Flatten(com.tdunning.plume.local.lazy.op.Flatten) ArrayList(java.util.ArrayList) DeferredOp(com.tdunning.plume.local.lazy.op.DeferredOp) DoFn(com.tdunning.plume.DoFn) EmitFn(com.tdunning.plume.EmitFn) ArrayList(java.util.ArrayList) List(java.util.List) Map(java.util.Map) Pair(com.tdunning.plume.Pair)

Example 9 with Flatten

use of com.tdunning.plume.local.lazy.op.Flatten in project Plume by tdunning.

the class OptimizerTools method getAll.

/**
 * Collects the deferred operations of exactly the given class that are found when walking
 * upstream (towards the origins) from {@code output}. The walk stops at materialized
 * collections. Subclasses of {@code getClass} are not matched, because the comparison is
 * done on the runtime class with {@code equals}.
 *
 * @param output    the collection where the upstream walk starts
 * @param getClass  the exact class of the operations to collect
 * @return the matching operations without duplicates, in the order they were first found
 */
static List<DeferredOp> getAll(PCollection<?> output, Class<? extends DeferredOp> getClass) {
    List<DeferredOp> retOps = new ArrayList<DeferredOp>();
    Stack<LazyCollection<?>> toVisit = new Stack<LazyCollection<?>>();
    Set<LazyCollection<?>> visited = new HashSet<LazyCollection<?>>();
    toVisit.push((LazyCollection<?>) output);
    while (!toVisit.isEmpty()) {
        LazyCollection<?> current = toVisit.pop();
        // A collection reachable through several paths may have been pushed more than once
        // before its first pop; process it only the first time
        if (!visited.add(current)) {
            continue;
        }
        if (current.isMaterialized()) {
            continue;
        }
        DeferredOp op = current.getDeferredOp();
        if (op.getClass().equals(getClass) && !retOps.contains(op)) {
            // Found
            retOps.add(op);
        }
        // Add more nodes to visit: the origin(s) of the current op
        if (op instanceof Flatten) {
            for (PCollection<?> input : ((Flatten<?>) op).getOrigins()) {
                LazyCollection<?> in = (LazyCollection<?>) input;
                if (!visited.contains(in)) {
                    toVisit.push(in);
                }
            }
        } else if (op instanceof OneToOneOp) {
            LazyCollection<?> input = (LazyCollection<?>) ((OneToOneOp<?, ?>) op).getOrigin();
            if (!visited.contains(input)) {
                toVisit.push(input);
            }
        } else if (op instanceof MultipleParallelDo) {
            LazyCollection<?> input = (LazyCollection<?>) ((MultipleParallelDo<?>) op).getOrigin();
            if (!visited.contains(input)) {
                toVisit.push(input);
            }
        }
    }
    return retOps;
}
Also used : MultipleParallelDo(com.tdunning.plume.local.lazy.op.MultipleParallelDo) ArrayList(java.util.ArrayList) Flatten(com.tdunning.plume.local.lazy.op.Flatten) OneToOneOp(com.tdunning.plume.local.lazy.op.OneToOneOp) DeferredOp(com.tdunning.plume.local.lazy.op.DeferredOp) Stack(java.util.Stack) HashSet(java.util.HashSet)

Example 10 with Flatten

use of com.tdunning.plume.local.lazy.op.Flatten in project Plume by tdunning.

the class Optimizer method fuseParallelDos.

/**
 * Fuse producer-consumer ParallelDos as in : {Orig2 => p2 => Orig1 => p1 => Output} to {Orig2 => p1(p2) => Output}
 * <p>
 * Walks the graph bottom-up from {@code arg}. When the collection is produced by a
 * ParallelDo (p1) whose origin is itself produced by a ParallelDo (p2), both are replaced by
 * a single ParallelDo whose function feeds everything p2's function emits into p1's function.
 * A CombineValues that directly follows a GroupByKey is never fused into its consumer.
 *
 * @param arg  The collection that may have compositions internally.
 */
@SuppressWarnings({ "unchecked", "rawtypes" })
<T> void fuseParallelDos(PCollection<T> arg) {
    LazyCollection<T> output = (LazyCollection<T>) arg;
    if (output.isMaterialized()) {
        // stop condition for recursive algorithm
        return;
    }
    DeferredOp dOp = output.getDeferredOp();
    if (!(dOp instanceof ParallelDo)) {
        // not a ParallelDo
        if (dOp instanceof OneToOneOp) {
            // Recursively apply this function to parent
            fuseParallelDos(((OneToOneOp) dOp).getOrigin());
        } else if (dOp instanceof Flatten) {
            Flatten<T> flatten = (Flatten) dOp;
            // Recursively apply this function to all parents
            for (PCollection<T> col : flatten.getOrigins()) {
                fuseParallelDos(col);
            }
        }
        // Any other kind of op offers nothing to fuse. Returning here (instead of falling
        // through) avoids a ClassCastException on the ParallelDo cast below.
        return;
    }
    ParallelDo p1 = (ParallelDo) dOp;
    LazyCollection orig1 = (LazyCollection) p1.getOrigin();
    if (orig1.isMaterialized()) {
        // p1 reads stored data: there is no producer to fuse with
        return;
    }
    if (!(orig1.getDeferredOp() instanceof ParallelDo)) {
        // Recursively apply this function to parent node
        fuseParallelDos(orig1);
        return;
    }
    // At this point we know ParallelDo fusion can be done -> Perform it
    ParallelDo p2 = (ParallelDo) orig1.getDeferredOp();
    // Lift combine values
    if (p2 instanceof CombineValues) {
        LazyCollection lCol = (LazyCollection) p2.getOrigin();
        if (!lCol.isMaterialized() && lCol.getDeferredOp() instanceof GroupByKey) {
            // Upper parallel do is CombineValues and follows a GroupByKey -> don't join
            fuseParallelDos(orig1);
            return;
        }
    }
    final DoFn f1 = p1.getFunction();
    final DoFn f2 = p2.getFunction();
    // Define the joined function: every value emitted by f2 is processed by f1
    DoFn newFn = new DoFn() {

        @Override
        public void process(Object v, final EmitFn emitter) {
            f2.process(v, new EmitFn() {

                @Override
                public void emit(Object v) {
                    f1.process(v, emitter);
                }
            });
        }
    };
    LazyCollection orig2 = (LazyCollection) p2.getOrigin();
    ParallelDo newPDo = new ParallelDo(newFn, orig2, output);
    // Clean & change pointers
    // NOTE(review): p2 is unlinked from orig2 even when orig1 still has other consumers,
    // which leaves orig1 unreachable from orig2's down ops - confirm later passes cope with it.
    orig2.downOps.remove(p2);
    orig1.downOps.remove(p1);
    orig2.addDownOp(newPDo);
    output.deferredOp = newPDo;
    // Recursively apply this function to the same node => TODO Beware infinite recursion, properly test
    fuseParallelDos(output);
}
Also used : CombineValues(com.tdunning.plume.local.lazy.op.CombineValues) ParallelDo(com.tdunning.plume.local.lazy.op.ParallelDo) MultipleParallelDo(com.tdunning.plume.local.lazy.op.MultipleParallelDo) GroupByKey(com.tdunning.plume.local.lazy.op.GroupByKey) Flatten(com.tdunning.plume.local.lazy.op.Flatten) OneToOneOp(com.tdunning.plume.local.lazy.op.OneToOneOp) DeferredOp(com.tdunning.plume.local.lazy.op.DeferredOp) DoFn(com.tdunning.plume.DoFn) EmitFn(com.tdunning.plume.EmitFn)

Aggregations

Flatten (com.tdunning.plume.local.lazy.op.Flatten): 11, MultipleParallelDo (com.tdunning.plume.local.lazy.op.MultipleParallelDo): 11, DeferredOp (com.tdunning.plume.local.lazy.op.DeferredOp): 10, ParallelDo (com.tdunning.plume.local.lazy.op.ParallelDo): 8, PCollection (com.tdunning.plume.PCollection): 6, OneToOneOp (com.tdunning.plume.local.lazy.op.OneToOneOp): 6, GroupByKey (com.tdunning.plume.local.lazy.op.GroupByKey): 5, ArrayList (java.util.ArrayList): 5, DoFn (com.tdunning.plume.DoFn): 4, EmitFn (com.tdunning.plume.EmitFn): 4, Map (java.util.Map): 4, Pair (com.tdunning.plume.Pair): 3, PlumeObject (com.tdunning.plume.local.lazy.MapRedExecutor.PlumeObject): 2, CombineValues (com.tdunning.plume.local.lazy.op.CombineValues): 2, PCollectionType (com.tdunning.plume.types.PCollectionType): 2, PTableType (com.tdunning.plume.types.PTableType): 2, IOException (java.io.IOException): 2, HashSet (java.util.HashSet): 2, Stack (java.util.Stack): 2, FileSplit (org.apache.hadoop.mapreduce.lib.input.FileSplit): 2