Search in sources :

Example 1 with DoFn

use of com.tdunning.plume.DoFn in project Plume by tdunning.

the class MSCRMapper method map.

/**
 * Routes one input record to the output channels of this MSCR.
 * <p>
 * The input collection is found by matching the path of the current file split against the file
 * of each MSCR input; an input without a file first gets {@code tmpFolder + "/" + plumeId}
 * assigned. The record is then handed to every operation hanging off that collection: the
 * functions of a {@code MultipleParallelDo} are invoked and whatever they emit is written out,
 * while for any other operation the record itself is written through. Everything written is
 * wrapped in {@code PlumeObject}s tagged with the channel number.
 *
 * @param key the record key; used when the matched input has a table type
 * @param value the record value
 * @param context the Hadoop context that receives the output
 * @throws IOException if writing a pass-through record fails
 * @throws InterruptedException if writing a pass-through record is interrupted
 * @throws RuntimeException if no MSCR input matches the split, or if a write triggered from a
 *         function's emitter fails (the original failure is kept as the cause)
 */
@SuppressWarnings("unchecked")
protected void map(WritableComparable key, WritableComparable value, final Mapper<WritableComparable, WritableComparable, PlumeObject, PlumeObject>.Context<WritableComparable, WritableComparable, PlumeObject, PlumeObject> context) throws IOException, InterruptedException {
    LazyCollection<?> l = null;
    FileSplit fS = FileInputSplitWrapper.getFileInputSplit(context);
    // Get LazyCollection for this input (according to FileSplit)
    for (PCollection<?> input : mscr.getInputs()) {
        LazyCollection<?> thisL = (LazyCollection<?>) input;
        if (thisL.getFile() == null) {
            // Convention for intermediate results
            thisL.setFile(tmpFolder + "/" + thisL.getPlumeId());
        }
        if (fS.getPath().toString().startsWith(thisL.getFile()) || fS.getPath().toString().startsWith("file:" + thisL.getFile())) {
            l = thisL;
            break;
        }
    }
    if (l == null) {
        throw new RuntimeException("Unable to match input split with any MSCR input");
    }
    // If this collection is a table -> process Pair, otherwise process value
    PCollectionType type = l.getType();
    Object toProcess = value;
    if (type instanceof PTableType) {
        toProcess = Pair.create(key, value);
    }
    for (DeferredOp op : l.getDownOps()) {
        if (op instanceof MultipleParallelDo) {
            MultipleParallelDo mPDo = ((MultipleParallelDo) op);
            for (Object entry : mPDo.getDests().entrySet()) {
                Map.Entry<PCollection, DoFn> en = (Map.Entry<PCollection, DoFn>) entry;
                LazyCollection<?> lCol = (LazyCollection<?>) en.getKey();
                DeferredOp childOp = null;
                if (lCol.getDownOps() != null && lCol.getDownOps().size() > 0) {
                    childOp = lCol.getDownOps().get(0);
                }
                final Integer channel;
                if (childOp instanceof Flatten) {
                    channel = mscr.getNumberedChannels().get(((Flatten) childOp).getDest());
                } else if (childOp instanceof GroupByKey) {
                    channel = mscr.getNumberedChannels().get(((GroupByKey) childOp).getOrigin());
                } else {
                    // bypass channel?
                    channel = mscr.getNumberedChannels().get(en.getKey());
                }
                if (channel == null) {
                    // This destination is not for this MSCR - skip only this destination so
                    // the remaining destinations and operations still see the record
                    continue;
                }
                // Call parallelDo function
                en.getValue().process(toProcess, new EmitFn() {

                    @Override
                    public void emit(Object v) {
                        try {
                            if (v instanceof Pair) {
                                Pair p = (Pair) v;
                                context.write(new PlumeObject((WritableComparable) p.getKey(), channel), new PlumeObject((WritableComparable) p.getValue(), channel));
                            } else {
                                // Plain values are written as both key and value
                                context.write(new PlumeObject((WritableComparable) v, channel), new PlumeObject((WritableComparable) v, channel));
                            }
                        } catch (IOException e) {
                            // Surface the failure instead of silently dropping the record
                            throw new RuntimeException("Unable to write emitted value to channel " + channel, e);
                        } catch (InterruptedException e) {
                            // Keep the interrupt visible to the caller before propagating
                            Thread.currentThread().interrupt();
                            throw new RuntimeException("Interrupted while writing emitted value to channel " + channel, e);
                        }
                    }
                });
            }
        } else {
            // Resolve the channel owner per operation, so that a Flatten destination does not
            // leak into the channel lookup of the operations that follow it
            LazyCollection<?> target = l;
            if (op instanceof Flatten) {
                target = (LazyCollection) ((Flatten) op).getDest();
            }
            int channel = mscr.getNumberedChannels().get(target);
            if (toProcess instanceof Pair) {
                context.write(new PlumeObject(key, channel), new PlumeObject(value, channel));
            } else {
                context.write(new PlumeObject(value, channel), new PlumeObject(value, channel));
            }
        }
    }
}
Also used : MultipleParallelDo(com.tdunning.plume.local.lazy.op.MultipleParallelDo) GroupByKey(com.tdunning.plume.local.lazy.op.GroupByKey) PlumeObject(com.tdunning.plume.local.lazy.MapRedExecutor.PlumeObject) PTableType(com.tdunning.plume.types.PTableType) Flatten(com.tdunning.plume.local.lazy.op.Flatten) PCollectionType(com.tdunning.plume.types.PCollectionType) FileSplit(org.apache.hadoop.mapreduce.lib.input.FileSplit) DeferredOp(com.tdunning.plume.local.lazy.op.DeferredOp) IOException(java.io.IOException) PCollection(com.tdunning.plume.PCollection) DoFn(com.tdunning.plume.DoFn) EmitFn(com.tdunning.plume.EmitFn) PlumeObject(com.tdunning.plume.local.lazy.MapRedExecutor.PlumeObject) Map(java.util.Map) Pair(com.tdunning.plume.Pair)

Example 2 with DoFn

use of com.tdunning.plume.DoFn in project Plume by tdunning.

the class MSCRReducer method reduce.

/**
 * Writes the values grouped under one key to the named output of their channel.
 * <p>
 * The channel is resolved from the key's {@code sourceId}. If the channel has a reducer, all
 * values are collected into a list and passed, paired with the key object, to the reducer
 * function; every emitted {@code Pair} is written as key/value and anything else is written with
 * a {@code NullWritable} key. Without a reducer each value is written directly: together with
 * the key when the channel output is a {@code PTable}, otherwise with a {@code NullWritable}
 * key. All writes go through {@code mos} using {@code sourceId + ""} as the output name.
 *
 * @param arg0 the grouped key, carrying the source id of its channel
 * @param values the values grouped under the key
 * @param arg2 the Hadoop reducer context (not used by this method)
 * @throws IOException if a direct write fails
 * @throws InterruptedException if a direct write is interrupted
 * @throws IllegalStateException if no output channel is registered for the key's source id
 * @throws RuntimeException if a write triggered from the reducer's emitter fails (the original
 *         failure is kept as the cause)
 */
@SuppressWarnings("unchecked")
protected void reduce(final PlumeObject arg0, java.lang.Iterable<PlumeObject> values, Reducer<PlumeObject, PlumeObject, NullWritable, NullWritable>.Context<PlumeObject, PlumeObject, NullWritable, NullWritable> arg2) throws IOException, InterruptedException {
    PCollection col = mscr.getChannelByNumber().get(arg0.sourceId);
    OutputChannel oC = mscr.getOutputChannels().get(col);
    if (oC == null) {
        // Fail with a descriptive message rather than a bare NullPointerException below
        throw new IllegalStateException("No output channel registered for source id " + arg0.sourceId);
    }
    if (oC.reducer != null) {
        // apply reducer
        ParallelDo pDo = oC.reducer;
        DoFn reducer = pDo.getFunction();
        // The reducer function receives the complete group at once
        List<WritableComparable> vals = Lists.newArrayList();
        for (PlumeObject val : values) {
            vals.add(val.obj);
        }
        reducer.process(Pair.create(arg0.obj, vals), new EmitFn() {

            @Override
            public void emit(Object v) {
                try {
                    if (v instanceof Pair) {
                        Pair p = (Pair) v;
                        mos.write(arg0.sourceId + "", p.getKey(), p.getValue());
                    } else {
                        mos.write(arg0.sourceId + "", NullWritable.get(), (WritableComparable) v);
                    }
                } catch (IOException e) {
                    // Surface the failure instead of silently dropping the record
                    throw new RuntimeException("Unable to write reducer output for source id " + arg0.sourceId, e);
                } catch (InterruptedException e) {
                    // Keep the interrupt visible to the caller before propagating
                    Thread.currentThread().interrupt();
                    throw new RuntimeException("Interrupted while writing reducer output for source id " + arg0.sourceId, e);
                }
            }
        });
    } else {
        // direct writing - write all key, value pairs
        for (PlumeObject val : values) {
            if (oC.output instanceof PTable) {
                mos.write(arg0.sourceId + "", arg0.obj, val.obj);
            } else {
                mos.write(arg0.sourceId + "", NullWritable.get(), val.obj);
            }
        }
    }
}
Also used : ParallelDo(com.tdunning.plume.local.lazy.op.ParallelDo) PlumeObject(com.tdunning.plume.local.lazy.MapRedExecutor.PlumeObject) IOException(java.io.IOException) PTable(com.tdunning.plume.PTable) PCollection(com.tdunning.plume.PCollection) DoFn(com.tdunning.plume.DoFn) EmitFn(com.tdunning.plume.EmitFn) WritableComparable(org.apache.hadoop.io.WritableComparable) OutputChannel(com.tdunning.plume.local.lazy.MSCR.OutputChannel) PlumeObject(com.tdunning.plume.local.lazy.MapRedExecutor.PlumeObject) Pair(com.tdunning.plume.Pair)

Example 3 with DoFn

use of com.tdunning.plume.DoFn in project Plume by tdunning.

the class BasicOptimizerTest method testParallelDoSiblingFusion.

/**
 * Checks that {@code Optimizer.fuseSiblingParallelDos} merges two map operations reading the
 * same input into a single {@code MultipleParallelDo}, and that both outputs still evaluate to
 * the same values after the fusion.
 */
@SuppressWarnings("unchecked")
@Test
public void testParallelDoSiblingFusion() {
    // Get Plume runtime
    LazyPlume plume = new LazyPlume();
    // Create simple data
    PCollection<Integer> input = plume.fromJava(Lists.newArrayList(1, 2, 3));
    PCollection<Integer> output1 = input.map(plusOne, null);
    PCollection<Integer> output2 = input.map(timesTwo, null);
    LazyCollection<Integer> lInput = (LazyCollection<Integer>) input;
    LazyCollection<Integer> lOutput1 = (LazyCollection<Integer>) output1;
    LazyCollection<Integer> lOutput2 = (LazyCollection<Integer>) output2;
    // Expected value goes first so that a failure message reads correctly
    assertEquals(2, lInput.downOps.size());
    // Execute and assert the result before optimizing
    executeAndAssert(lOutput1, new Integer[] { 2, 3, 4 });
    executeAndAssert(lOutput2, new Integer[] { 2, 4, 6 });
    // Get an Optimizer
    Optimizer optimizer = new Optimizer();
    // one output is enough to fuse both because they share the parent
    optimizer.fuseSiblingParallelDos(output1);
    // Check that the input's child ops have shrunk to 1
    assertEquals(1, lInput.downOps.size());
    DeferredOp op = lInput.downOps.get(0);
    // Check that there is only one op pointing to both outputs
    assertEquals(op, lOutput1.deferredOp);
    assertEquals(op, lOutput2.deferredOp);
    assertTrue(op instanceof MultipleParallelDo);
    MultipleParallelDo<Integer> mPDo = (MultipleParallelDo<Integer>) op;
    Map<PCollection<?>, DoFn<Integer, ?>> mapOfPDos = mPDo.getDests();
    // Check that the map of functions in MultipleParallelDo is correct
    assertEquals(plusOne, mapOfPDos.get(output1));
    assertEquals(timesTwo, mapOfPDos.get(output2));
    // Execute and assert the result after optimizing
    executeAndAssert(lOutput1, new Integer[] { 2, 3, 4 });
    executeAndAssert(lOutput2, new Integer[] { 2, 4, 6 });
}
Also used : PCollection(com.tdunning.plume.PCollection) MultipleParallelDo(com.tdunning.plume.local.lazy.op.MultipleParallelDo) DoFn(com.tdunning.plume.DoFn) DeferredOp(com.tdunning.plume.local.lazy.op.DeferredOp) Test(org.junit.Test)

Example 4 with DoFn

use of com.tdunning.plume.DoFn in project Plume by tdunning.

the class MSCRMapper method map.

/**
 * Routes one input record to the output channels of this MSCR.
 * <p>
 * The input collection is found by matching the path of the current file split against the file
 * of each MSCR input; an input without a file first gets {@code tmpFolder + "/" + plumeId}
 * assigned. The record is then handed to every operation hanging off that collection: the
 * functions of a {@code MultipleParallelDo} are invoked and whatever they emit is written out,
 * while for any other operation the record itself is written through. Everything written is
 * wrapped in {@code PlumeObject}s tagged with the channel number.
 *
 * @param key the record key; used when the matched input has a table type
 * @param value the record value
 * @param context the Hadoop context that receives the output
 * @throws IOException if writing a pass-through record fails
 * @throws InterruptedException if writing a pass-through record is interrupted
 * @throws RuntimeException if no MSCR input matches the split, or if a write triggered from a
 *         function's emitter fails (the original failure is kept as the cause)
 */
@SuppressWarnings("unchecked")
protected void map(WritableComparable key, WritableComparable value, final Mapper<WritableComparable, WritableComparable, PlumeObject, PlumeObject>.Context context) throws IOException, InterruptedException {
    LazyCollection<?> l = null;
    FileSplit fS = FileInputSplitWrapper.getFileInputSplit(context);
    // Get LazyCollection for this input (according to FileSplit)
    for (PCollection<?> input : mscr.getInputs()) {
        LazyCollection<?> thisL = (LazyCollection<?>) input;
        if (thisL.getFile() == null) {
            // Convention for intermediate results
            thisL.setFile(tmpFolder + "/" + thisL.getPlumeId());
        }
        if (fS.getPath().toString().startsWith(thisL.getFile()) || fS.getPath().toString().startsWith("file:" + thisL.getFile())) {
            l = thisL;
            break;
        }
    }
    if (l == null) {
        throw new RuntimeException("Unable to match input split with any MSCR input");
    }
    // If this collection is a table -> process Pair, otherwise process value
    PCollectionType type = l.getType();
    Object toProcess = value;
    if (type instanceof PTableType) {
        toProcess = Pair.create(key, value);
    }
    for (DeferredOp op : l.getDownOps()) {
        if (op instanceof MultipleParallelDo) {
            MultipleParallelDo mPDo = ((MultipleParallelDo) op);
            for (Object entry : mPDo.getDests().entrySet()) {
                Map.Entry<PCollection, DoFn> en = (Map.Entry<PCollection, DoFn>) entry;
                LazyCollection<?> lCol = (LazyCollection<?>) en.getKey();
                DeferredOp childOp = null;
                if (lCol.getDownOps() != null && lCol.getDownOps().size() > 0) {
                    childOp = lCol.getDownOps().get(0);
                }
                final Integer channel;
                if (childOp instanceof Flatten) {
                    channel = mscr.getNumberedChannels().get(((Flatten) childOp).getDest());
                } else if (childOp instanceof GroupByKey) {
                    channel = mscr.getNumberedChannels().get(((GroupByKey) childOp).getOrigin());
                } else {
                    // bypass channel?
                    channel = mscr.getNumberedChannels().get(en.getKey());
                }
                if (channel == null) {
                    // This destination is not for this MSCR - skip only this destination so
                    // the remaining destinations and operations still see the record
                    continue;
                }
                // Call parallelDo function
                en.getValue().process(toProcess, new EmitFn() {

                    @Override
                    public void emit(Object v) {
                        try {
                            if (v instanceof Pair) {
                                Pair p = (Pair) v;
                                context.write(new PlumeObject((WritableComparable) p.getKey(), channel), new PlumeObject((WritableComparable) p.getValue(), channel));
                            } else {
                                // Plain values are written as both key and value
                                context.write(new PlumeObject((WritableComparable) v, channel), new PlumeObject((WritableComparable) v, channel));
                            }
                        } catch (IOException e) {
                            // Surface the failure instead of silently dropping the record
                            throw new RuntimeException("Unable to write emitted value to channel " + channel, e);
                        } catch (InterruptedException e) {
                            // Keep the interrupt visible to the caller before propagating
                            Thread.currentThread().interrupt();
                            throw new RuntimeException("Interrupted while writing emitted value to channel " + channel, e);
                        }
                    }
                });
            }
        } else {
            // Resolve the channel owner per operation, so that a Flatten destination does not
            // leak into the channel lookup of the operations that follow it
            LazyCollection<?> target = l;
            if (op instanceof Flatten) {
                target = (LazyCollection) ((Flatten) op).getDest();
            }
            int channel = mscr.getNumberedChannels().get(target);
            if (toProcess instanceof Pair) {
                context.write(new PlumeObject(key, channel), new PlumeObject(value, channel));
            } else {
                context.write(new PlumeObject(value, channel), new PlumeObject(value, channel));
            }
        }
    }
}
Also used : MultipleParallelDo(com.tdunning.plume.local.lazy.op.MultipleParallelDo) GroupByKey(com.tdunning.plume.local.lazy.op.GroupByKey) PlumeObject(com.tdunning.plume.local.lazy.MapRedExecutor.PlumeObject) PTableType(com.tdunning.plume.types.PTableType) Flatten(com.tdunning.plume.local.lazy.op.Flatten) PCollectionType(com.tdunning.plume.types.PCollectionType) FileSplit(org.apache.hadoop.mapreduce.lib.input.FileSplit) DeferredOp(com.tdunning.plume.local.lazy.op.DeferredOp) IOException(java.io.IOException) PCollection(com.tdunning.plume.PCollection) DoFn(com.tdunning.plume.DoFn) EmitFn(com.tdunning.plume.EmitFn) PlumeObject(com.tdunning.plume.local.lazy.MapRedExecutor.PlumeObject) Map(java.util.Map) Pair(com.tdunning.plume.Pair)

Example 5 with DoFn

use of com.tdunning.plume.DoFn in project Plume by tdunning.

the class LocalExecutor method execute.

/**
 * Computes the contents of a lazy collection for a one-output flow.
 * <p>
 * A materialized collection hands back its stored data. Otherwise the deferred operation that
 * produces the collection is evaluated in memory, recursively executing the collections it
 * reads from. Flatten, ParallelDo, MultipleParallelDo and GroupByKey are handled; for any other
 * operation the returned list stays empty.
 *
 * @param <T> element type of the collection
 * @param output the collection to compute
 * @return the stored data if {@code output} is materialized, otherwise a newly built list
 */
@SuppressWarnings({ "unchecked", "rawtypes" })
public <T> Iterable<T> execute(LazyCollection<T> output) {
    // Materialized collections need no further work
    if (output.isMaterialized()) {
        return output.getData();
    }
    final DeferredOp producer = output.getDeferredOp();
    final List<T> collected = Lists.newArrayList();

    // Flatten: concatenate the results of every origin, in order
    if (producer instanceof Flatten) {
        for (PCollection<T> origin : ((Flatten<T>) producer).getOrigins()) {
            for (T element : execute((LazyCollection<T>) origin)) {
                collected.add(element);
            }
        }
        return collected;
    }

    // Everything a function emits ends up in the result list
    EmitFn<T> sink = new EmitFn<T>() {

        @Override
        public void emit(T v) {
            collected.add(v);
        }
    };

    // ParallelDo: feed every element of the origin to the function
    if (producer instanceof ParallelDo) {
        ParallelDo parallelDo = (ParallelDo) producer;
        Iterable source = execute((LazyCollection) parallelDo.getOrigin());
        for (Object element : source) {
            parallelDo.getFunction().process(element, sink);
        }
        return collected;
    }

    // MultipleParallelDo: several operations read the same collection; only the function
    // registered for the requested output is run, its siblings are not computed here
    if (producer instanceof MultipleParallelDo) {
        MultipleParallelDo multiDo = (MultipleParallelDo) producer;
        Iterable source = execute((LazyCollection) multiDo.getOrigin());
        DoFn ownFunction = (DoFn) multiDo.getDests().get(output);
        for (Object element : source) {
            ownFunction.process(element, sink);
        }
        return collected;
    }

    // GroupByKey: group the values of the origin's pairs by key, in memory
    if (producer instanceof GroupByKey) {
        GroupByKey groupByKey = (GroupByKey) producer;
        Iterable source = execute(groupByKey.getOrigin());
        Map<Object, List> grouped = Maps.newHashMap();
        for (Object element : source) {
            Pair pair = (Pair) element;
            List bucket = grouped.get(pair.getKey());
            if (bucket == null) {
                bucket = new ArrayList();
                grouped.put(pair.getKey(), bucket);
            }
            bucket.add(pair.getValue());
        }
        for (Map.Entry<Object, List> group : grouped.entrySet()) {
            collected.add((T) new Pair(group.getKey(), group.getValue()));
        }
    }
    return collected;
}
Also used : ParallelDo(com.tdunning.plume.local.lazy.op.ParallelDo) MultipleParallelDo(com.tdunning.plume.local.lazy.op.MultipleParallelDo) MultipleParallelDo(com.tdunning.plume.local.lazy.op.MultipleParallelDo) GroupByKey(com.tdunning.plume.local.lazy.op.GroupByKey) Flatten(com.tdunning.plume.local.lazy.op.Flatten) ArrayList(java.util.ArrayList) DeferredOp(com.tdunning.plume.local.lazy.op.DeferredOp) DoFn(com.tdunning.plume.DoFn) EmitFn(com.tdunning.plume.EmitFn) ArrayList(java.util.ArrayList) List(java.util.List) Map(java.util.Map) Pair(com.tdunning.plume.Pair)

Aggregations

DoFn (com.tdunning.plume.DoFn)7 EmitFn (com.tdunning.plume.EmitFn)6 PCollection (com.tdunning.plume.PCollection)5 Pair (com.tdunning.plume.Pair)5 DeferredOp (com.tdunning.plume.local.lazy.op.DeferredOp)5 MultipleParallelDo (com.tdunning.plume.local.lazy.op.MultipleParallelDo)5 PlumeObject (com.tdunning.plume.local.lazy.MapRedExecutor.PlumeObject)4 Flatten (com.tdunning.plume.local.lazy.op.Flatten)4 GroupByKey (com.tdunning.plume.local.lazy.op.GroupByKey)4 ParallelDo (com.tdunning.plume.local.lazy.op.ParallelDo)4 IOException (java.io.IOException)4 Map (java.util.Map)3 PTable (com.tdunning.plume.PTable)2 OutputChannel (com.tdunning.plume.local.lazy.MSCR.OutputChannel)2 PCollectionType (com.tdunning.plume.types.PCollectionType)2 PTableType (com.tdunning.plume.types.PTableType)2 WritableComparable (org.apache.hadoop.io.WritableComparable)2 FileSplit (org.apache.hadoop.mapreduce.lib.input.FileSplit)2 CombineValues (com.tdunning.plume.local.lazy.op.CombineValues)1 OneToOneOp (com.tdunning.plume.local.lazy.op.OneToOneOp)1