use of com.tdunning.plume.DoFn in project Plume by tdunning.
the class MSCRMapper method map.
/**
 * Routes one input record to the output channels of this MSCR.
 * <p>
 * The input collection the record belongs to is found by matching the path of the current
 * file split against the file of each MSCR input (inputs without a file are assigned
 * {@code tmpFolder + "/" + plumeId} first). If the input is a table, the functions receive a
 * {@code Pair} of key and value; otherwise they receive the value only. Every downstream op
 * of the matched input is then visited: for a {@code MultipleParallelDo} each destination's
 * function is invoked and its emissions are written tagged with the destination's channel;
 * for any other op the record is written through unchanged, tagged with its channel.
 *
 * @param key     key of the input record
 * @param value   value of the input record
 * @param context Hadoop context that receives the channel-tagged output
 * @throws RuntimeException if the file split matches none of the MSCR inputs
 */
@SuppressWarnings("unchecked")
protected void map(WritableComparable key, WritableComparable value, final Mapper<WritableComparable, WritableComparable, PlumeObject, PlumeObject>.Context<WritableComparable, WritableComparable, PlumeObject, PlumeObject> context) throws IOException, InterruptedException {
  LazyCollection<?> l = null;
  FileSplit fS = FileInputSplitWrapper.getFileInputSplit(context);
  // The split path does not change while scanning the inputs, so render it once.
  String splitPath = fS.getPath().toString();
  // Get LazyCollection for this input (according to FileSplit)
  for (PCollection<?> input : mscr.getInputs()) {
    LazyCollection<?> thisL = (LazyCollection<?>) input;
    if (thisL.getFile() == null) {
      // Convention for intermediate results
      thisL.setFile(tmpFolder + "/" + thisL.getPlumeId());
    }
    if (splitPath.startsWith(thisL.getFile()) || splitPath.startsWith("file:" + thisL.getFile())) {
      l = thisL;
      break;
    }
  }
  if (l == null) {
    throw new RuntimeException("Unable to match input split with any MSCR input");
  }
  // If this collection is a table -> process Pair, otherwise process value
  PCollectionType type = l.getType();
  Object toProcess = value;
  if (type instanceof PTableType) {
    toProcess = Pair.create(key, value);
  }
  for (DeferredOp op : l.getDownOps()) {
    if (op instanceof MultipleParallelDo) {
      MultipleParallelDo mPDo = ((MultipleParallelDo) op);
      for (Object entry : mPDo.getDests().entrySet()) {
        Map.Entry<PCollection, DoFn> en = (Map.Entry<PCollection, DoFn>) entry;
        LazyCollection<?> lCol = (LazyCollection<?>) en.getKey();
        // Only the first downstream op of the destination decides the channel.
        DeferredOp childOp = null;
        if (lCol.getDownOps() != null && lCol.getDownOps().size() > 0) {
          childOp = lCol.getDownOps().get(0);
        }
        final Integer channel;
        if (childOp instanceof Flatten) {
          channel = mscr.getNumberedChannels().get(((Flatten) childOp).getDest());
        } else if (childOp instanceof GroupByKey) {
          channel = mscr.getNumberedChannels().get(((GroupByKey) childOp).getOrigin());
        } else {
          // bypass channel?
          channel = mscr.getNumberedChannels().get(en.getKey());
        }
        if (channel == null) {
          // This destination is not part of this MSCR: skip only this destination.
          // Returning here would also drop the record for every remaining destination
          // and every remaining downstream op.
          continue;
        }
        // Call parallelDo function
        en.getValue().process(toProcess, new EmitFn() {
          @Override
          public void emit(Object v) {
            try {
              if (v instanceof Pair) {
                Pair p = (Pair) v;
                context.write(new PlumeObject((WritableComparable) p.getKey(), channel), new PlumeObject((WritableComparable) p.getValue(), channel));
              } else {
                // Non-pair emissions are written as both key and value.
                context.write(new PlumeObject((WritableComparable) v, channel), new PlumeObject((WritableComparable) v, channel));
              }
            } catch (Exception e) {
              // TODO How to report this
              // NOTE(review): write failures are only printed, so the record is lost silently.
              e.printStackTrace();
            }
          }
        });
      }
    } else {
      // NOTE(review): reassigning l here also changes the channel lookup for any later
      // non-MultipleParallelDo op in this loop - confirm that this is intended.
      if (op instanceof Flatten) {
        l = (LazyCollection) ((Flatten) op).getDest();
      }
      int channel = mscr.getNumberedChannels().get(l);
      if (toProcess instanceof Pair) {
        context.write(new PlumeObject(key, channel), new PlumeObject(value, channel));
      } else {
        context.write(new PlumeObject(value, channel), new PlumeObject(value, channel));
      }
    }
  }
}
use of com.tdunning.plume.DoFn in project Plume by tdunning.
the class MSCRReducer method reduce.
/**
 * Writes the values grouped under one key to the named output of the key's channel.
 * <p>
 * The output channel is looked up from the key's {@code sourceId}. Without a reducer on the
 * channel, every value is written directly: keyed by the group key when the channel output
 * is a {@code PTable}, keyed by {@code NullWritable} otherwise. With a reducer, all values
 * are first collected into a list, the reducer function is applied to the pair of group key
 * and list, and whatever it emits is written to the same named output.
 *
 * @param arg0   group key; carries the channel number in {@code sourceId}
 * @param values values grouped under the key
 * @param arg2   Hadoop reducer context (not used directly; output goes through {@code mos})
 */
@SuppressWarnings("unchecked")
protected void reduce(final PlumeObject arg0, java.lang.Iterable<PlumeObject> values, Reducer<PlumeObject, PlumeObject, NullWritable, NullWritable>.Context<PlumeObject, PlumeObject, NullWritable, NullWritable> arg2) throws IOException, InterruptedException {
  PCollection channelCollection = mscr.getChannelByNumber().get(arg0.sourceId);
  OutputChannel outputChannel = mscr.getOutputChannels().get(channelCollection);

  if (outputChannel.reducer == null) {
    // No reducer on this channel: pass every key/value pair straight through.
    for (PlumeObject current : values) {
      if (outputChannel.output instanceof PTable) {
        mos.write(arg0.sourceId + "", arg0.obj, current.obj);
      } else {
        mos.write(arg0.sourceId + "", NullWritable.get(), current.obj);
      }
    }
    return;
  }

  // A reducer is present: gather the whole group, then hand it to the reducer function.
  ParallelDo reducerOp = outputChannel.reducer;
  // TODO how to check / report this
  DoFn reduceFn = reducerOp.getFunction();
  List<WritableComparable> grouped = Lists.newArrayList();
  for (PlumeObject current : values) {
    grouped.add(current.obj);
  }
  EmitFn writer = new EmitFn() {
    @Override
    public void emit(Object v) {
      try {
        if (!(v instanceof Pair)) {
          mos.write(arg0.sourceId + "", NullWritable.get(), (WritableComparable) v);
        } else {
          Pair emitted = (Pair) v;
          mos.write(arg0.sourceId + "", emitted.getKey(), emitted.getValue());
        }
      } catch (Exception e) {
        // TODO How to report this
        e.printStackTrace();
      }
    }
  };
  reduceFn.process(Pair.create(arg0.obj, grouped), writer);
}
use of com.tdunning.plume.DoFn in project Plume by tdunning.
the class BasicOptimizerTest method testParallelDoSiblingFusion.
/**
 * Verifies that two sibling ParallelDo operations reading the same input are fused into a
 * single MultipleParallelDo, and that both outputs still produce the same results afterwards.
 */
@SuppressWarnings("unchecked")
@Test
public void testParallelDoSiblingFusion() {
  // Get Plume runtime
  LazyPlume plume = new LazyPlume();
  // Create simple data
  PCollection<Integer> input = plume.fromJava(Lists.newArrayList(1, 2, 3));
  PCollection<Integer> output1 = input.map(plusOne, null);
  PCollection<Integer> output2 = input.map(timesTwo, null);
  LazyCollection<Integer> lInput = (LazyCollection<Integer>) input;
  LazyCollection<Integer> lOutput1 = (LazyCollection<Integer>) output1;
  LazyCollection<Integer> lOutput2 = (LazyCollection<Integer>) output2;
  // assertEquals takes (expected, actual); keeping that order makes failure messages read correctly
  assertEquals(2, lInput.downOps.size());
  // Execute and assert the result before optimizing
  executeAndAssert(lOutput1, new Integer[] { 2, 3, 4 });
  executeAndAssert(lOutput2, new Integer[] { 2, 4, 6 });
  // Get an Optimizer
  Optimizer optimizer = new Optimizer();
  // one output is enough to fuse both because they share the parent
  optimizer.fuseSiblingParallelDos(output1);
  // Check that the input's child ops have shrunk to 1
  assertEquals(1, lInput.downOps.size());
  DeferredOp op = lInput.downOps.get(0);
  // Check that there is only one op pointing to both outputs
  assertEquals(op, lOutput1.deferredOp);
  assertEquals(op, lOutput2.deferredOp);
  assertTrue(op instanceof MultipleParallelDo);
  MultipleParallelDo<Integer> mPDo = (MultipleParallelDo<Integer>) op;
  Map<PCollection<?>, DoFn<Integer, ?>> mapOfPDos = mPDo.getDests();
  // Check that the map of functions in MultipleParallelDo is correct
  assertEquals(plusOne, mapOfPDos.get(output1));
  assertEquals(timesTwo, mapOfPDos.get(output2));
  // Execute and assert the result after optimizing
  executeAndAssert(lOutput1, new Integer[] { 2, 3, 4 });
  executeAndAssert(lOutput2, new Integer[] { 2, 4, 6 });
}
use of com.tdunning.plume.DoFn in project Plume by tdunning.
the class MSCRMapper method map.
/**
 * Routes one input record to the output channels of this MSCR.
 * <p>
 * The input collection the record belongs to is found by matching the path of the current
 * file split against the file of each MSCR input (inputs without a file are assigned
 * {@code tmpFolder + "/" + plumeId} first). If the input is a table, the functions receive a
 * {@code Pair} of key and value; otherwise they receive the value only. Every downstream op
 * of the matched input is then visited: for a {@code MultipleParallelDo} each destination's
 * function is invoked and its emissions are written tagged with the destination's channel;
 * for any other op the record is written through unchanged, tagged with its channel.
 *
 * @param key     key of the input record
 * @param value   value of the input record
 * @param context Hadoop context that receives the channel-tagged output
 * @throws RuntimeException if the file split matches none of the MSCR inputs
 */
@SuppressWarnings("unchecked")
protected void map(WritableComparable key, WritableComparable value, final Mapper<WritableComparable, WritableComparable, PlumeObject, PlumeObject>.Context context) throws IOException, InterruptedException {
  LazyCollection<?> l = null;
  FileSplit fS = FileInputSplitWrapper.getFileInputSplit(context);
  // The split path does not change while scanning the inputs, so render it once.
  String splitPath = fS.getPath().toString();
  // Get LazyCollection for this input (according to FileSplit)
  for (PCollection<?> input : mscr.getInputs()) {
    LazyCollection<?> thisL = (LazyCollection<?>) input;
    if (thisL.getFile() == null) {
      // Convention for intermediate results
      thisL.setFile(tmpFolder + "/" + thisL.getPlumeId());
    }
    if (splitPath.startsWith(thisL.getFile()) || splitPath.startsWith("file:" + thisL.getFile())) {
      l = thisL;
      break;
    }
  }
  if (l == null) {
    throw new RuntimeException("Unable to match input split with any MSCR input");
  }
  // If this collection is a table -> process Pair, otherwise process value
  PCollectionType type = l.getType();
  Object toProcess = value;
  if (type instanceof PTableType) {
    toProcess = Pair.create(key, value);
  }
  for (DeferredOp op : l.getDownOps()) {
    if (op instanceof MultipleParallelDo) {
      MultipleParallelDo mPDo = ((MultipleParallelDo) op);
      for (Object entry : mPDo.getDests().entrySet()) {
        Map.Entry<PCollection, DoFn> en = (Map.Entry<PCollection, DoFn>) entry;
        LazyCollection<?> lCol = (LazyCollection<?>) en.getKey();
        // Only the first downstream op of the destination decides the channel.
        DeferredOp childOp = null;
        if (lCol.getDownOps() != null && lCol.getDownOps().size() > 0) {
          childOp = lCol.getDownOps().get(0);
        }
        final Integer channel;
        if (childOp instanceof Flatten) {
          channel = mscr.getNumberedChannels().get(((Flatten) childOp).getDest());
        } else if (childOp instanceof GroupByKey) {
          channel = mscr.getNumberedChannels().get(((GroupByKey) childOp).getOrigin());
        } else {
          // bypass channel?
          channel = mscr.getNumberedChannels().get(en.getKey());
        }
        if (channel == null) {
          // This destination is not part of this MSCR: skip only this destination.
          // Returning here would also drop the record for every remaining destination
          // and every remaining downstream op.
          continue;
        }
        // Call parallelDo function
        en.getValue().process(toProcess, new EmitFn() {
          @Override
          public void emit(Object v) {
            try {
              if (v instanceof Pair) {
                Pair p = (Pair) v;
                context.write(new PlumeObject((WritableComparable) p.getKey(), channel), new PlumeObject((WritableComparable) p.getValue(), channel));
              } else {
                // Non-pair emissions are written as both key and value.
                context.write(new PlumeObject((WritableComparable) v, channel), new PlumeObject((WritableComparable) v, channel));
              }
            } catch (Exception e) {
              // TODO How to report this
              // NOTE(review): write failures are only printed, so the record is lost silently.
              e.printStackTrace();
            }
          }
        });
      }
    } else {
      // NOTE(review): reassigning l here also changes the channel lookup for any later
      // non-MultipleParallelDo op in this loop - confirm that this is intended.
      if (op instanceof Flatten) {
        l = (LazyCollection) ((Flatten) op).getDest();
      }
      int channel = mscr.getNumberedChannels().get(l);
      if (toProcess instanceof Pair) {
        context.write(new PlumeObject(key, channel), new PlumeObject(value, channel));
      } else {
        context.write(new PlumeObject(value, channel), new PlumeObject(value, channel));
      }
    }
  }
}
use of com.tdunning.plume.DoFn in project Plume by tdunning.
the class LocalExecutor method execute.
/**
 * Computes, in memory, the contents of a single output collection.
 * <p>
 * A materialized collection is returned as-is. Otherwise the deferred operation that
 * produces the collection is evaluated, recursively executing the collection(s) it reads:
 * a Flatten concatenates its origins, a ParallelDo or MultipleParallelDo feeds every parent
 * element through the relevant function, and a GroupByKey groups the parent's pairs by key.
 * Any other operation yields an empty result.
 *
 * @param <T>    element type of the output collection
 * @param output the collection to compute
 * @return the stored data if {@code output} is materialized, otherwise the computed elements
 */
@SuppressWarnings({ "unchecked", "rawtypes" })
public <T> Iterable<T> execute(LazyCollection<T> output) {
  if (output.isMaterialized()) {
    // Already computed: nothing to execute.
    return output.getData();
  }

  DeferredOp op = output.getDeferredOp();
  final List<T> collected = Lists.newArrayList();

  // Flatten: concatenate the results of all origin collections.
  if (op instanceof Flatten) {
    for (PCollection<T> origin : ((Flatten<T>) op).getOrigins()) {
      for (T element : execute((LazyCollection<T>) origin)) {
        collected.add(element);
      }
    }
    return collected;
  }

  // Sink that appends whatever a function emits to the result list.
  EmitFn<T> sink = new EmitFn<T>() {
    @Override
    public void emit(T v) {
      collected.add(v);
    }
  };

  if (op instanceof ParallelDo) {
    ParallelDo parallelDo = (ParallelDo) op;
    Iterable source = execute((LazyCollection) parallelDo.getOrigin());
    for (Object element : source) {
      parallelDo.getFunction().process(element, sink);
    }
    return collected;
  }

  if (op instanceof MultipleParallelDo) {
    // Several parallel operations read the same collection; this executor only computes
    // the requested collection, not its siblings.
    MultipleParallelDo multiDo = (MultipleParallelDo) op;
    Iterable source = execute((LazyCollection) multiDo.getOrigin());
    // Pick the function registered for this particular output.
    DoFn function = (DoFn) multiDo.getDests().get(output);
    for (Object element : source) {
      function.process(element, sink);
    }
    return collected;
  }

  if (op instanceof GroupByKey) {
    GroupByKey groupBy = (GroupByKey) op;
    Iterable source = execute(groupBy.getOrigin());
    // In-memory grouping: one list of values per distinct key.
    Map<Object, List> groups = Maps.newHashMap();
    for (Object element : source) {
      Pair pair = (Pair) element;
      List bucket = groups.get(pair.getKey());
      if (bucket == null) {
        bucket = new ArrayList();
        groups.put(pair.getKey(), bucket);
      }
      bucket.add(pair.getValue());
    }
    for (Map.Entry<Object, List> group : groups.entrySet()) {
      collected.add((T) new Pair(group.getKey(), group.getValue()));
    }
  }

  return collected;
}
Aggregations