Use of com.tdunning.plume.local.lazy.op.GroupByKey in project Plume by tdunning.
The class MSCRMapper, method map.
@SuppressWarnings("unchecked")
protected void map(WritableComparable key, WritableComparable value, final Mapper<WritableComparable, WritableComparable, PlumeObject, PlumeObject>.Context<WritableComparable, WritableComparable, PlumeObject, PlumeObject> context) throws IOException, InterruptedException {
LazyCollection<?> l = null;
FileSplit fS = FileInputSplitWrapper.getFileInputSplit(context);
// Get LazyCollection for this input (according to FileSplit)
for (PCollection<?> input : mscr.getInputs()) {
LazyCollection<?> thisL = (LazyCollection<?>) input;
if (thisL.getFile() == null) {
// Convention for intermediate results
thisL.setFile(tmpFolder + "/" + thisL.getPlumeId());
}
if (fS.getPath().toString().startsWith(thisL.getFile()) || fS.getPath().toString().startsWith("file:" + thisL.getFile())) {
l = thisL;
break;
}
}
if (l == null) {
throw new RuntimeException("Unable to match input split with any MSCR input");
}
// If this collection is a table -> process Pair, otherwise process value
PCollectionType type = l.getType();
Object toProcess = value;
if (type instanceof PTableType) {
toProcess = Pair.create(key, value);
}
for (DeferredOp op : l.getDownOps()) {
if (op instanceof MultipleParallelDo) {
MultipleParallelDo mPDo = ((MultipleParallelDo) op);
for (Object entry : mPDo.getDests().entrySet()) {
Map.Entry<PCollection, DoFn> en = (Map.Entry<PCollection, DoFn>) entry;
LazyCollection<?> lCol = (LazyCollection<?>) en.getKey();
DeferredOp childOp = null;
if (lCol.getDownOps() != null && lCol.getDownOps().size() > 0) {
childOp = lCol.getDownOps().get(0);
}
final Integer channel;
if (childOp != null && childOp instanceof Flatten) {
channel = mscr.getNumberedChannels().get(((Flatten) childOp).getDest());
} else if (childOp != null && childOp instanceof GroupByKey) {
channel = mscr.getNumberedChannels().get(((GroupByKey) childOp).getOrigin());
} else {
// bypass channel?
channel = mscr.getNumberedChannels().get(en.getKey());
}
if (channel == null) {
// This is not for this MSCR - just skip it
return;
}
// Call parallelDo function
en.getValue().process(toProcess, new EmitFn() {
@Override
public void emit(Object v) {
try {
if (v instanceof Pair) {
Pair p = (Pair) v;
context.write(new PlumeObject((WritableComparable) p.getKey(), channel), new PlumeObject((WritableComparable) p.getValue(), channel));
} else {
context.write(new PlumeObject((WritableComparable) v, channel), new PlumeObject((WritableComparable) v, channel));
}
} catch (Exception e) {
// TODO How to report this
e.printStackTrace();
}
}
});
}
} else {
if (op instanceof Flatten) {
l = (LazyCollection) ((Flatten) op).getDest();
}
int channel = mscr.getNumberedChannels().get(l);
if (toProcess instanceof Pair) {
context.write(new PlumeObject(key, channel), new PlumeObject(value, channel));
} else {
context.write(new PlumeObject(value, channel), new PlumeObject(value, channel));
}
}
}
}
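For reference, a minimal sketch of the kind of DoFn the loop above invokes through en.getValue().process(toProcess, ...). It assumes only the raw DoFn/EmitFn/Pair shapes visible in these snippets (process(Object, EmitFn), emit(Object), Pair.create(key, value)); the word-count logic and the Hadoop Text/IntWritable types are illustrative, not taken from the Plume sources.

// Hypothetical word-count style DoFn; every Pair it emits is wrapped by the mapper
// above into two PlumeObjects tagged with the resolved output channel number.
DoFn wordCountFn = new DoFn() {

  @Override
  public void process(Object v, EmitFn emitter) {
    for (String word : v.toString().split("\\s+")) {
      // Text and IntWritable are org.apache.hadoop.io WritableComparables, used here only for illustration.
      emitter.emit(Pair.create(new Text(word), new IntWritable(1)));
    }
  }
};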
Use of com.tdunning.plume.local.lazy.op.GroupByKey in project Plume by tdunning.
The class OptimizerTools, method getMSCRBlocks.
/**
 * This utility returns all the different MSCR blocks that can be created from this plan.
 *
 * (pere) As of Oct/2010, I think this code can be simplified to be more like addRemainingTrivialMSCRs(), so a possible TODO would be
 * to refactor it and make it more understandable. An open question is whether there is an easy way of finding all possible
 * MSCRs (including trivial ones, not related to GroupByKey operations) in a single, elegant loop.
 */
@SuppressWarnings({ "rawtypes", "unchecked" })
static Set<MSCR> getMSCRBlocks(List<PCollection> outputs) {
// Get all GroupByKeys from the tree
List<DeferredOp> groupBys = OptimizerTools.getAll(outputs, GroupByKey.class);
int mscrId = 1;
Set<MSCR> mscrs = new HashSet<MSCR>();
// For all found GroupByKey blocks
for (DeferredOp gBK : groupBys) {
GroupByKey groupBy = (GroupByKey<?, ?>) gBK;
// Gather all information needed for MSCR from this GBK
Set<PCollection<?>> inputs = new HashSet<PCollection<?>>();
Set<GroupByKey<?, ?>> outputChannels = new HashSet<GroupByKey<?, ?>>();
Set<Flatten<?>> unGroupedOutputChannels = new HashSet<Flatten<?>>();
Set<PCollection<?>> bypassChannels = new HashSet<PCollection<?>>();
Stack<LazyCollection<?>> toVisit = new Stack<LazyCollection<?>>();
Set<LazyCollection<?>> visited = new HashSet<LazyCollection<?>>();
LazyCollection<?> origin = (LazyCollection<?>) groupBy.getOrigin();
toVisit.push(origin);
outputChannels.add(groupBy);
while (!toVisit.isEmpty()) {
LazyCollection<?> current = toVisit.pop();
visited.add(current);
if (current.isMaterialized()) {
// condition for being a materialized input. This may change.
inputs.add(current);
continue;
}
DeferredOp op = current.getDeferredOp();
if (op instanceof MultipleParallelDo) {
// second condition for being an input
MultipleParallelDo<?> mPDo = (MultipleParallelDo) current.getDeferredOp();
if (((LazyCollection<?>) mPDo.getOrigin()).isMaterialized()) {
// will be done in Mapper
inputs.add(mPDo.getOrigin());
} else if (op instanceof ParallelDo) {
// will be done in Reducer
inputs.add(current);
} else {
// will be done in Mapper
inputs.add(mPDo.getOrigin());
}
// Check for bypass channels & output channels with no group-by
for (Map.Entry entry : mPDo.getDests().entrySet()) {
LazyCollection coll = (LazyCollection) entry.getKey();
if (coll.getDownOps() == null || coll.getDownOps().size() == 0) {
// leaf node
bypassChannels.add(coll);
} else if (coll.getDownOps().get(0) instanceof MultipleParallelDo) {
bypassChannels.add(coll);
/*
* Case of an output channel that Flattens with no Group By
*/
} else if (coll.getDownOps().get(0) instanceof Flatten) {
Flatten<?> thisFlatten = (Flatten<?>) coll.getDownOps().get(0);
LazyCollection ldest = (LazyCollection) thisFlatten.getDest();
if (ldest.getDownOps() == null || ldest.getDownOps().size() == 0 || ldest.getDownOps().get(0) instanceof MultipleParallelDo) {
unGroupedOutputChannels.add(thisFlatten);
// Add the rest of this flatten's origins to the stack in order to possibly discover more output channels
for (PCollection<?> col : thisFlatten.getOrigins()) {
if (!visited.contains(col)) {
toVisit.push((LazyCollection<?>) col);
}
}
}
}
}
continue;
}
if (op instanceof GroupByKey) {
// third condition for being an input - rare case when one GBK follows another
inputs.add(current);
continue;
}
if (op instanceof Flatten) {
Flatten<?> flatten = (Flatten<?>) op;
for (PCollection<?> input : flatten.getOrigins()) {
LazyCollection<?> in = (LazyCollection<?>) input;
if (!visited.contains(in)) {
toVisit.push(in);
}
}
continue;
}
if (op instanceof OneToOneOp) {
LazyCollection<?> input = (LazyCollection<?>) ((OneToOneOp<?, ?>) op).getOrigin();
if (!visited.contains(input)) {
toVisit.push(input);
}
continue;
}
}
MSCR mscrToAdd = null;
// Check if there is already one MSCR with at least one of this inputs
for (MSCR mscr : mscrs) {
for (PCollection<?> input : inputs) {
if (mscr.hasInput(input)) {
mscrToAdd = mscr;
break;
}
}
}
if (mscrToAdd == null) {
// otherwise create new MSCR
mscrToAdd = new MSCR(mscrId);
mscrId++;
}
// Add all missing input channels to current MSCR
for (PCollection<?> input : inputs) {
if (!mscrToAdd.hasInput(input)) {
mscrToAdd.addInput(input);
}
}
// Add all missing bypass outputs to current MSCR
for (PCollection<?> col : bypassChannels) {
if (!mscrToAdd.hasOutputChannel(col)) {
// Create new by-pass channel
MSCR.OutputChannel oC = new MSCR.OutputChannel(col);
mscrToAdd.addOutputChannel(oC);
}
}
// Add all missing flatten-with-no-groupby outputs to current MSCR
for (Flatten flatten : unGroupedOutputChannels) {
if (!mscrToAdd.hasOutputChannel(flatten.getDest())) {
// Create new channel with flatten and nothing else
MSCR.OutputChannel oC = new MSCR.OutputChannel(flatten.getDest());
oC.output = flatten.getDest();
oC.flatten = flatten;
mscrToAdd.addOutputChannel(oC);
}
}
// Add all missing output channels to current MSCR
for (GroupByKey groupByKey : outputChannels) {
if (!mscrToAdd.hasOutputChannel(groupByKey.getOrigin())) {
// Create new channel with group by key. It might have combiner and reducer as well.
MSCR.OutputChannel oC = new MSCR.OutputChannel(groupByKey);
oC.output = groupByKey.getDest();
if (groupByKey.getOrigin().getDeferredOp() instanceof Flatten) {
oC.flatten = (Flatten) groupByKey.getOrigin().getDeferredOp();
}
if (groupByKey.getDest().getDownOps() != null && groupByKey.getDest().getDownOps().size() == 1) {
DeferredOp op = (DeferredOp) groupByKey.getDest().getDownOps().get(0);
if (op instanceof CombineValues) {
oC.combiner = (CombineValues) op;
oC.output = oC.combiner.getDest();
LazyCollection dest = (LazyCollection) oC.combiner.getDest();
if (dest.getDownOps() != null && dest.getDownOps().size() == 1) {
op = (DeferredOp) dest.getDownOps().get(0);
}
}
if (op instanceof ParallelDo) {
oC.reducer = (ParallelDo) op;
oC.output = oC.reducer.getDest();
}
}
mscrToAdd.addOutputChannel(oC);
}
}
// Add if needed
mscrs.add(mscrToAdd);
}
return addRemainingTrivialMSCRs(outputs, mscrId, mscrs);
}
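A hedged sketch of how this utility fits in: getMSCRBlocks is package-private, so a caller in the same package (the optimizer) would hand it the plan's output collections and receive the MSCR blocks from which the MapReduce stages are built. Only getMSCRBlocks(...) and the MSCR type appear in the snippets above; how the outputs list is obtained is illustrative.

// Illustrative sketch only: 'outputs' would hold the sink PCollections of an already-fused plan.
List<PCollection> outputs = new ArrayList<PCollection>();
// ... populate outputs from the execution plan (not shown) ...

// One MSCR per connected group of GroupByKeys; trivial blocks are appended by addRemainingTrivialMSCRs.
Set<MSCR> mscrs = OptimizerTools.getMSCRBlocks(outputs);
System.out.println("Plan produced " + mscrs.size() + " MSCR block(s)");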
Use of com.tdunning.plume.local.lazy.op.GroupByKey in project Plume by tdunning.
The class LocalExecutor, method execute.
/**
 * Executes a one-output flow.
 *
 * @param <T> element type of the collection
 * @param output the lazily evaluated collection to materialize
 * @return an iterable over the materialized elements
 */
@SuppressWarnings({ "unchecked", "rawtypes" })
public <T> Iterable<T> execute(LazyCollection<T> output) {
if (output.isMaterialized()) {
// nothing else to execute
return output.getData();
} else {
DeferredOp op = output.getDeferredOp();
final List<T> result = Lists.newArrayList();
// Flatten op
if (op instanceof Flatten) {
Flatten<T> flatten = (Flatten<T>) op;
for (PCollection<T> col : flatten.getOrigins()) {
Iterable<T> res = execute((LazyCollection<T>) col);
result.addAll(Lists.newArrayList(res));
}
// done with it
return result;
}
Iterable parent;
EmitFn<T> emitter = new EmitFn<T>() {
@Override
public void emit(T v) {
result.add(v);
}
};
// ParallelDo
if (op instanceof ParallelDo) {
ParallelDo pDo = (ParallelDo) op;
parent = execute((LazyCollection) pDo.getOrigin());
for (Object obj : parent) {
pDo.getFunction().process(obj, emitter);
}
// MultipleParallelDo -> parallel operations that read the same collection
// In this version of executor, we will only compute the current collection, not its neighbors
} else if (op instanceof MultipleParallelDo) {
MultipleParallelDo mPDo = (MultipleParallelDo) op;
parent = execute((LazyCollection) mPDo.getOrigin());
// get the function that corresponds to this collection
DoFn function = (DoFn) mPDo.getDests().get(output);
for (Object obj : parent) {
function.process(obj, emitter);
}
// GroupByKey
} else if (op instanceof GroupByKey) {
GroupByKey gBK = (GroupByKey) op;
parent = execute(gBK.getOrigin());
Map<Object, List> groupMap = Maps.newHashMap();
// Perform in-memory group by operation
for (Object obj : parent) {
Pair p = (Pair) obj;
List list = groupMap.get(p.getKey());
if (list == null) {
list = new ArrayList();
}
list.add(p.getValue());
groupMap.put(p.getKey(), list);
}
for (Map.Entry<Object, List> entry : groupMap.entrySet()) {
result.add((T) new Pair(entry.getKey(), entry.getValue()));
}
}
return result;
}
}
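The GroupByKey branch above is essentially an in-memory multimap build. Below is a self-contained sketch of the same pattern using only java.util; the class name and data are illustrative and unrelated to Plume's API, with Plume's Pair replaced by a plain 2-column array.

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

public class InMemoryGroupBySketch {
  public static void main(String[] args) {
    // Input key/value pairs, as a 2-column array purely for the sketch.
    String[][] pairs = { { "a", "1" }, { "b", "2" }, { "a", "3" } };
    Map<String, List<String>> grouped = new HashMap<String, List<String>>();
    for (String[] p : pairs) {
      List<String> values = grouped.get(p[0]);
      if (values == null) {
        values = new ArrayList<String>();
        grouped.put(p[0], values);
      }
      values.add(p[1]);
    }
    // Prints {a=[1, 3], b=[2]} - one entry per key, mirroring the Pair(key, valueList)
    // objects that execute() adds to its result list for the GroupByKey case.
    System.out.println(grouped);
  }
}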
Use of com.tdunning.plume.local.lazy.op.GroupByKey in project Plume by tdunning.
The class Optimizer, method fuseParallelDos.
/**
 * Fuses producer-consumer ParallelDos, e.g. {Orig2 => p2 => Orig1 => p1 => Output} becomes {Orig2 => p1(p2) => Output}.
 *
 * @param arg the collection that may contain such compositions internally
 */
@SuppressWarnings({ "unchecked", "rawtypes" })
<T> void fuseParallelDos(PCollection<T> arg) {
LazyCollection<T> output = (LazyCollection<T>) arg;
if (output.isMaterialized()) {
// stop condition for recursive algorithm
return;
}
DeferredOp dOp = output.getDeferredOp();
if (!(dOp instanceof ParallelDo)) {
// not a ParallelDo
if (dOp instanceof OneToOneOp) {
// Recursively apply this function to parent
fuseParallelDos(((OneToOneOp) dOp).getOrigin());
return;
}
if (dOp instanceof Flatten) {
Flatten<T> flatten = (Flatten) dOp;
// Recursively apply this function to all parents
for (PCollection<T> col : flatten.getOrigins()) {
fuseParallelDos(col);
}
return;
}
}
ParallelDo p1 = (ParallelDo) output.getDeferredOp();
LazyCollection orig1 = (LazyCollection) p1.getOrigin();
if (orig1.isMaterialized()) {
return;
}
if (!(orig1.getDeferredOp() instanceof ParallelDo)) {
// Recursively apply this function to parent node
fuseParallelDos(orig1);
return;
}
// At this point we know ParallelDo fusion can be done -> Perform it
ParallelDo p2 = (ParallelDo) orig1.getDeferredOp();
// Lift combine values
if (p2 instanceof CombineValues) {
LazyCollection lCol = (LazyCollection) p2.getOrigin();
if (!lCol.isMaterialized() && lCol.getDeferredOp() instanceof GroupByKey) {
// Upper parallel do is CombineValues and follows a GroupByKey -> don't join
fuseParallelDos(orig1);
return;
}
}
final DoFn f1 = p1.getFunction();
final DoFn f2 = p2.getFunction();
// Define the joined function
DoFn newFn = new DoFn() {
@Override
public void process(Object v, final EmitFn emitter) {
f2.process(v, new EmitFn() {
@Override
public void emit(Object v) {
f1.process(v, emitter);
}
});
}
};
LazyCollection orig2 = (LazyCollection) p2.getOrigin();
ParallelDo newPDo = new ParallelDo(newFn, orig2, output);
// Clean & change pointers
orig2.downOps.remove(p2);
orig1.downOps.remove(p1);
orig2.addDownOp(newPDo);
output.deferredOp = newPDo;
// Recursively apply this function to the same node => TODO Beware infinite recursion, properly test
fuseParallelDos(output);
}