use of com.tdunning.plume.local.lazy.op.Flatten in project Plume by tdunning.
the class Optimizer method sinkFlattens.
/**
 * Sinks flattens, pushing them down to create opportunities for ParallelDo fusion.
 * <p>
 * Starting at {@code arg}, the graph is walked upwards through the deferred operations.
 * Whenever a collection produced by a {@link Flatten} is consumed by exactly one
 * {@link ParallelDo}, the ParallelDo is replicated on every input of the Flatten and a new
 * Flatten of the replicated results is installed as the producer of the ParallelDo's
 * destination, i.e. {@code Flatten(a, b) => f} becomes {@code Flatten(f(a), f(b))}.
 *
 * @param arg The collection that may contain flatten operations that we need to sink.
 */
@SuppressWarnings({ "unchecked", "rawtypes" })
<T> void sinkFlattens(PCollection<T> arg) {
  LazyCollection<T> output = (LazyCollection<T>) arg;
  if (output.isMaterialized()) {
    // stop condition for recursive algorithm
    return;
  }
  DeferredOp dOp = output.getDeferredOp();
  if (!(dOp instanceof Flatten)) {
    if (dOp instanceof OneToOneOp) {
      // Recursively apply this function to parent
      sinkFlattens(((OneToOneOp) dOp).getOrigin());
    } else if (dOp instanceof ParallelDo) {
      // Recursively apply this function to parent
      sinkFlattens(((ParallelDo) dOp).getOrigin());
    }
    // Any other kind of operation is not a Flatten either: there is nothing to sink here, and
    // falling through would only fail on the Flatten casts below.
    return;
  }
  // Flatten above current node
  Flatten<T> flatten = (Flatten<T>) dOp;
  if (output.getDownOps() == null || output.getDownOps().size() != 1) {
    // Not exactly one consumer: leave this flatten alone but keep looking in its inputs
    for (PCollection<T> col : flatten.getOrigins()) {
      sinkFlattens(col);
    }
    return;
  }
  DeferredOp downOp = output.getDownOps().get(0);
  if (!(downOp instanceof ParallelDo)) {
    return;
  }
  // PDo below current node
  ParallelDo<T, ?> op = (ParallelDo<T, ?>) downOp;
  List<PCollection<?>> newOrigins = new ArrayList<PCollection<?>>();
  // Iterate over all Flatten inputs
  for (PCollection<T> col : flatten.getOrigins()) {
    // Recursively apply this function to this flatten's origin
    LazyCollection<T> fInput = (LazyCollection<T>) col;
    sinkFlattens(fInput);
    // Sink: replicate the ParallelDo directly on this input
    LazyCollection<?> newInput = new LazyCollection();
    newInput.deferredOp = new ParallelDo(op.getFunction(), fInput, newInput);
    // The new collection holds the ParallelDo's output and feeds the Flatten that produces
    // op.getDest(), so it carries that collection's type (not the type of the PDo's input).
    newInput.type = ((LazyCollection) op.getDest()).getType();
    // Unlink the old flatten by reference; the input may have other consumers and the flatten
    // is not guaranteed to be the first one.
    fInput.downOps.remove(flatten);
    fInput.addDownOp(newInput.deferredOp);
    newOrigins.add(newInput);
  }
  // The flatten of the replicated results now produces what the original ParallelDo produced
  Flatten<?> newFlatten = new Flatten(newOrigins, op.getDest());
  ((LazyCollection<?>) op.getDest()).deferredOp = newFlatten;
  for (PCollection<?> newOp : newOrigins) {
    ((LazyCollection<?>) newOp).addDownOp(newFlatten);
  }
}
use of com.tdunning.plume.local.lazy.op.Flatten in project Plume by tdunning.
the class Optimizer method removeUnnecessaryOps.
/**
 * Prunes operations that cannot reach any of the requested outputs and that were not already
 * removed by the Optimizer. Works top-down: it receives an input collection and descends
 * through its down-operations, replacing each visited collection's down-operation list with
 * the subset that is still useful.
 *
 * @param arg     the collection to start from
 * @param outputs the collections that must remain reachable
 * @return true if the passed node doesn't lead to an output
 */
boolean removeUnnecessaryOps(PCollection arg, List<PCollection> outputs) {
  LazyCollection<?> node = (LazyCollection) arg;
  List<DeferredOp> consumers = node.getDownOps();
  if (consumers == null || consumers.isEmpty()) {
    // Leaf node: it is only worth keeping when it is one of the requested outputs
    return !outputs.contains(node);
  }
  // Collect the down-operations that are still useful
  List<DeferredOp> kept = new ArrayList<DeferredOp>();
  for (DeferredOp consumer : consumers) {
    boolean prunable;
    if (consumer instanceof OneToOneOp) {
      prunable = removeUnnecessaryOps(((OneToOneOp<?, ?>) consumer).getDest(), outputs);
    } else if (consumer instanceof ParallelDo) {
      prunable = removeUnnecessaryOps(((ParallelDo<?, ?>) consumer).getDest(), outputs);
    } else if (consumer instanceof Flatten) {
      prunable = removeUnnecessaryOps(((Flatten<?>) consumer).getDest(), outputs);
    } else if (consumer instanceof MultipleParallelDo) {
      // Prunable only when every destination is a dead end. Each destination is visited
      // unconditionally so that all branches get cleaned up, hence no short-circuiting.
      prunable = true;
      for (Object dest : ((MultipleParallelDo<?>) consumer).getDests().keySet()) {
        boolean deadEnd = removeUnnecessaryOps((PCollection<?>) dest, outputs);
        prunable = prunable && deadEnd;
      }
    } else {
      // Unknown kind of operation: always kept
      prunable = false;
    }
    if (!prunable) {
      kept.add(consumer);
    }
  }
  node.downOps = kept;
  // With no useful consumer left, this node can also be removed
  return kept.isEmpty();
}
use of com.tdunning.plume.local.lazy.op.Flatten in project Plume by tdunning.
the class LocalExecutor method execute.
/**
 * Executes a one-output flow by recursively evaluating the deferred operation that produces
 * the given collection.
 * <p>
 * Flatten, ParallelDo, MultipleParallelDo and GroupByKey are evaluated here; for any other
 * deferred operation the returned result is empty.
 *
 * @param <T>    element type of the collection
 * @param output the collection whose contents are requested
 * @return the stored data when the collection is materialized, otherwise the computed elements
 */
@SuppressWarnings({ "unchecked", "rawtypes" })
public <T> Iterable<T> execute(LazyCollection<T> output) {
  if (output.isMaterialized()) {
    // nothing else to execute
    return output.getData();
  }
  DeferredOp deferred = output.getDeferredOp();
  final List<T> collected = Lists.newArrayList();

  // Flatten: concatenate the results of every input, in order
  if (deferred instanceof Flatten) {
    for (PCollection<T> branch : ((Flatten<T>) deferred).getOrigins()) {
      Iterable<T> partial = execute((LazyCollection<T>) branch);
      collected.addAll(Lists.newArrayList(partial));
    }
    return collected;
  }

  // Everything emitted by a DoFn ends up in the result list
  EmitFn<T> sink = new EmitFn<T>() {
    @Override
    public void emit(T v) {
      collected.add(v);
    }
  };

  // ParallelDo: run the function over every element of the parent
  if (deferred instanceof ParallelDo) {
    ParallelDo pDo = (ParallelDo) deferred;
    Iterable upstream = execute((LazyCollection) pDo.getOrigin());
    for (Object element : upstream) {
      pDo.getFunction().process(element, sink);
    }
    return collected;
  }

  // MultipleParallelDo -> parallel operations that read the same collection.
  // In this version of executor, only the current collection is computed, not its neighbors.
  if (deferred instanceof MultipleParallelDo) {
    MultipleParallelDo mPDo = (MultipleParallelDo) deferred;
    Iterable upstream = execute((LazyCollection) mPDo.getOrigin());
    // pick the function registered for this particular destination
    DoFn function = (DoFn) mPDo.getDests().get(output);
    for (Object element : upstream) {
      function.process(element, sink);
    }
    return collected;
  }

  // GroupByKey: in-memory grouping of the parent's pairs by key
  if (deferred instanceof GroupByKey) {
    GroupByKey gBK = (GroupByKey) deferred;
    Iterable upstream = execute(gBK.getOrigin());
    Map<Object, List> buckets = Maps.newHashMap();
    for (Object element : upstream) {
      Pair pair = (Pair) element;
      Object key = pair.getKey();
      List bucket = buckets.get(key);
      if (bucket == null) {
        bucket = new ArrayList();
        buckets.put(key, bucket);
      }
      bucket.add(pair.getValue());
    }
    for (Map.Entry<Object, List> group : buckets.entrySet()) {
      collected.add((T) new Pair(group.getKey(), group.getValue()));
    }
  }
  return collected;
}
use of com.tdunning.plume.local.lazy.op.Flatten in project Plume by tdunning.
the class OptimizerTools method getAll.
/**
 * Collects, without duplicates, every deferred operation of exactly the given class that is
 * found when walking upwards (towards the inputs) from {@code output}.
 * <p>
 * The walk follows the origins of Flatten, OneToOneOp and MultipleParallelDo operations and
 * stops at materialized collections.
 *
 * @param output   the collection to start the walk from
 * @param getClass the exact class of operation to look for (subclasses do not match)
 * @return the matching operations, in the order they were first encountered
 */
static List<DeferredOp> getAll(PCollection<?> output, Class<? extends DeferredOp> getClass) {
  List<DeferredOp> matches = new ArrayList<DeferredOp>();
  Set<LazyCollection<?>> seen = new HashSet<LazyCollection<?>>();
  Stack<LazyCollection<?>> pending = new Stack<LazyCollection<?>>();
  pending.push((LazyCollection<?>) output);

  while (!pending.isEmpty()) {
    LazyCollection<?> node = pending.pop();
    seen.add(node);
    if (node.isMaterialized()) {
      // Materialized collections have no producer to inspect
      continue;
    }
    DeferredOp producer = node.getDeferredOp();
    if (producer.getClass().equals(getClass) && !matches.contains(producer)) {
      // Found
      matches.add(producer);
    }
    // Schedule the producer's inputs that have not been visited yet
    if (producer instanceof Flatten) {
      for (PCollection<?> origin : ((Flatten<?>) producer).getOrigins()) {
        LazyCollection<?> parent = (LazyCollection<?>) origin;
        if (!seen.contains(parent)) {
          pending.push(parent);
        }
      }
    } else if (producer instanceof OneToOneOp) {
      LazyCollection<?> parent = (LazyCollection<?>) ((OneToOneOp<?, ?>) producer).getOrigin();
      if (!seen.contains(parent)) {
        pending.push(parent);
      }
    } else if (producer instanceof MultipleParallelDo) {
      LazyCollection<?> parent = (LazyCollection<?>) ((MultipleParallelDo<?>) producer).getOrigin();
      if (!seen.contains(parent)) {
        pending.push(parent);
      }
    }
  }
  return matches;
}
use of com.tdunning.plume.local.lazy.op.Flatten in project Plume by tdunning.
the class Optimizer method fuseParallelDos.
/**
 * Fuse producer-consumer ParallelDos as in : {Orig2 => p2 => Orig1 => p1 => Output} to
 * {Orig2 => p1(p2) => Output}.
 * <p>
 * The graph is walked upwards from {@code arg}. A CombineValues that directly follows a
 * GroupByKey is never fused into its consumer. Collections produced by an operation that is
 * neither a ParallelDo, a OneToOneOp nor a Flatten are left untouched.
 *
 * @param arg The collection that may have compositions internally.
 */
@SuppressWarnings({ "unchecked", "rawtypes" })
<T> void fuseParallelDos(PCollection<T> arg) {
  LazyCollection<T> output = (LazyCollection<T>) arg;
  if (output.isMaterialized()) {
    // stop condition for recursive algorithm
    return;
  }
  DeferredOp dOp = output.getDeferredOp();
  if (!(dOp instanceof ParallelDo)) {
    // not a ParallelDo
    if (dOp instanceof OneToOneOp) {
      // Recursively apply this function to parent
      fuseParallelDos(((OneToOneOp) dOp).getOrigin());
    } else if (dOp instanceof Flatten) {
      Flatten<T> flatten = (Flatten) dOp;
      // Recursively apply this function to all parents
      for (PCollection<T> col : flatten.getOrigins()) {
        fuseParallelDos(col);
      }
    }
    // Whatever the operation was, it is not a ParallelDo, so there is nothing to fuse at this
    // node; falling through would only fail on the ParallelDo cast below.
    return;
  }
  ParallelDo p1 = (ParallelDo) dOp;
  LazyCollection orig1 = (LazyCollection) p1.getOrigin();
  if (orig1.isMaterialized()) {
    return;
  }
  if (!(orig1.getDeferredOp() instanceof ParallelDo)) {
    // Recursively apply this function to parent node
    fuseParallelDos(orig1);
    return;
  }
  // At this point we know ParallelDo fusion can be done -> Perform it
  ParallelDo p2 = (ParallelDo) orig1.getDeferredOp();
  // Lift combine values
  if (p2 instanceof CombineValues) {
    LazyCollection lCol = (LazyCollection) p2.getOrigin();
    if (!lCol.isMaterialized() && lCol.getDeferredOp() instanceof GroupByKey) {
      // Upper parallel do is CombineValues and follows a GroupByKey -> don't join
      fuseParallelDos(orig1);
      return;
    }
  }
  final DoFn f1 = p1.getFunction();
  final DoFn f2 = p2.getFunction();
  // Define the joined function: everything f2 emits is fed straight into f1
  DoFn newFn = new DoFn() {
    @Override
    public void process(Object v, final EmitFn emitter) {
      f2.process(v, new EmitFn() {
        @Override
        public void emit(Object v) {
          f1.process(v, emitter);
        }
      });
    }
  };
  LazyCollection orig2 = (LazyCollection) p2.getOrigin();
  ParallelDo newPDo = new ParallelDo(newFn, orig2, output);
  // Clean & change pointers
  orig2.downOps.remove(p2);
  orig1.downOps.remove(p1);
  orig2.addDownOp(newPDo);
  output.deferredOp = newPDo;
  // Recursively apply this function to the same node => TODO Beware infinite recursion, properly test
  fuseParallelDos(output);
}
Aggregations