Use of org.apache.crunch.PCollection in project crunch by cloudera.
Class Cartesian, method cross.
/**
 * Performs a full cross join on the specified {@link PCollection}s (using the same strategy as Pig's CROSS operator).
 * <p>
 * Each side is first keyed by a pair of integers via {@code GFCross}, the two keyed tables are
 * cogrouped, and every (left, right) combination found under a shared key is emitted.
 *
 * @see <a href="http://en.wikipedia.org/wiki/Join_(SQL)#Cross_join">Cross Join</a>
 * @param left A PCollection to perform a cross join on.
 * @param right A PCollection to perform a cross join on.
 * @param parallelism Value handed unchanged to the {@code GFCross} function applied to each side.
 * @param <U> Type of the first {@link PCollection}'s values
 * @param <V> Type of the second {@link PCollection}'s values
 * @return The joined result as tuples of (U,V).
 * @throws IllegalStateException if the left collection's type family returns no table type
 */
public static <U, V> PCollection<Pair<U, V>> cross(PCollection<U> left, PCollection<V> right, int parallelism) {
  PTypeFamily ltf = left.getTypeFamily();
  PTypeFamily rtf = right.getTypeFamily();

  // Build the left table type once and reuse it below, instead of constructing it a second time.
  PTableType<Pair<Integer, Integer>, U> leftTableType =
      ltf.tableOf(ltf.pairs(ltf.ints(), ltf.ints()), left.getPType());
  if (leftTableType == null) {
    // Fail fast with a descriptive, catchable exception rather than a bare java.lang.Error.
    throw new IllegalStateException("Type family of the left collection returned a null table type");
  }

  PTable<Pair<Integer, Integer>, U> leftCross =
      left.parallelDo(new GFCross<U>(0, parallelism), leftTableType);
  PTable<Pair<Integer, Integer>, V> rightCross =
      right.parallelDo(new GFCross<V>(1, parallelism),
          rtf.tableOf(rtf.pairs(rtf.ints(), rtf.ints()), right.getPType()));

  PTable<Pair<Integer, Integer>, Pair<Collection<U>, Collection<V>>> cg = leftCross.cogroup(rightCross);
  PTypeFamily ctf = cg.getTypeFamily();

  return cg.parallelDo(
      new DoFn<Pair<Pair<Integer, Integer>, Pair<Collection<U>, Collection<V>>>, Pair<U, V>>() {
        @Override
        public void process(Pair<Pair<Integer, Integer>, Pair<Collection<U>, Collection<V>>> input,
            Emitter<Pair<U, V>> emitter) {
          // Emit every left/right combination that landed under the same key.
          for (U l : input.second().first()) {
            for (V r : input.second().second()) {
              emitter.emit(Pair.of(l, r));
            }
          }
        }
      }, ctf.pairs(left.getPType(), right.getPType()));
}
Use of org.apache.crunch.PCollection in project crunch by cloudera.
Class Set, method comm.
/**
 * Compares two collections in the manner of the Unix <code>comm</code> utility.
 * One {@link Tuple3} is emitted per cogrouped key, and the slot holding the
 * element tells which collections it belongs to:
 * <ol>
 * <li>first slot: the element is only in <code>coll1</code>,</li>
 * <li>second slot: the element is only in <code>coll2</code>,</li>
 * <li>third slot: the element is in both collections.</li>
 * </ol>
 * The remaining slots of each tuple are <code>null</code>.
 *
 * @param coll1 the first collection; its type family and {@link PType} define the result type
 * @param coll2 the second collection
 * @param <T> the element type of both collections
 * @return a collection of {@link Tuple3} objects
 */
public static <T> PCollection<Tuple3<T, T, T>> comm(PCollection<T> coll1, PCollection<T> coll2) {
  PTypeFamily family = coll1.getTypeFamily();
  PType<T> elementType = coll1.getPType();

  PCollection<Pair<T, Pair<Collection<Boolean>, Collection<Boolean>>>> grouped =
      Cogroup.cogroup(toTable(coll1), toTable(coll2));

  DoFn<Pair<T, Pair<Collection<Boolean>, Collection<Boolean>>>, Tuple3<T, T, T>> classify =
      new DoFn<Pair<T, Pair<Collection<Boolean>, Collection<Boolean>>>, Tuple3<T, T, T>>() {
        @Override
        public void process(Pair<T, Pair<Collection<Boolean>, Collection<Boolean>>> input,
            Emitter<Tuple3<T, T, T>> emitter) {
          Pair<Collection<Boolean>, Collection<Boolean>> membership = input.second();
          boolean presentInFirst = !membership.first().isEmpty();
          boolean presentInSecond = !membership.second().isEmpty();
          T element = input.first();

          // At most one slot receives the element; the others stay null.
          T onlyFirst = null;
          T onlySecond = null;
          T both = null;
          if (presentInFirst && presentInSecond) {
            both = element;
          } else if (presentInFirst) {
            onlyFirst = element;
          } else if (presentInSecond) {
            onlySecond = element;
          }
          emitter.emit(Tuple3.of(onlyFirst, onlySecond, both));
        }
      };

  return grouped.parallelDo(classify, family.triples(elementType, elementType, elementType));
}
Aggregations