use of org.apache.crunch.Pair in project crunch by cloudera.
the class AvrosTest method testTableOf.
/**
 * Round-trips the pair ("a", "b") through the table type built by
 * {@code Avros.tableOf(Avros.strings(), Avros.strings())}: the input map
 * function is expected to turn the Avro pair into the equal Crunch pair, and
 * the output map function is expected to produce an Avro pair with the same
 * key and value.
 */
@Test
@SuppressWarnings("rawtypes")
public void testTableOf() throws Exception {
  AvroType tableType = Avros.tableOf(Avros.strings(), Avros.strings());

  // Avro-side representation of ("a", "b"), built against the table type's schema.
  org.apache.avro.mapred.Pair avroPair = new org.apache.avro.mapred.Pair(tableType.getSchema());
  avroPair.put(0, new Utf8("a"));
  avroPair.put(1, new Utf8("b"));

  // Crunch-side representation of the same data.
  Pair<String, String> crunchPair = Pair.of("a", "b");

  // TODO update this after resolving the o.a.a.m.Pair.equals issue
  initialize(tableType);

  // Avro -> Crunch direction.
  Object fromAvro = tableType.getInputMapFn().map(avroPair);
  assertEquals(crunchPair, fromAvro);

  // Crunch -> Avro direction; key and value are compared individually
  // rather than comparing the Avro pairs as a whole (see the TODO above).
  Object toAvro = tableType.getOutputMapFn().map(crunchPair);
  org.apache.avro.mapred.Pair converted = (org.apache.avro.mapred.Pair) toAvro;
  assertEquals(avroPair.key(), converted.key());
  assertEquals(avroPair.value(), converted.value());
}
use of org.apache.crunch.Pair in project crunch by cloudera.
the class Aggregate method top.
/**
 * Builds a three-stage pipeline over {@code ptable}:
 * <ol>
 * <li>a "top&lt;limit&gt;map" stage that applies {@code TopKFn} and produces an
 * Integer-keyed table whose values are (key, value) pairs of the input table;</li>
 * <li>a {@code groupByKey(1)} followed by a {@code TopKCombineFn} combine;</li>
 * <li>a "top&lt;limit&gt;reduce" stage that drops the Integer key and emits each
 * remaining pair using the input table's own table type.</li>
 * </ol>
 *
 * @param ptable the table to select entries from
 * @param limit forwarded to {@code TopKFn} and {@code TopKCombineFn}; presumably
 *        the number of entries to keep (confirm against those classes)
 * @param maximize forwarded to {@code TopKFn} and {@code TopKCombineFn};
 *        presumably selects largest versus smallest (confirm against those classes)
 * @return a table with the same key and value types as {@code ptable}
 */
public static <K, V> PTable<K, V> top(PTable<K, V> ptable, int limit, boolean maximize) {
  PTypeFamily family = ptable.getTypeFamily();
  PTableType<K, V> resultType = ptable.getPTableType();
  PType<Pair<K, V>> entryType = family.pairs(resultType.getKeyType(), resultType.getValueType());
  PTableType<Integer, Pair<K, V>> keyedEntryType = family.tableOf(family.ints(), entryType);

  // Stage 1: wrap each entry as a pair under an Integer key.
  PTable<Integer, Pair<K, V>> candidates =
      ptable.parallelDo("top" + limit + "map", new TopKFn<K, V>(limit, maximize), keyedEntryType);

  // Stage 2: group and combine the candidates.
  PTable<Integer, Pair<K, V>> selected =
      candidates.groupByKey(1).combineValues(new TopKCombineFn<K, V>(limit, maximize));

  // Stage 3: discard the Integer key and emit the original (key, value) pairs.
  return selected.parallelDo("top" + limit + "reduce", new DoFn<Pair<Integer, Pair<K, V>>, Pair<K, V>>() {
    @Override
    public void process(Pair<Integer, Pair<K, V>> keyedEntry, Emitter<Pair<K, V>> out) {
      out.emit(keyedEntry.second());
    }
  }, resultType);
}
use of org.apache.crunch.Pair in project crunch by cloudera.
the class Aggregate method max.
/**
 * Returns the greatest element of the input collection, where "greatest" is
 * decided by the elements' own {@link Comparable#compareTo} ordering.
 *
 * <p>The work is done in two steps: each instance of the first function keeps
 * the largest element it has processed and emits it (keyed by {@code true})
 * during cleanup; the emitted candidates are then grouped under that single
 * key and combined down to the largest one.
 *
 * @param collect the collection to search
 * @return the values of the combined table
 * @throws IllegalArgumentException if the element class is neither primitive
 *         nor assignable to {@link Comparable}
 */
public static <S> PCollection<S> max(PCollection<S> collect) {
  Class<S> elementClass = collect.getPType().getTypeClass();
  boolean orderable = elementClass.isPrimitive() || Comparable.class.isAssignableFrom(elementClass);
  if (!orderable) {
    throw new IllegalArgumentException("Can only get max for Comparable elements, not for: " + collect.getPType().getTypeClass());
  }
  PTypeFamily family = collect.getTypeFamily();

  // Step 1: track the largest element seen by this function instance and
  // emit it once, at cleanup time, under the constant key 'true'.
  DoFn<S, Pair<Boolean, S>> partialMaxFn = new DoFn<S, Pair<Boolean, S>>() {
    private transient S best = null;

    public void process(S input, Emitter<Pair<Boolean, S>> emitter) {
      boolean replace = best == null || ((Comparable<S>) best).compareTo(input) < 0;
      if (replace) {
        best = input;
      }
    }

    public void cleanup(Emitter<Pair<Boolean, S>> emitter) {
      if (best == null) {
        // Nothing was processed, so there is nothing to report.
        return;
      }
      emitter.emit(Pair.of(true, best));
    }
  };
  PTable<Boolean, S> partialMaxima =
      collect.parallelDo("max", partialMaxFn, family.tableOf(family.booleans(), collect.getPType()));

  // Step 2: reduce all candidates sharing a key down to the largest one.
  PTable<Boolean, S> overallMax = partialMaxima.groupByKey(1).combineValues(new CombineFn<Boolean, S>() {
    public void process(Pair<Boolean, Iterable<S>> input, Emitter<Pair<Boolean, S>> emitter) {
      S best = null;
      for (S candidate : input.second()) {
        boolean replace = best == null || ((Comparable<S>) best).compareTo(candidate) < 0;
        if (replace) {
          best = candidate;
        }
      }
      emitter.emit(Pair.of(input.first(), best));
    }
  });

  return PTables.values(overallMax);
}
use of org.apache.crunch.Pair in project crunch by cloudera.
the class Cartesian method cross.
/**
 * Performs a full cross join on the specified {@link PCollection}s (using the same strategy as Pig's CROSS operator).
 *
 * <p>Each side is first turned into a table keyed by a pair of integers via
 * {@code GFCross}; the two tables are cogrouped, and every combination of a
 * left value and a right value that share a key is emitted.
 *
 * @see <a href="http://en.wikipedia.org/wiki/Join_(SQL)#Cross_join">Cross Join</a>
 * @param left A PCollection to perform a cross join on.
 * @param right A PCollection to perform a cross join on.
 * @param parallelism Forwarded to the {@code GFCross} function of both sides;
 *        presumably controls how many key buckets the cross is spread over
 *        (confirm against {@code GFCross}).
 * @param <U> Type of the first {@link PCollection}'s values
 * @param <V> Type of the second {@link PCollection}'s values
 * @return The joined result as tuples of (U,V).
 * @throws IllegalStateException if the left type family returns no table type
 */
public static <U, V> PCollection<Pair<U, V>> cross(PCollection<U> left, PCollection<V> right, int parallelism) {
  PTypeFamily ltf = left.getTypeFamily();
  PTypeFamily rtf = right.getTypeFamily();

  // Build the left table type once and reuse it below, instead of building
  // one instance for the sanity check and a second one for the parallelDo.
  PTableType<Pair<Integer, Integer>, U> leftType = ltf.tableOf(ltf.pairs(ltf.ints(), ltf.ints()), left.getPType());
  if (leftType == null) {
    throw new IllegalStateException("Type family " + ltf + " returned a null table type for the left side of the cross join");
  }
  PTableType<Pair<Integer, Integer>, V> rightType = rtf.tableOf(rtf.pairs(rtf.ints(), rtf.ints()), right.getPType());

  // The first GFCross argument distinguishes the left (0) and right (1) inputs.
  PTable<Pair<Integer, Integer>, U> leftCross = left.parallelDo(new GFCross<U>(0, parallelism), leftType);
  PTable<Pair<Integer, Integer>, V> rightCross = right.parallelDo(new GFCross<V>(1, parallelism), rightType);

  PTable<Pair<Integer, Integer>, Pair<Collection<U>, Collection<V>>> cg = leftCross.cogroup(rightCross);
  PTypeFamily ctf = cg.getTypeFamily();

  return cg.parallelDo(new DoFn<Pair<Pair<Integer, Integer>, Pair<Collection<U>, Collection<V>>>, Pair<U, V>>() {
    @Override
    public void process(Pair<Pair<Integer, Integer>, Pair<Collection<U>, Collection<V>>> input, Emitter<Pair<U, V>> emitter) {
      // Emit every (left, right) combination found under the same key.
      for (U l : input.second().first()) {
        for (V r : input.second().second()) {
          emitter.emit(Pair.of(l, r));
        }
      }
    }
  }, ctf.pairs(left.getPType(), right.getPType()));
}
use of org.apache.crunch.Pair in project crunch by cloudera.
the class Join method join.
/**
 * Joins two tables that share a key type by running {@code joinFn} over the
 * grouped table produced by {@code preJoin(left, right)}.
 *
 * <p>The resulting stage is named by concatenating {@code joinFn.getJoinType()}
 * with the grouped table's name.
 *
 * @param left the left-hand table
 * @param right the right-hand table
 * @param joinFn the function applied to the grouped table
 * @return a table keyed like {@code left} whose values pair a left value type
 *         with a right value type
 */
public static <K, U, V> PTable<K, Pair<U, V>> join(PTable<K, U> left, PTable<K, V> right, JoinFn<K, U, V> joinFn) {
  PTypeFamily family = left.getTypeFamily();
  PGroupedTable<Pair<K, Integer>, Pair<U, V>> groupedInputs = preJoin(left, right);

  // Output type: the left table's key type with (left value, right value) pairs.
  PTableType<K, Pair<U, V>> resultType =
      family.tableOf(left.getKeyType(), family.pairs(left.getValueType(), right.getValueType()));

  String stageName = joinFn.getJoinType() + groupedInputs.getName();
  return groupedInputs.parallelDo(stageName, joinFn, resultType);
}
Aggregations