use of org.apache.crunch.Pair in project crunch by cloudera.
the class Aggregate method min.
/**
 * Finds the minimum of the input collection, ordering elements through their
 * {@link Comparable#compareTo(Object)} implementation.
 *
 * <p>The work is done in two steps. A first function keeps a running minimum of
 * everything it is handed and emits it once, from {@code cleanup}, under the
 * constant key {@code false}. The emitted pairs are then grouped by that key and
 * combined down to the smallest value per key; only the values are returned.
 *
 * @param collect the collection to search
 * @return the values that remain after the per-key combine step
 * @throws IllegalArgumentException if the element class is neither a primitive
 *         nor assignable to {@code Comparable}
 */
public static <S> PCollection<S> min(PCollection<S> collect) {
    Class<S> clazz = collect.getPType().getTypeClass();
    boolean orderable = clazz.isPrimitive() || Comparable.class.isAssignableFrom(clazz);
    if (!orderable) {
        throw new IllegalArgumentException("Can only get min for Comparable elements, not for: " + clazz);
    }
    PTypeFamily tf = collect.getTypeFamily();

    // First step: track the smallest value handed to this function instance.
    DoFn<S, Pair<Boolean, S>> localMinFn = new DoFn<S, Pair<Boolean, S>>() {
        private transient S smallest = null;

        public void process(S input, Emitter<Pair<Boolean, S>> emitter) {
            if (smallest == null) {
                // Nothing recorded yet, so the current element is the candidate.
                smallest = input;
            } else if (((Comparable<S>) smallest).compareTo(input) > 0) {
                smallest = input;
            }
        }

        public void cleanup(Emitter<Pair<Boolean, S>> emitter) {
            if (smallest == null) {
                return;
            }
            // Every candidate shares the same key so that they all end up in one group.
            emitter.emit(Pair.of(false, smallest));
        }
    };

    // Second step: reduce the candidates of a key to the smallest one.
    CombineFn<Boolean, S> globalMinFn = new CombineFn<Boolean, S>() {
        public void process(Pair<Boolean, Iterable<S>> input, Emitter<Pair<Boolean, S>> emitter) {
            S best = null;
            for (S candidate : input.second()) {
                if (best == null) {
                    best = candidate;
                } else if (((Comparable<S>) best).compareTo(candidate) > 0) {
                    best = candidate;
                }
            }
            emitter.emit(Pair.of(input.first(), best));
        }
    };

    return PTables.values(
        collect.parallelDo("min", localMinFn, tf.tableOf(tf.booleans(), collect.getPType()))
            .groupByKey()
            .combineValues(globalMinFn));
}
use of org.apache.crunch.Pair in project crunch by cloudera.
the class Cartesian method cross.
/**
 * Computes the full cross join of two {@link PTable}s, following the approach of
 * Pig's CROSS operator.
 *
 * <p>Both inputs are re-keyed with an integer pair produced by {@code GFCross},
 * the re-keyed tables are co-grouped, and every left entry of a group is paired
 * with every right entry of the same group.
 *
 * @see <a href="http://en.wikipedia.org/wiki/Join_(SQL)#Cross_join">Cross Join</a>
 * @param left A PTable to perform a cross join on.
 * @param right A PTable to perform a cross join on.
 * @param parallelism The square root of the number of reducers to use. Increasing parallelism also increases copied data.
 * @param <K1> Type of left PTable's keys.
 * @param <K2> Type of right PTable's keys.
 * @param <U> Type of the first {@link PTable}'s values
 * @param <V> Type of the second {@link PTable}'s values
 * @return The joined result as tuples of ((K1,K2), (U,V)).
 */
public static <K1, K2, U, V> PTable<Pair<K1, K2>, Pair<U, V>> cross(PTable<K1, U> left, PTable<K2, V> right, int parallelism) {
    /*
     * Equivalent PigLatin:
     *   A = foreach table1 generate flatten(GFCross(0, 2)), flatten(*);
     *   B = foreach table2 generate flatten(GFCross(1, 2)), flatten(*);
     *   C = cogroup A by ($0, $1), B by ($0, $1);
     *   result = foreach C generate flatten(A), flatten(B);
     */
    PTypeFamily leftFamily = left.getTypeFamily();
    PTypeFamily rightFamily = right.getTypeFamily();

    // Re-key each side; the first GFCross argument tells the two sides apart (0 = left, 1 = right).
    PTable<Pair<Integer, Integer>, Pair<K1, U>> taggedLeft = left.parallelDo(
        new GFCross<Pair<K1, U>>(0, parallelism),
        leftFamily.tableOf(
            leftFamily.pairs(leftFamily.ints(), leftFamily.ints()),
            leftFamily.pairs(left.getKeyType(), left.getValueType())));
    PTable<Pair<Integer, Integer>, Pair<K2, V>> taggedRight = right.parallelDo(
        new GFCross<Pair<K2, V>>(1, parallelism),
        rightFamily.tableOf(
            rightFamily.pairs(rightFamily.ints(), rightFamily.ints()),
            rightFamily.pairs(right.getKeyType(), right.getValueType())));

    PTable<Pair<Integer, Integer>, Pair<Collection<Pair<K1, U>>, Collection<Pair<K2, V>>>> grouped =
        taggedLeft.cogroup(taggedRight);
    PTypeFamily groupedFamily = grouped.getTypeFamily();

    // Expands one co-grouped entry into all left/right combinations it contains.
    DoFn<Pair<Pair<Integer, Integer>, Pair<Collection<Pair<K1, U>>, Collection<Pair<K2, V>>>>, Pair<Pair<K1, K2>, Pair<U, V>>> expandFn =
        new DoFn<Pair<Pair<Integer, Integer>, Pair<Collection<Pair<K1, U>>, Collection<Pair<K2, V>>>>, Pair<Pair<K1, K2>, Pair<U, V>>>() {
            @Override
            public void process(Pair<Pair<Integer, Integer>, Pair<Collection<Pair<K1, U>>, Collection<Pair<K2, V>>>> input, Emitter<Pair<Pair<K1, K2>, Pair<U, V>>> emitter) {
                Pair<Collection<Pair<K1, U>>, Collection<Pair<K2, V>>> sides = input.second();
                for (Pair<K1, U> leftEntry : sides.first()) {
                    for (Pair<K2, V> rightEntry : sides.second()) {
                        Pair<K1, K2> keys = Pair.of(leftEntry.first(), rightEntry.first());
                        Pair<U, V> values = Pair.of(leftEntry.second(), rightEntry.second());
                        emitter.emit(Pair.of(keys, values));
                    }
                }
            }
        };

    return grouped.parallelDo(
        expandFn,
        groupedFamily.tableOf(
            groupedFamily.pairs(left.getKeyType(), right.getKeyType()),
            groupedFamily.pairs(left.getValueType(), right.getValueType())));
}
use of org.apache.crunch.Pair in project crunch by cloudera.
the class Cogroup method cogroup.
/**
 * Co-groups the two {@link PTable} arguments.
 *
 * <p>Each input is first mapped to a table whose values are {@code Pair<U, V>}
 * (via {@code CogroupFn1} for the left side and {@code CogroupFn2} for the right
 * side), the two tables are unioned and grouped by key, and {@code PostGroupFn}
 * turns each group into a pair of collections. The type family and the key type
 * are taken from {@code left}.
 *
 * @param left the table supplying the {@code U} values
 * @param right the table supplying the {@code V} values
 * @return a {@code PTable} representing the co-grouped tables.
 */
public static <K, U, V> PTable<K, Pair<Collection<U>, Collection<V>>> cogroup(PTable<K, U> left, PTable<K, V> right) {
    PTypeFamily family = left.getTypeFamily();
    PType<K> keyType = left.getPTableType().getKeyType();
    PType<U> leftValueType = left.getPTableType().getValueType();
    PType<V> rightValueType = right.getPTableType().getValueType();

    // Value type shared by both tagged tables, and the value type of the final result.
    PType<Pair<U, V>> taggedType = family.pairs(leftValueType, rightValueType);
    PType<Pair<Collection<U>, Collection<V>>> groupedType =
        family.pairs(family.collections(leftValueType), family.collections(rightValueType));

    PTable<K, Pair<U, V>> taggedLeft =
        left.parallelDo("coGroupTag1", new CogroupFn1<K, U, V>(), family.tableOf(keyType, taggedType));
    PTable<K, Pair<U, V>> taggedRight =
        right.parallelDo("coGroupTag2", new CogroupFn2<K, U, V>(), family.tableOf(keyType, taggedType));

    return taggedLeft
        .union(taggedRight)
        .groupByKey()
        .parallelDo("cogroup", new PostGroupFn<K, U, V>(), family.tableOf(keyType, groupedType));
}
use of org.apache.crunch.Pair in project crunch by cloudera.
the class AverageBytesByIP method run.
/**
 * Builds and executes the pipeline.
 *
 * <p>Expects exactly two arguments: the text file to read and the location to
 * write the result to. With any other argument count a usage message is printed
 * to standard error and nothing is executed.
 *
 * @param args the input path followed by the output path
 * @return {@code 1} if the argument count is wrong, {@code 0} otherwise
 * @throws Exception propagated from the pipeline calls
 */
public int run(String[] args) throws Exception {
    boolean argsOk = args.length == 2;
    if (!argsOk) {
        System.err.println();
        System.err.println("Two and only two arguments are accepted.");
        String className = this.getClass().getName();
        System.err.println("Usage: " + className + " [generic options] input output");
        System.err.println();
        GenericOptionsParser.printGenericCommandUsage(System.err);
        return 1;
    }
    String inputPath = args[0];
    String outputPath = args[1];

    // The pipeline object coordinates both construction and execution.
    Pipeline pipeline = new MRPipeline(AverageBytesByIP.class, getConf());

    // Every line of the input text file becomes one String element.
    PCollection<String> lines = pipeline.readTextFile(inputPath);

    // Sums both components of a (long, long) pair independently.
    CombineFn<String, Pair<Long, Long>> sumBothLongs =
        CombineFn.pairAggregator(CombineFn.SUM_LONGS, CombineFn.SUM_LONGS);

    // Per-line (key, (long, long)) entries, then the per-key sums of both components.
    PTable<String, Pair<Long, Long>> perLine = lines.parallelDo(
        extractResponseSize,
        Writables.tableOf(Writables.strings(), Writables.pairs(Writables.longs(), Writables.longs())));
    PTable<String, Pair<Long, Long>> totalsByKey = perLine.groupByKey().combineValues(sumBothLongs);

    // Turn each pair of sums into a single double per key.
    PTable<String, Double> avgs = totalsByKey.parallelDo(
        calulateAverage,
        Writables.tableOf(Writables.strings(), Writables.doubles()));

    // Register the text output, then execute the pipeline as a MapReduce.
    pipeline.writeTextFile(avgs, outputPath);
    pipeline.done();
    return 0;
}
use of org.apache.crunch.Pair in project crunch by cloudera.
the class MultiAvroSchemaJoinTest method testJoin.
/**
 * Joins the Person and Employee Avro inputs on the key produced by {@code NameFn}
 * and checks that exactly one pair comes out, with the name "Kate" on both sides.
 */
@Test
public void testJoin() throws Exception {
    Pipeline pipeline = new MRPipeline(MultiAvroSchemaJoinTest.class);

    PCollection<Person> people =
        pipeline.read(From.avroFile(personFile.getAbsolutePath(), records(Person.class)));
    PCollection<Employee> employees =
        pipeline.read(From.avroFile(employeeFile.getAbsolutePath(), records(Employee.class)));

    // Key both collections with NameFn, join them, and keep only the joined values.
    Iterable<Pair<Person, Employee>> joined = people
        .by(new NameFn<Person>(), strings())
        .join(employees.by(new NameFn<Employee>(), strings()))
        .values()
        .materialize();

    List<Pair<Person, Employee>> matches = Lists.newArrayList(joined);
    assertEquals(1, matches.size());

    Pair<Person, Employee> onlyMatch = matches.get(0);
    assertEquals("Kate", onlyMatch.first().getName().toString());
    assertEquals("Kate", onlyMatch.second().getName().toString());
}
Aggregations