Use of org.apache.crunch.types.PTypeFamily in the project crunch by Cloudera.
Class Set, method comm.
/**
 * Classifies elements by which of two collections they belong to, in the
 * spirit of the Unix <code>comm</code> utility. Both inputs are converted to
 * tables and cogrouped; one {@link Tuple3} is emitted per cogrouped record,
 * with the element placed in exactly one slot:
 * <ol>
 * <li>first slot: the element has entries only on the <code>coll1</code> side,</li>
 * <li>second slot: the element has entries only on the <code>coll2</code> side,</li>
 * <li>third slot: the element has entries on both sides.</li>
 * </ol>
 * The remaining slots of each tuple are <code>null</code>.
 *
 * @param coll1 the first collection; its type family and {@link PType} are
 *        used to build the result type
 * @param coll2 the second collection
 * @return a collection of {@link Tuple3} objects
 */
public static <T> PCollection<Tuple3<T, T, T>> comm(PCollection<T> coll1, PCollection<T> coll2) {
  PTypeFamily family = coll1.getTypeFamily();
  PType<T> elementType = coll1.getPType();

  DoFn<Pair<T, Pair<Collection<Boolean>, Collection<Boolean>>>, Tuple3<T, T, T>> classifyFn =
      new DoFn<Pair<T, Pair<Collection<Boolean>, Collection<Boolean>>>, Tuple3<T, T, T>>() {
        @Override
        public void process(Pair<T, Pair<Collection<Boolean>, Collection<Boolean>>> input,
            Emitter<Tuple3<T, T, T>> emitter) {
          Pair<Collection<Boolean>, Collection<Boolean>> membership = input.second();
          boolean presentInFirst = !membership.first().isEmpty();
          boolean presentInSecond = !membership.second().isEmpty();
          T element = input.first();

          // At most one of the three slots receives the element; the others stay null.
          T onlyFirst = null;
          T onlySecond = null;
          T inBoth = null;
          if (presentInFirst && presentInSecond) {
            inBoth = element;
          } else if (presentInFirst) {
            onlyFirst = element;
          } else if (presentInSecond) {
            onlySecond = element;
          }
          emitter.emit(Tuple3.of(onlyFirst, onlySecond, inBoth));
        }
      };

  return Cogroup.cogroup(toTable(coll1), toTable(coll2))
      .parallelDo(classifyFn, family.triples(elementType, elementType, elementType));
}
Use of org.apache.crunch.types.PTypeFamily in the project crunch by Cloudera.
Class Sort, method sortTuples.
/**
 * Sorts a {@link PCollection} of {@link TupleN}s according to the given
 * column ordering.
 * <p>
 * Each tuple is wrapped as the key of a table whose values are all
 * <code>null</code>, the table is grouped by key using grouping options built
 * from <code>columnOrders</code>, and the keys are then unwrapped again.
 *
 * @param collection the tuples to sort
 * @param columnOrders the column ordering handed to the grouping options
 * @return a {@link PCollection} representing the sorted collection.
 */
public static PCollection<TupleN> sortTuples(PCollection<TupleN> collection, ColumnOrder... columnOrders) {
  PTypeFamily family = collection.getTypeFamily();
  PType<TupleN> tupleType = collection.getPType();
  PTableType<TupleN, Void> keyedType =
      family.tableOf(family.tuples(tupleType.getSubTypes().toArray(new PType[0])), family.nulls());

  // Promote every tuple to a table key so that grouping can order it.
  DoFn<TupleN, Pair<TupleN, Void>> wrapAsKey = new DoFn<TupleN, Pair<TupleN, Void>>() {
    @Override
    public void process(TupleN input, Emitter<Pair<TupleN, Void>> emitter) {
      emitter.emit(Pair.of(input, (Void) null));
    }
  };
  PTable<TupleN, Void> keyed = collection.parallelDo(wrapAsKey, keyedType);

  GroupingOptions options = buildGroupingOptions(
      collection.getPipeline().getConfiguration(), family, tupleType, columnOrders);

  // Discard the placeholder value and hand back the bare tuples.
  DoFn<Pair<TupleN, Void>, TupleN> unwrapKey = new DoFn<Pair<TupleN, Void>, TupleN>() {
    @Override
    public void process(Pair<TupleN, Void> input, Emitter<TupleN> emitter) {
      emitter.emit(input.first());
    }
  };
  return keyed.groupByKey(options).ungroup().parallelDo(unwrapKey, collection.getPType());
}
Use of org.apache.crunch.types.PTypeFamily in the project crunch by Cloudera.
Class Sort, method sort.
/**
 * Sorts a {@link PCollection} by the natural ordering of its elements, in
 * the requested direction.
 * <p>
 * Elements are wrapped as keys of a table with <code>null</code> values (the
 * "sort-pre" stage), grouped by key using grouping options built from
 * <code>order</code>, and unwrapped again (the "sort-post" stage).
 *
 * @param collection the elements to sort
 * @param order the ordering handed to the grouping options
 * @return a {@link PCollection} representing the sorted collection.
 */
public static <T> PCollection<T> sort(PCollection<T> collection, Order order) {
  PTypeFamily family = collection.getTypeFamily();
  PTableType<T, Void> keyedType = family.tableOf(collection.getPType(), family.nulls());
  GroupingOptions options = buildGroupingOptions(
      collection.getPipeline().getConfiguration(), family, collection.getPType(), order);

  // Promote every element to a table key so that grouping can order it.
  DoFn<T, Pair<T, Void>> wrapAsKey = new DoFn<T, Pair<T, Void>>() {
    @Override
    public void process(T input, Emitter<Pair<T, Void>> emitter) {
      emitter.emit(Pair.of(input, (Void) null));
    }
  };
  PTable<T, Void> keyed = collection.parallelDo("sort-pre", wrapAsKey, keyedType);

  // Discard the placeholder value and hand back the bare elements.
  DoFn<Pair<T, Void>, T> unwrapKey = new DoFn<Pair<T, Void>, T>() {
    @Override
    public void process(Pair<T, Void> input, Emitter<T> emitter) {
      emitter.emit(input.first());
    }
  };
  return keyed.groupByKey(options).ungroup()
      .parallelDo("sort-post", unwrapKey, collection.getPType());
}
Use of org.apache.crunch.types.PTypeFamily in the project crunch by Cloudera.
Class Sort, method sortPairs.
/**
 * Sorts a {@link PCollection} of {@link Pair}s according to the given column
 * ordering.
 * <p>
 * Each pair is wrapped as the key of a table whose values are all
 * <code>null</code>, the table is grouped by key using grouping options built
 * from <code>columnOrders</code>, and the keys are then unwrapped again.
 *
 * @param collection the pairs to sort
 * @param columnOrders the column ordering handed to the grouping options
 * @return a {@link PCollection} representing the sorted collection.
 */
public static <U, V> PCollection<Pair<U, V>> sortPairs(PCollection<Pair<U, V>> collection, ColumnOrder... columnOrders) {
  PTypeFamily family = collection.getTypeFamily();
  PType<Pair<U, V>> pairType = collection.getPType();
  // The whole (U, V) pair becomes the table key so grouping and sorting see both columns.
  @SuppressWarnings("unchecked")
  PTableType<Pair<U, V>, Void> keyedType = family.tableOf(
      family.pairs(pairType.getSubTypes().get(0), pairType.getSubTypes().get(1)), family.nulls());

  DoFn<Pair<U, V>, Pair<Pair<U, V>, Void>> wrapAsKey = new DoFn<Pair<U, V>, Pair<Pair<U, V>, Void>>() {
    @Override
    public void process(Pair<U, V> input, Emitter<Pair<Pair<U, V>, Void>> emitter) {
      emitter.emit(Pair.of(input, (Void) null));
    }
  };
  PTable<Pair<U, V>, Void> keyed = collection.parallelDo(wrapAsKey, keyedType);

  GroupingOptions options = buildGroupingOptions(
      collection.getPipeline().getConfiguration(), family, pairType, columnOrders);

  // Discard the placeholder value and hand back the bare pairs.
  DoFn<Pair<Pair<U, V>, Void>, Pair<U, V>> unwrapKey = new DoFn<Pair<Pair<U, V>, Void>, Pair<U, V>>() {
    @Override
    public void process(Pair<Pair<U, V>, Void> input, Emitter<Pair<U, V>> emitter) {
      emitter.emit(input.first());
    }
  };
  return keyed.groupByKey(options).ungroup().parallelDo(unwrapKey, collection.getPType());
}
Use of org.apache.crunch.types.PTypeFamily in the project crunch by Cloudera.
Class Sort, method sortTriples.
/**
 * Sorts a {@link PCollection} of {@link Tuple3}s according to the given
 * column ordering.
 * <p>
 * Each triple is wrapped as the key of a table whose values are all
 * <code>null</code>, the table is grouped by key using grouping options built
 * from <code>columnOrders</code>, and the keys are then unwrapped again.
 *
 * @param collection the triples to sort
 * @param columnOrders the column ordering handed to the grouping options
 * @return a {@link PCollection} representing the sorted collection.
 */
public static <V1, V2, V3> PCollection<Tuple3<V1, V2, V3>> sortTriples(PCollection<Tuple3<V1, V2, V3>> collection, ColumnOrder... columnOrders) {
  PTypeFamily family = collection.getTypeFamily();
  PType<Tuple3<V1, V2, V3>> tripleType = collection.getPType();
  // The whole triple becomes the table key so grouping and sorting see all three columns.
  @SuppressWarnings("unchecked")
  PTableType<Tuple3<V1, V2, V3>, Void> keyedType = family.tableOf(
      family.triples(tripleType.getSubTypes().get(0), tripleType.getSubTypes().get(1), tripleType.getSubTypes().get(2)),
      family.nulls());

  DoFn<Tuple3<V1, V2, V3>, Pair<Tuple3<V1, V2, V3>, Void>> wrapAsKey =
      new DoFn<Tuple3<V1, V2, V3>, Pair<Tuple3<V1, V2, V3>, Void>>() {
        @Override
        public void process(Tuple3<V1, V2, V3> input, Emitter<Pair<Tuple3<V1, V2, V3>, Void>> emitter) {
          emitter.emit(Pair.of(input, (Void) null));
        }
      };
  PTable<Tuple3<V1, V2, V3>, Void> keyed = collection.parallelDo(wrapAsKey, keyedType);

  GroupingOptions options = buildGroupingOptions(
      collection.getPipeline().getConfiguration(), family, tripleType, columnOrders);

  // Discard the placeholder value and hand back the bare triples.
  DoFn<Pair<Tuple3<V1, V2, V3>, Void>, Tuple3<V1, V2, V3>> unwrapKey =
      new DoFn<Pair<Tuple3<V1, V2, V3>, Void>, Tuple3<V1, V2, V3>>() {
        @Override
        public void process(Pair<Tuple3<V1, V2, V3>, Void> input, Emitter<Tuple3<V1, V2, V3>> emitter) {
          emitter.emit(input.first());
        }
      };
  return keyed.groupByKey(options).ungroup().parallelDo(unwrapKey, collection.getPType());
}
Aggregations