Search in sources :

Example 21 with PTypeFamily

use of org.apache.crunch.types.PTypeFamily in project crunch by cloudera.

In class Set, the method comm.

/**
 * Compares two collections in the manner of the Unix <code>comm</code>
 * utility. Every distinct element yields one {@link Tuple3}; the slot the
 * element occupies tells which of the inputs contained it:
 * <ol>
 * <li>first slot: the element occurs only in <code>coll1</code>,</li>
 * <li>second slot: the element occurs only in <code>coll2</code>,</li>
 * <li>third slot: the element occurs in both collections.</li>
 * </ol>
 * The two remaining slots of each tuple are <code>null</code>.
 *
 * @param coll1 the first collection; its type family and {@link PType} are
 *          used to build the result type
 * @param coll2 the second collection
 * @return a collection of {@link Tuple3} objects
 */
public static <T> PCollection<Tuple3<T, T, T>> comm(PCollection<T> coll1, PCollection<T> coll2) {
    PTypeFamily family = coll1.getTypeFamily();
    PType<T> elementType = coll1.getPType();
    return Cogroup.cogroup(toTable(coll1), toTable(coll2)).parallelDo(new DoFn<Pair<T, Pair<Collection<Boolean>, Collection<Boolean>>>, Tuple3<T, T, T>>() {

        @Override
        public void process(Pair<T, Pair<Collection<Boolean>, Collection<Boolean>>> input, Emitter<Tuple3<T, T, T>> emitter) {
            Pair<Collection<Boolean>, Collection<Boolean>> membership = input.second();
            boolean missingFromFirst = membership.first().isEmpty();
            boolean missingFromSecond = membership.second().isEmpty();
            T element = input.first();

            // Exactly one slot (at most) receives the element; the others stay null.
            T onlyInFirst = null;
            T onlyInSecond = null;
            T inBoth = null;
            if (!missingFromFirst && !missingFromSecond) {
                inBoth = element;
            } else if (!missingFromFirst) {
                onlyInFirst = element;
            } else if (!missingFromSecond) {
                onlyInSecond = element;
            }
            emitter.emit(Tuple3.of(onlyInFirst, onlyInSecond, inBoth));
        }
    }, family.triples(elementType, elementType, elementType));
}
Also used : PTypeFamily(org.apache.crunch.types.PTypeFamily) Tuple3(org.apache.crunch.Tuple3) Collection(java.util.Collection) PCollection(org.apache.crunch.PCollection) Pair(org.apache.crunch.Pair)

Example 22 with PTypeFamily

use of org.apache.crunch.types.PTypeFamily in project crunch by cloudera.

In class Sort, the method sortTuples.

/**
 * Sorts a {@link PCollection} of {@link TupleN}s according to the given
 * column ordering.
 * <p>
 * Each tuple is turned into the key of a table with <code>null</code> values,
 * the table is grouped by key using grouping options built from the column
 * orders, and the keys are then emitted again.
 *
 * @param collection the tuples to sort
 * @param columnOrders the column ordering to apply
 * @return a {@link PCollection} representing the sorted collection.
 */
public static PCollection<TupleN> sortTuples(PCollection<TupleN> collection, ColumnOrder... columnOrders) {
    PTypeFamily family = collection.getTypeFamily();
    PType<TupleN> tupleType = collection.getPType();

    // Rebuild the tuple type from its component types inside the collection's type family.
    PType[] columnTypes = tupleType.getSubTypes().toArray(new PType[0]);
    PTableType<TupleN, Void> keyedType = family.tableOf(family.tuples(columnTypes), family.nulls());

    DoFn<TupleN, Pair<TupleN, Void>> toKey = new DoFn<TupleN, Pair<TupleN, Void>>() {

        @Override
        public void process(TupleN tuple, Emitter<Pair<TupleN, Void>> out) {
            out.emit(Pair.of(tuple, (Void) null));
        }
    };
    PTable<TupleN, Void> keyed = collection.parallelDo(toKey, keyedType);

    Configuration configuration = collection.getPipeline().getConfiguration();
    GroupingOptions groupingOptions = buildGroupingOptions(configuration, family, tupleType, columnOrders);
    PTable<TupleN, Void> ordered = keyed.groupByKey(groupingOptions).ungroup();

    DoFn<Pair<TupleN, Void>, TupleN> dropValue = new DoFn<Pair<TupleN, Void>, TupleN>() {

        @Override
        public void process(Pair<TupleN, Void> entry, Emitter<TupleN> out) {
            out.emit(entry.first());
        }
    };
    return ordered.parallelDo(dropValue, collection.getPType());
}
Also used : Configuration(org.apache.hadoop.conf.Configuration) PTypeFamily(org.apache.crunch.types.PTypeFamily) GroupingOptions(org.apache.crunch.GroupingOptions) TupleN(org.apache.crunch.TupleN) Pair(org.apache.crunch.Pair)

Example 23 with PTypeFamily

use of org.apache.crunch.types.PTypeFamily in project crunch by cloudera.

In class Sort, the method sort.

/**
 * Sorts a {@link PCollection} by the natural ordering of its elements, in the
 * requested direction.
 * <p>
 * Elements become the keys of a table with <code>null</code> values (stage
 * "sort-pre"), the table is grouped by key with grouping options built for
 * the given order, and the keys are emitted again (stage "sort-post").
 *
 * @param collection the elements to sort
 * @param order the order in which to sort
 * @return a {@link PCollection} representing the sorted collection.
 */
public static <T> PCollection<T> sort(PCollection<T> collection, Order order) {
    PTypeFamily family = collection.getTypeFamily();
    PType<T> elementType = collection.getPType();
    PTableType<T, Void> keyedType = family.tableOf(elementType, family.nulls());

    Configuration configuration = collection.getPipeline().getConfiguration();
    GroupingOptions groupingOptions = buildGroupingOptions(configuration, family, elementType, order);

    DoFn<T, Pair<T, Void>> toKey = new DoFn<T, Pair<T, Void>>() {

        @Override
        public void process(T element, Emitter<Pair<T, Void>> out) {
            out.emit(Pair.of(element, (Void) null));
        }
    };
    PTable<T, Void> keyed = collection.parallelDo("sort-pre", toKey, keyedType);
    PTable<T, Void> ordered = keyed.groupByKey(groupingOptions).ungroup();

    DoFn<Pair<T, Void>, T> dropValue = new DoFn<Pair<T, Void>, T>() {

        @Override
        public void process(Pair<T, Void> entry, Emitter<T> out) {
            out.emit(entry.first());
        }
    };
    return ordered.parallelDo("sort-post", dropValue, elementType);
}
Also used : Configuration(org.apache.hadoop.conf.Configuration) PTypeFamily(org.apache.crunch.types.PTypeFamily) GroupingOptions(org.apache.crunch.GroupingOptions) Pair(org.apache.crunch.Pair)

Example 24 with PTypeFamily

use of org.apache.crunch.types.PTypeFamily in project crunch by cloudera.

In class Sort, the method sortPairs.

/**
 * Sorts a {@link PCollection} of {@link Pair}s according to the given column
 * ordering.
 * <p>
 * The whole pair is used as the key of a table with <code>null</code> values
 * so that grouping and sorting can operate on both components; after
 * grouping, the keys are emitted again.
 *
 * @param collection the pairs to sort
 * @param columnOrders the column ordering to apply
 * @return a {@link PCollection} representing the sorted collection.
 */
public static <U, V> PCollection<Pair<U, V>> sortPairs(PCollection<Pair<U, V>> collection, ColumnOrder... columnOrders) {
    PTypeFamily family = collection.getTypeFamily();
    PType<Pair<U, V>> pairType = collection.getPType();

    // The component types come back untyped, hence the narrowly scoped suppressions.
    @SuppressWarnings("unchecked") PType<U> firstType = pairType.getSubTypes().get(0);
    @SuppressWarnings("unchecked") PType<V> secondType = pairType.getSubTypes().get(1);
    PTableType<Pair<U, V>, Void> keyedType = family.tableOf(family.pairs(firstType, secondType), family.nulls());

    DoFn<Pair<U, V>, Pair<Pair<U, V>, Void>> toKey = new DoFn<Pair<U, V>, Pair<Pair<U, V>, Void>>() {

        @Override
        public void process(Pair<U, V> pair, Emitter<Pair<Pair<U, V>, Void>> out) {
            out.emit(Pair.of(pair, (Void) null));
        }
    };
    PTable<Pair<U, V>, Void> keyed = collection.parallelDo(toKey, keyedType);

    Configuration configuration = collection.getPipeline().getConfiguration();
    GroupingOptions groupingOptions = buildGroupingOptions(configuration, family, pairType, columnOrders);
    PTable<Pair<U, V>, Void> ordered = keyed.groupByKey(groupingOptions).ungroup();

    DoFn<Pair<Pair<U, V>, Void>, Pair<U, V>> dropValue = new DoFn<Pair<Pair<U, V>, Void>, Pair<U, V>>() {

        @Override
        public void process(Pair<Pair<U, V>, Void> entry, Emitter<Pair<U, V>> out) {
            out.emit(entry.first());
        }
    };
    return ordered.parallelDo(dropValue, collection.getPType());
}
Also used : Configuration(org.apache.hadoop.conf.Configuration) PTypeFamily(org.apache.crunch.types.PTypeFamily) GroupingOptions(org.apache.crunch.GroupingOptions) Pair(org.apache.crunch.Pair)

Example 25 with PTypeFamily

use of org.apache.crunch.types.PTypeFamily in project crunch by cloudera.

In class Sort, the method sortTriples.

/**
 * Sorts a {@link PCollection} of {@link Tuple3}s according to the given
 * column ordering.
 * <p>
 * Each triple is used as the key of a table with <code>null</code> values,
 * the table is grouped by key using grouping options built from the column
 * orders, and the keys are then emitted again.
 *
 * @param collection the triples to sort
 * @param columnOrders the column ordering to apply
 * @return a {@link PCollection} representing the sorted collection.
 */
public static <V1, V2, V3> PCollection<Tuple3<V1, V2, V3>> sortTriples(PCollection<Tuple3<V1, V2, V3>> collection, ColumnOrder... columnOrders) {
    PTypeFamily family = collection.getTypeFamily();
    PType<Tuple3<V1, V2, V3>> tripleType = collection.getPType();

    // The component types come back untyped, hence the narrowly scoped suppressions.
    @SuppressWarnings("unchecked") PType<V1> firstType = tripleType.getSubTypes().get(0);
    @SuppressWarnings("unchecked") PType<V2> secondType = tripleType.getSubTypes().get(1);
    @SuppressWarnings("unchecked") PType<V3> thirdType = tripleType.getSubTypes().get(2);
    PTableType<Tuple3<V1, V2, V3>, Void> keyedType = family.tableOf(family.triples(firstType, secondType, thirdType), family.nulls());

    DoFn<Tuple3<V1, V2, V3>, Pair<Tuple3<V1, V2, V3>, Void>> toKey = new DoFn<Tuple3<V1, V2, V3>, Pair<Tuple3<V1, V2, V3>, Void>>() {

        @Override
        public void process(Tuple3<V1, V2, V3> triple, Emitter<Pair<Tuple3<V1, V2, V3>, Void>> out) {
            out.emit(Pair.of(triple, (Void) null));
        }
    };
    PTable<Tuple3<V1, V2, V3>, Void> keyed = collection.parallelDo(toKey, keyedType);

    Configuration configuration = collection.getPipeline().getConfiguration();
    GroupingOptions groupingOptions = buildGroupingOptions(configuration, family, tripleType, columnOrders);
    PTable<Tuple3<V1, V2, V3>, Void> ordered = keyed.groupByKey(groupingOptions).ungroup();

    DoFn<Pair<Tuple3<V1, V2, V3>, Void>, Tuple3<V1, V2, V3>> dropValue = new DoFn<Pair<Tuple3<V1, V2, V3>, Void>, Tuple3<V1, V2, V3>>() {

        @Override
        public void process(Pair<Tuple3<V1, V2, V3>, Void> entry, Emitter<Tuple3<V1, V2, V3>> out) {
            out.emit(entry.first());
        }
    };
    return ordered.parallelDo(dropValue, collection.getPType());
}
Also used : Configuration(org.apache.hadoop.conf.Configuration) PTypeFamily(org.apache.crunch.types.PTypeFamily) Tuple3(org.apache.crunch.Tuple3) GroupingOptions(org.apache.crunch.GroupingOptions) Pair(org.apache.crunch.Pair)

Aggregations

PTypeFamily (org.apache.crunch.types.PTypeFamily)26 Pair (org.apache.crunch.Pair)15 GroupingOptions (org.apache.crunch.GroupingOptions)7 MRPipeline (org.apache.crunch.impl.mr.MRPipeline)7 Test (org.junit.Test)7 Configuration (org.apache.hadoop.conf.Configuration)6 Collection (java.util.Collection)5 PCollection (org.apache.crunch.PCollection)4 CombineFn (org.apache.crunch.CombineFn)2 DoFn (org.apache.crunch.DoFn)2 Emitter (org.apache.crunch.Emitter)2 Tuple3 (org.apache.crunch.Tuple3)2 File (java.io.File)1 List (java.util.List)1 Tuple4 (org.apache.crunch.Tuple4)1 TupleN (org.apache.crunch.TupleN)1 MemPipeline (org.apache.crunch.impl.mem.MemPipeline)1 CrunchRuntimeException (org.apache.crunch.impl.mr.run.CrunchRuntimeException)1 ReadableSourceTarget (org.apache.crunch.io.ReadableSourceTarget)1 SourcePathTargetImpl (org.apache.crunch.io.impl.SourcePathTargetImpl)1