Search in sources :

Example 16 with Pair

use of org.apache.crunch.Pair in project crunch by cloudera.

the class Join method preJoin.

/**
   * Prepares two tables for a join by tagging their records, merging them into
   * one table and grouping the result.
   * <p>
   * Every record of <code>left</code> is re-keyed as <code>(key, 0)</code> with
   * the value <code>(leftValue, null)</code>; every record of <code>right</code>
   * is re-keyed as <code>(key, 1)</code> with the value
   * <code>(null, rightValue)</code>. The union of the two tagged tables is then
   * grouped using the partitioner class that {@link JoinUtils} reports for the
   * left table's type family.
   *
   * @param left the table whose records receive tag 0
   * @param right the table whose records receive tag 1
   * @return the grouped union of the tagged records
   */
private static <K, U, V> PGroupedTable<Pair<K, Integer>, Pair<U, V>> preJoin(PTable<K, U> left, PTable<K, V> right) {
    PTypeFamily family = left.getTypeFamily();
    // Both tagged tables share this type: key = (K, tag), value = (U, V).
    PTableType<Pair<K, Integer>, Pair<U, V>> taggedType = family.tableOf(
            family.pairs(left.getKeyType(), family.ints()),
            family.pairs(left.getValueType(), right.getValueType()));

    MapFn<Pair<K, U>, Pair<Pair<K, Integer>, Pair<U, V>>> tagLeftFn = new MapFn<Pair<K, U>, Pair<Pair<K, Integer>, Pair<U, V>>>() {

        @Override
        public Pair<Pair<K, Integer>, Pair<U, V>> map(Pair<K, U> input) {
            Pair<K, Integer> taggedKey = Pair.of(input.first(), 0);
            // Left records have no right-hand value yet.
            Pair<U, V> leftOnly = Pair.of(input.second(), (V) null);
            return Pair.of(taggedKey, leftOnly);
        }
    };
    PTable<Pair<K, Integer>, Pair<U, V>> leftTagged = left.parallelDo("joinTagLeft", tagLeftFn, taggedType);

    MapFn<Pair<K, V>, Pair<Pair<K, Integer>, Pair<U, V>>> tagRightFn = new MapFn<Pair<K, V>, Pair<Pair<K, Integer>, Pair<U, V>>>() {

        @Override
        public Pair<Pair<K, Integer>, Pair<U, V>> map(Pair<K, V> input) {
            Pair<K, Integer> taggedKey = Pair.of(input.first(), 1);
            // Right records have no left-hand value yet.
            Pair<U, V> rightOnly = Pair.of((U) null, input.second());
            return Pair.of(taggedKey, rightOnly);
        }
    };
    PTable<Pair<K, Integer>, Pair<U, V>> rightTagged = right.parallelDo("joinTagRight", tagRightFn, taggedType);

    GroupingOptions.Builder groupingBuilder = GroupingOptions.builder();
    groupingBuilder.partitionerClass(JoinUtils.getPartitionerClass(family));
    GroupingOptions joinGrouping = groupingBuilder.build();

    PTable<Pair<K, Integer>, Pair<U, V>> combined = leftTagged.union(rightTagged);
    return combined.groupByKey(joinGrouping);
}
Also used : PTypeFamily(org.apache.crunch.types.PTypeFamily) GroupingOptions(org.apache.crunch.GroupingOptions) Pair(org.apache.crunch.Pair)

Example 17 with Pair

use of org.apache.crunch.Pair in project crunch by cloudera.

the class Set method comm.

/**
   * Compares two collections element by element, in the spirit of the Unix
   * <code>comm</code> utility. The result is a {@link PCollection} holding one
   * {@link Tuple3} per distinct element; the slot of the tuple that carries the
   * element tells which collections contain it:
   * <ol>
   * <li>first slot: the element occurs only in <code>coll1</code>,</li>
   * <li>second slot: the element occurs only in <code>coll2</code>,</li>
   * <li>third slot: the element occurs in both collections.</li>
   * </ol>
   * The remaining slots of each tuple are <code>null</code>.
   * 
   * @return a collection of {@link Tuple3} objects
   */
public static <T> PCollection<Tuple3<T, T, T>> comm(PCollection<T> coll1, PCollection<T> coll2) {
    PTypeFamily family = coll1.getTypeFamily();
    PType<T> elementType = coll1.getPType();
    DoFn<Pair<T, Pair<Collection<Boolean>, Collection<Boolean>>>, Tuple3<T, T, T>> classifyFn = new DoFn<Pair<T, Pair<Collection<Boolean>, Collection<Boolean>>>, Tuple3<T, T, T>>() {

        @Override
        public void process(Pair<T, Pair<Collection<Boolean>, Collection<Boolean>>> input, Emitter<Tuple3<T, T, T>> emitter) {
            Pair<Collection<Boolean>, Collection<Boolean>> membership = input.second();
            boolean missingFromFirst = membership.first().isEmpty();
            boolean missingFromSecond = membership.second().isEmpty();
            T element = input.first();

            // Exactly one slot receives the element; if both groups are empty
            // every slot stays null.
            T onlyInFirst = null;
            T onlyInSecond = null;
            T inBoth = null;
            if (!missingFromFirst && !missingFromSecond) {
                inBoth = element;
            } else if (!missingFromFirst) {
                onlyInFirst = element;
            } else if (!missingFromSecond) {
                onlyInSecond = element;
            }
            emitter.emit(Tuple3.of(onlyInFirst, onlyInSecond, inBoth));
        }
    };
    return Cogroup.cogroup(toTable(coll1), toTable(coll2)).parallelDo(classifyFn, family.triples(elementType, elementType, elementType));
}
Also used : PTypeFamily(org.apache.crunch.types.PTypeFamily) Tuple3(org.apache.crunch.Tuple3) Collection(java.util.Collection) PCollection(org.apache.crunch.PCollection) Pair(org.apache.crunch.Pair)

Example 18 with Pair

use of org.apache.crunch.Pair in project crunch by cloudera.

the class Sort method sortTriples.

/**
   * Orders a {@link PCollection} of {@link Tuple3}s according to the given
   * column ordering.
   * <p>
   * Each triple is turned into the key of a table with <code>null</code>
   * values, the table is grouped with the options produced by
   * <code>buildGroupingOptions</code> and ungrouped again, and finally the
   * keys are emitted on their own.
   * 
   * @return a {@link PCollection} representing the sorted collection.
   */
public static <V1, V2, V3> PCollection<Tuple3<V1, V2, V3>> sortTriples(PCollection<Tuple3<V1, V2, V3>> collection, ColumnOrder... columnOrders) {
    PTypeFamily family = collection.getTypeFamily();
    PType<Tuple3<V1, V2, V3>> tripleType = collection.getPType();
    // Table type with the triple as key and nothing as value.
    @SuppressWarnings("unchecked") PTableType<Tuple3<V1, V2, V3>, Void> keyOnlyType = family.tableOf(
            family.triples(tripleType.getSubTypes().get(0), tripleType.getSubTypes().get(1), tripleType.getSubTypes().get(2)),
            family.nulls());

    DoFn<Tuple3<V1, V2, V3>, Pair<Tuple3<V1, V2, V3>, Void>> toKeyFn = new DoFn<Tuple3<V1, V2, V3>, Pair<Tuple3<V1, V2, V3>, Void>>() {

        @Override
        public void process(Tuple3<V1, V2, V3> input, Emitter<Pair<Tuple3<V1, V2, V3>, Void>> emitter) {
            Pair<Tuple3<V1, V2, V3>, Void> keyed = Pair.of(input, (Void) null);
            emitter.emit(keyed);
        }
    };
    PTable<Tuple3<V1, V2, V3>, Void> keyed = collection.parallelDo(toKeyFn, keyOnlyType);

    PTable<Tuple3<V1, V2, V3>, Void> sorted = keyed
            .groupByKey(buildGroupingOptions(collection.getPipeline().getConfiguration(), family, tripleType, columnOrders))
            .ungroup();

    DoFn<Pair<Tuple3<V1, V2, V3>, Void>, Tuple3<V1, V2, V3>> dropValueFn = new DoFn<Pair<Tuple3<V1, V2, V3>, Void>, Tuple3<V1, V2, V3>>() {

        @Override
        public void process(Pair<Tuple3<V1, V2, V3>, Void> input, Emitter<Tuple3<V1, V2, V3>> emitter) {
            Tuple3<V1, V2, V3> triple = input.first();
            emitter.emit(triple);
        }
    };
    return sorted.parallelDo(dropValueFn, collection.getPType());
}
Also used : Configuration(org.apache.hadoop.conf.Configuration) PTypeFamily(org.apache.crunch.types.PTypeFamily) Tuple3(org.apache.crunch.Tuple3) GroupingOptions(org.apache.crunch.GroupingOptions) Pair(org.apache.crunch.Pair)

Example 19 with Pair

use of org.apache.crunch.Pair in project crunch by cloudera.

the class Sort method sortPairs.

/**
   * Orders a {@link PCollection} of {@link Pair}s according to the given
   * column ordering.
   * <p>
   * Each pair is turned into the key of a table with <code>null</code>
   * values, the table is grouped with the options produced by
   * <code>buildGroupingOptions</code> and ungrouped again, and finally the
   * keys are emitted on their own.
   * 
   * @return a {@link PCollection} representing the sorted collection.
   */
public static <U, V> PCollection<Pair<U, V>> sortPairs(PCollection<Pair<U, V>> collection, ColumnOrder... columnOrders) {
    PTypeFamily family = collection.getTypeFamily();
    PType<Pair<U, V>> pairType = collection.getPType();
    // The whole (U, V) pair becomes the key so that grouping and sorting can
    // act on it; the value carries nothing.
    @SuppressWarnings("unchecked") PTableType<Pair<U, V>, Void> keyOnlyType = family.tableOf(
            family.pairs(pairType.getSubTypes().get(0), pairType.getSubTypes().get(1)),
            family.nulls());

    DoFn<Pair<U, V>, Pair<Pair<U, V>, Void>> toKeyFn = new DoFn<Pair<U, V>, Pair<Pair<U, V>, Void>>() {

        @Override
        public void process(Pair<U, V> input, Emitter<Pair<Pair<U, V>, Void>> emitter) {
            Pair<Pair<U, V>, Void> keyed = Pair.of(input, (Void) null);
            emitter.emit(keyed);
        }
    };
    PTable<Pair<U, V>, Void> keyed = collection.parallelDo(toKeyFn, keyOnlyType);

    PTable<Pair<U, V>, Void> sorted = keyed
            .groupByKey(buildGroupingOptions(collection.getPipeline().getConfiguration(), family, pairType, columnOrders))
            .ungroup();

    DoFn<Pair<Pair<U, V>, Void>, Pair<U, V>> dropValueFn = new DoFn<Pair<Pair<U, V>, Void>, Pair<U, V>>() {

        @Override
        public void process(Pair<Pair<U, V>, Void> input, Emitter<Pair<U, V>> emitter) {
            Pair<U, V> pair = input.first();
            emitter.emit(pair);
        }
    };
    return sorted.parallelDo(dropValueFn, collection.getPType());
}
Also used : Configuration(org.apache.hadoop.conf.Configuration) PTypeFamily(org.apache.crunch.types.PTypeFamily) GroupingOptions(org.apache.crunch.GroupingOptions) Pair(org.apache.crunch.Pair)

Example 20 with Pair

use of org.apache.crunch.Pair in project crunch by cloudera.

the class Sort method sortTuples.

/**
   * Orders a {@link PCollection} of {@link TupleN}s according to the given
   * column ordering.
   * <p>
   * Each tuple is turned into the key of a table with <code>null</code>
   * values, the table is grouped with the options produced by
   * <code>buildGroupingOptions</code> and ungrouped again, and finally the
   * keys are emitted on their own.
   * 
   * @return a {@link PCollection} representing the sorted collection.
   */
public static PCollection<TupleN> sortTuples(PCollection<TupleN> collection, ColumnOrder... columnOrders) {
    PTypeFamily family = collection.getTypeFamily();
    PType<TupleN> tupleType = collection.getPType();
    // Table type with the tuple as key and nothing as value; the tuple's
    // component types are taken from the collection's own PType.
    PTableType<TupleN, Void> keyOnlyType = family.tableOf(
            family.tuples(tupleType.getSubTypes().toArray(new PType[0])),
            family.nulls());

    DoFn<TupleN, Pair<TupleN, Void>> toKeyFn = new DoFn<TupleN, Pair<TupleN, Void>>() {

        @Override
        public void process(TupleN input, Emitter<Pair<TupleN, Void>> emitter) {
            Pair<TupleN, Void> keyed = Pair.of(input, (Void) null);
            emitter.emit(keyed);
        }
    };
    PTable<TupleN, Void> keyed = collection.parallelDo(toKeyFn, keyOnlyType);

    PTable<TupleN, Void> sorted = keyed
            .groupByKey(buildGroupingOptions(collection.getPipeline().getConfiguration(), family, tupleType, columnOrders))
            .ungroup();

    DoFn<Pair<TupleN, Void>, TupleN> dropValueFn = new DoFn<Pair<TupleN, Void>, TupleN>() {

        @Override
        public void process(Pair<TupleN, Void> input, Emitter<TupleN> emitter) {
            TupleN tuple = input.first();
            emitter.emit(tuple);
        }
    };
    return sorted.parallelDo(dropValueFn, collection.getPType());
}
Also used : Configuration(org.apache.hadoop.conf.Configuration) PTypeFamily(org.apache.crunch.types.PTypeFamily) GroupingOptions(org.apache.crunch.GroupingOptions) TupleN(org.apache.crunch.TupleN) Pair(org.apache.crunch.Pair)

Aggregations

Pair (org.apache.crunch.Pair): 22, PTypeFamily (org.apache.crunch.types.PTypeFamily): 15, GroupingOptions (org.apache.crunch.GroupingOptions): 6, Configuration (org.apache.hadoop.conf.Configuration): 5, MRPipeline (org.apache.crunch.impl.mr.MRPipeline): 4, Test (org.junit.Test): 4, Collection (java.util.Collection): 3, CombineFn (org.apache.crunch.CombineFn): 3, Emitter (org.apache.crunch.Emitter): 3, PCollection (org.apache.crunch.PCollection): 3, Utf8 (org.apache.avro.util.Utf8): 2, DoFn (org.apache.crunch.DoFn): 2, Pipeline (org.apache.crunch.Pipeline): 2, Tuple3 (org.apache.crunch.Tuple3): 2, Path (org.apache.hadoop.fs.Path): 2, DatasetRepository (com.cloudera.cdk.data.DatasetRepository): 1, StandardEvent (com.cloudera.cdk.data.event.StandardEvent): 1, FileSystemDatasetRepository (com.cloudera.cdk.data.filesystem.FileSystemDatasetRepository): 1, Session (com.cloudera.cdk.examples.demo.event.Session): 1, IOException (java.io.IOException): 1