Search in sources :

Example 1 with GroupingOptions

use of org.apache.crunch.GroupingOptions in project crunch by cloudera.

the class Sort method sort.

/**
   * Orders a {@link PTable} by its keys, using their natural ordering in the
   * direction requested.
   * <p>
   * The table is grouped by key with grouping options built for the requested
   * order and then ungrouped again.
   * 
   * @param table the table to sort
   * @param key the order to apply to the keys
   * @return a {@link PTable} representing the sorted collection.
   */
public static <K, V> PTable<K, V> sort(PTable<K, V> table, Order key) {
    Configuration configuration = table.getPipeline().getConfiguration();
    GroupingOptions sortOptions = buildGroupingOptions(configuration, table.getTypeFamily(), table.getKeyType(), key);
    return table.groupByKey(sortOptions).ungroup();
}
Also used : PTypeFamily(org.apache.crunch.types.PTypeFamily) Configuration(org.apache.hadoop.conf.Configuration) GroupingOptions(org.apache.crunch.GroupingOptions)

Example 2 with GroupingOptions

use of org.apache.crunch.GroupingOptions in project crunch by cloudera.

the class Sort method sortQuads.

/**
   * Orders a {@link PCollection} of {@link Tuple4}s according to the given
   * column ordering.
   * <p>
   * Every quad is emitted as the key of a table whose values are all
   * {@code null}; that table is grouped by key with grouping options built
   * from {@code columnOrders}, ungrouped, and the keys are emitted again.
   * 
   * @param collection the quads to sort
   * @param columnOrders the column ordering passed on to the grouping options
   * @return a {@link PCollection} representing the sorted collection.
   */
public static <V1, V2, V3, V4> PCollection<Tuple4<V1, V2, V3, V4>> sortQuads(PCollection<Tuple4<V1, V2, V3, V4>> collection, ColumnOrder... columnOrders) {
    PTypeFamily family = collection.getTypeFamily();
    PType<Tuple4<V1, V2, V3, V4>> quadType = collection.getPType();
    // rebuild the quad type from its sub-types so it can serve as the table key
    @SuppressWarnings("unchecked") PType<Tuple4<V1, V2, V3, V4>> keyType = family.quads(quadType.getSubTypes().get(0), quadType.getSubTypes().get(1), quadType.getSubTypes().get(2), quadType.getSubTypes().get(3));
    PTableType<Tuple4<V1, V2, V3, V4>, Void> keyedType = family.tableOf(keyType, family.nulls());
    DoFn<Tuple4<V1, V2, V3, V4>, Pair<Tuple4<V1, V2, V3, V4>, Void>> toKey = new DoFn<Tuple4<V1, V2, V3, V4>, Pair<Tuple4<V1, V2, V3, V4>, Void>>() {

        @Override
        public void process(Tuple4<V1, V2, V3, V4> input, Emitter<Pair<Tuple4<V1, V2, V3, V4>, Void>> emitter) {
            emitter.emit(Pair.of(input, (Void) null));
        }
    };
    PTable<Tuple4<V1, V2, V3, V4>, Void> keyed = collection.parallelDo(toKey, keyedType);
    GroupingOptions sortOptions = buildGroupingOptions(collection.getPipeline().getConfiguration(), family, quadType, columnOrders);
    PTable<Tuple4<V1, V2, V3, V4>, Void> ordered = keyed.groupByKey(sortOptions).ungroup();
    // drop the null value again, keeping only the quad
    return ordered.parallelDo(new DoFn<Pair<Tuple4<V1, V2, V3, V4>, Void>, Tuple4<V1, V2, V3, V4>>() {

        @Override
        public void process(Pair<Tuple4<V1, V2, V3, V4>, Void> input, Emitter<Tuple4<V1, V2, V3, V4>> emitter) {
            emitter.emit(input.first());
        }
    }, collection.getPType());
}
Also used : Configuration(org.apache.hadoop.conf.Configuration) Tuple4(org.apache.crunch.Tuple4) PTypeFamily(org.apache.crunch.types.PTypeFamily) GroupingOptions(org.apache.crunch.GroupingOptions) Pair(org.apache.crunch.Pair)

Example 3 with GroupingOptions

use of org.apache.crunch.GroupingOptions in project crunch by cloudera.

the class Sort method sortTriples.

/**
   * Orders a {@link PCollection} of {@link Tuple3}s according to the given
   * column ordering.
   * <p>
   * Every triple is emitted as the key of a table whose values are all
   * {@code null}; that table is grouped by key with grouping options built
   * from {@code columnOrders}, ungrouped, and the keys are emitted again.
   * 
   * @param collection the triples to sort
   * @param columnOrders the column ordering passed on to the grouping options
   * @return a {@link PCollection} representing the sorted collection.
   */
public static <V1, V2, V3> PCollection<Tuple3<V1, V2, V3>> sortTriples(PCollection<Tuple3<V1, V2, V3>> collection, ColumnOrder... columnOrders) {
    PTypeFamily family = collection.getTypeFamily();
    PType<Tuple3<V1, V2, V3>> tripleType = collection.getPType();
    // rebuild the triple type from its sub-types so it can serve as the table key
    @SuppressWarnings("unchecked") PType<Tuple3<V1, V2, V3>> keyType = family.triples(tripleType.getSubTypes().get(0), tripleType.getSubTypes().get(1), tripleType.getSubTypes().get(2));
    PTableType<Tuple3<V1, V2, V3>, Void> keyedType = family.tableOf(keyType, family.nulls());
    DoFn<Tuple3<V1, V2, V3>, Pair<Tuple3<V1, V2, V3>, Void>> toKey = new DoFn<Tuple3<V1, V2, V3>, Pair<Tuple3<V1, V2, V3>, Void>>() {

        @Override
        public void process(Tuple3<V1, V2, V3> input, Emitter<Pair<Tuple3<V1, V2, V3>, Void>> emitter) {
            emitter.emit(Pair.of(input, (Void) null));
        }
    };
    PTable<Tuple3<V1, V2, V3>, Void> keyed = collection.parallelDo(toKey, keyedType);
    GroupingOptions sortOptions = buildGroupingOptions(collection.getPipeline().getConfiguration(), family, tripleType, columnOrders);
    PTable<Tuple3<V1, V2, V3>, Void> ordered = keyed.groupByKey(sortOptions).ungroup();
    // drop the null value again, keeping only the triple
    return ordered.parallelDo(new DoFn<Pair<Tuple3<V1, V2, V3>, Void>, Tuple3<V1, V2, V3>>() {

        @Override
        public void process(Pair<Tuple3<V1, V2, V3>, Void> input, Emitter<Tuple3<V1, V2, V3>> emitter) {
            emitter.emit(input.first());
        }
    }, collection.getPType());
}
Also used : Configuration(org.apache.hadoop.conf.Configuration) PTypeFamily(org.apache.crunch.types.PTypeFamily) Tuple3(org.apache.crunch.Tuple3) GroupingOptions(org.apache.crunch.GroupingOptions) Pair(org.apache.crunch.Pair)

Example 4 with GroupingOptions

use of org.apache.crunch.GroupingOptions in project crunch by cloudera.

the class Sort method sortPairs.

/**
   * Orders a {@link PCollection} of {@link Pair}s according to the given
   * column ordering.
   * <p>
   * Every pair is emitted as the key of a table whose values are all
   * {@code null}; that table is grouped by key with grouping options built
   * from {@code columnOrders}, ungrouped, and the keys are emitted again.
   * 
   * @param collection the pairs to sort
   * @param columnOrders the column ordering passed on to the grouping options
   * @return a {@link PCollection} representing the sorted collection.
   */
public static <U, V> PCollection<Pair<U, V>> sortPairs(PCollection<Pair<U, V>> collection, ColumnOrder... columnOrders) {
    PTypeFamily family = collection.getTypeFamily();
    PType<Pair<U, V>> pairType = collection.getPType();
    // U and V travel together as the table key so grouping and sorting see both
    @SuppressWarnings("unchecked") PType<Pair<U, V>> keyType = family.pairs(pairType.getSubTypes().get(0), pairType.getSubTypes().get(1));
    PTableType<Pair<U, V>, Void> keyedType = family.tableOf(keyType, family.nulls());
    DoFn<Pair<U, V>, Pair<Pair<U, V>, Void>> toKey = new DoFn<Pair<U, V>, Pair<Pair<U, V>, Void>>() {

        @Override
        public void process(Pair<U, V> input, Emitter<Pair<Pair<U, V>, Void>> emitter) {
            emitter.emit(Pair.of(input, (Void) null));
        }
    };
    PTable<Pair<U, V>, Void> keyed = collection.parallelDo(toKey, keyedType);
    GroupingOptions sortOptions = buildGroupingOptions(collection.getPipeline().getConfiguration(), family, pairType, columnOrders);
    PTable<Pair<U, V>, Void> ordered = keyed.groupByKey(sortOptions).ungroup();
    // drop the null value again, keeping only the original pair
    return ordered.parallelDo(new DoFn<Pair<Pair<U, V>, Void>, Pair<U, V>>() {

        @Override
        public void process(Pair<Pair<U, V>, Void> input, Emitter<Pair<U, V>> emitter) {
            emitter.emit(input.first());
        }
    }, collection.getPType());
}
Also used : Configuration(org.apache.hadoop.conf.Configuration) PTypeFamily(org.apache.crunch.types.PTypeFamily) GroupingOptions(org.apache.crunch.GroupingOptions) Pair(org.apache.crunch.Pair)

Example 5 with GroupingOptions

use of org.apache.crunch.GroupingOptions in project crunch by cloudera.

the class Sort method sortTuples.

/**
   * Orders a {@link PCollection} of {@link TupleN}s according to the given
   * column ordering.
   * <p>
   * Every tuple is emitted as the key of a table whose values are all
   * {@code null}; that table is grouped by key with grouping options built
   * from {@code columnOrders}, ungrouped, and the keys are emitted again.
   * 
   * @param collection the tuples to sort
   * @param columnOrders the column ordering passed on to the grouping options
   * @return a {@link PCollection} representing the sorted collection.
   */
public static PCollection<TupleN> sortTuples(PCollection<TupleN> collection, ColumnOrder... columnOrders) {
    PTypeFamily family = collection.getTypeFamily();
    PType<TupleN> tupleType = collection.getPType();
    // rebuild the tuple type from its sub-types so it can serve as the table key
    PType<TupleN> keyType = family.tuples(tupleType.getSubTypes().toArray(new PType[0]));
    PTableType<TupleN, Void> keyedType = family.tableOf(keyType, family.nulls());
    DoFn<TupleN, Pair<TupleN, Void>> toKey = new DoFn<TupleN, Pair<TupleN, Void>>() {

        @Override
        public void process(TupleN input, Emitter<Pair<TupleN, Void>> emitter) {
            emitter.emit(Pair.of(input, (Void) null));
        }
    };
    PTable<TupleN, Void> keyed = collection.parallelDo(toKey, keyedType);
    GroupingOptions sortOptions = buildGroupingOptions(collection.getPipeline().getConfiguration(), family, tupleType, columnOrders);
    PTable<TupleN, Void> ordered = keyed.groupByKey(sortOptions).ungroup();
    // drop the null value again, keeping only the tuple
    return ordered.parallelDo(new DoFn<Pair<TupleN, Void>, TupleN>() {

        @Override
        public void process(Pair<TupleN, Void> input, Emitter<TupleN> emitter) {
            emitter.emit(input.first());
        }
    }, collection.getPType());
}
Also used : Configuration(org.apache.hadoop.conf.Configuration) PTypeFamily(org.apache.crunch.types.PTypeFamily) GroupingOptions(org.apache.crunch.GroupingOptions) TupleN(org.apache.crunch.TupleN) Pair(org.apache.crunch.Pair)

Aggregations

GroupingOptions (org.apache.crunch.GroupingOptions): 6, PTypeFamily (org.apache.crunch.types.PTypeFamily): 6, Configuration (org.apache.hadoop.conf.Configuration): 6, Pair (org.apache.crunch.Pair): 5, Tuple3 (org.apache.crunch.Tuple3): 1, Tuple4 (org.apache.crunch.Tuple4): 1, TupleN (org.apache.crunch.TupleN): 1