Search in sources :

Example 6 with PTypeFamily

use of org.apache.crunch.types.PTypeFamily in project crunch by cloudera.

the class Cartesian method cross.

/**
 * Computes the full cross join of two {@link PTable}s, following the same
 * strategy as Pig's CROSS operator.
 *
 * @see <a href="http://en.wikipedia.org/wiki/Join_(SQL)#Cross_join">Cross Join</a>
 * @param left The first PTable taking part in the cross join.
 * @param right The second PTable taking part in the cross join.
 * @param parallelism The square root of the number of reducers to use.  Raising it also raises the amount of copied data.
 * @param <K1> Key type of the left PTable.
 * @param <K2> Key type of the right PTable.
 * @param <U> Value type of the left {@link PTable}
 * @param <V> Value type of the right {@link PTable}
 * @return The joined result as tuples of ((K1,K2), (U,V)).
 */
public static <K1, K2, U, V> PTable<Pair<K1, K2>, Pair<U, V>> cross(PTable<K1, U> left, PTable<K2, V> right, int parallelism) {
    /* This emulates the following PigLatin script:
     *   A  = foreach table1 generate flatten(GFCross(0, 2)), flatten(*);
     *   B  = foreach table2 generate flatten(GFCross(1, 2)), flatten(*);
     *   C = cogroup A by ($0, $1), B by ($0, $1);
     *   result = foreach C generate flatten(A), flatten(B);
     */
    PTypeFamily leftFamily = left.getTypeFamily();
    PTypeFamily rightFamily = right.getTypeFamily();

    // Re-key each left record (as a whole key/value pair) under an (Integer, Integer) key.
    GFCross<Pair<K1, U>> leftTagger = new GFCross<Pair<K1, U>>(0, parallelism);
    PTable<Pair<Integer, Integer>, Pair<K1, U>> taggedLeft = left.parallelDo(
        leftTagger,
        leftFamily.tableOf(
            leftFamily.pairs(leftFamily.ints(), leftFamily.ints()),
            leftFamily.pairs(left.getKeyType(), left.getValueType())));

    // Same treatment for the right side, using index 1 instead of 0.
    GFCross<Pair<K2, V>> rightTagger = new GFCross<Pair<K2, V>>(1, parallelism);
    PTable<Pair<Integer, Integer>, Pair<K2, V>> taggedRight = right.parallelDo(
        rightTagger,
        rightFamily.tableOf(
            rightFamily.pairs(rightFamily.ints(), rightFamily.ints()),
            rightFamily.pairs(right.getKeyType(), right.getValueType())));

    // Bring together the left and right records that share an (Integer, Integer) key.
    PTable<Pair<Integer, Integer>, Pair<Collection<Pair<K1, U>>, Collection<Pair<K2, V>>>> coGrouped = taggedLeft.cogroup(taggedRight);
    PTypeFamily resultFamily = coGrouped.getTypeFamily();

    // Within each group, pair every left record with every right record.
    DoFn<Pair<Pair<Integer, Integer>, Pair<Collection<Pair<K1, U>>, Collection<Pair<K2, V>>>>, Pair<Pair<K1, K2>, Pair<U, V>>> pairUp =
        new DoFn<Pair<Pair<Integer, Integer>, Pair<Collection<Pair<K1, U>>, Collection<Pair<K2, V>>>>, Pair<Pair<K1, K2>, Pair<U, V>>>() {

            @Override
            public void process(Pair<Pair<Integer, Integer>, Pair<Collection<Pair<K1, U>>, Collection<Pair<K2, V>>>> input, Emitter<Pair<Pair<K1, K2>, Pair<U, V>>> emitter) {
                Pair<Collection<Pair<K1, U>>, Collection<Pair<K2, V>>> grouped = input.second();
                Collection<Pair<K1, U>> leftRecords = grouped.first();
                Collection<Pair<K2, V>> rightRecords = grouped.second();
                for (Pair<K1, U> leftRecord : leftRecords) {
                    for (Pair<K2, V> rightRecord : rightRecords) {
                        Pair<K1, K2> joinedKey = Pair.of(leftRecord.first(), rightRecord.first());
                        Pair<U, V> joinedValue = Pair.of(leftRecord.second(), rightRecord.second());
                        emitter.emit(Pair.of(joinedKey, joinedValue));
                    }
                }
            }
        };

    return coGrouped.parallelDo(
        pairUp,
        resultFamily.tableOf(
            resultFamily.pairs(left.getKeyType(), right.getKeyType()),
            resultFamily.pairs(left.getValueType(), right.getValueType())));
}
Also used : PTypeFamily(org.apache.crunch.types.PTypeFamily) Collection(java.util.Collection) PCollection(org.apache.crunch.PCollection) Pair(org.apache.crunch.Pair)

Example 7 with PTypeFamily

use of org.apache.crunch.types.PTypeFamily in project crunch by cloudera.

the class Cogroup method cogroup.

/**
 * Co-groups two {@link PTable}s that share a key type.
 *
 * <p>Both inputs are first converted to tables with the common value type
 * {@code Pair<U, V>}, unioned, grouped by key, and finally turned into a pair
 * of collections per key. The type family and key type of {@code left} are
 * used for all intermediate and result types.
 *
 * @param left the table supplying the first collection of each result pair
 * @param right the table supplying the second collection of each result pair
 * @return a {@code PTable} representing the co-grouped tables.
 */
public static <K, U, V> PTable<K, Pair<Collection<U>, Collection<V>>> cogroup(PTable<K, U> left, PTable<K, V> right) {
    PTypeFamily family = left.getTypeFamily();
    PType<K> kType = left.getPTableType().getKeyType();
    PType<U> uType = left.getPTableType().getValueType();
    PType<V> vType = right.getPTableType().getValueType();

    // Value type shared by both inputs once converted, and the final grouped value type.
    PType<Pair<U, V>> taggedType = family.pairs(uType, vType);
    PType<Pair<Collection<U>, Collection<V>>> groupedType =
        family.pairs(family.collections(uType), family.collections(vType));

    // Convert each side to the shared Pair<U, V> value type so the two tables can be unioned.
    PTable<K, Pair<U, V>> taggedLeft =
        left.parallelDo("coGroupTag1", new CogroupFn1<K, U, V>(), family.tableOf(kType, taggedType));
    PTable<K, Pair<U, V>> taggedRight =
        right.parallelDo("coGroupTag2", new CogroupFn2<K, U, V>(), family.tableOf(kType, taggedType));

    // Union, group by key, then rebuild the per-key collections.
    return taggedLeft
        .union(taggedRight)
        .groupByKey()
        .parallelDo("cogroup", new PostGroupFn<K, U, V>(), family.tableOf(kType, groupedType));
}
Also used : PTypeFamily(org.apache.crunch.types.PTypeFamily) Pair(org.apache.crunch.Pair)

Example 8 with PTypeFamily

use of org.apache.crunch.types.PTypeFamily in project crunch by cloudera.

the class Sort method sort.

/**
 * Sorts the given {@link PTable} by the natural ordering of its keys, in the
 * requested direction.
 *
 * <p>The sort is carried out by grouping the table by key with grouping
 * options built for the requested order, then ungrouping the result.
 *
 * @param table the table to sort
 * @param key the order to apply to the table's keys
 * @return a {@link PTable} representing the sorted collection.
 */
public static <K, V> PTable<K, V> sort(PTable<K, V> table, Order key) {
    final PTypeFamily family = table.getTypeFamily();
    final Configuration configuration = table.getPipeline().getConfiguration();
    final GroupingOptions sortOptions =
        buildGroupingOptions(configuration, family, table.getKeyType(), key);
    return table.groupByKey(sortOptions).ungroup();
}
Also used : PTypeFamily(org.apache.crunch.types.PTypeFamily) Configuration(org.apache.hadoop.conf.Configuration) GroupingOptions(org.apache.crunch.GroupingOptions)

Example 9 with PTypeFamily

use of org.apache.crunch.types.PTypeFamily in project crunch by cloudera.

the class Sort method sortQuads.

/**
 * Sorts a {@link PCollection} of {@link Tuple4}s according to the given column
 * ordering.
 *
 * <p>Each tuple is turned into the key of a table with {@code Void} values,
 * the table is grouped with options built from {@code columnOrders} and
 * ungrouped again, and the keys are then emitted as the result.
 *
 * @param collection the tuples to sort
 * @param columnOrders the column ordering to sort by
 * @return a {@link PCollection} representing the sorted collection.
 */
public static <V1, V2, V3, V4> PCollection<Tuple4<V1, V2, V3, V4>> sortQuads(PCollection<Tuple4<V1, V2, V3, V4>> collection, ColumnOrder... columnOrders) {
    PTypeFamily family = collection.getTypeFamily();
    PType<Tuple4<V1, V2, V3, V4>> quadType = collection.getPType();

    // Table type keyed by the tuple itself, rebuilt from its four sub-types, with null values.
    @SuppressWarnings("unchecked")
    PTableType<Tuple4<V1, V2, V3, V4>, Void> keyOnlyType = family.tableOf(
        family.quads(
            quadType.getSubTypes().get(0),
            quadType.getSubTypes().get(1),
            quadType.getSubTypes().get(2),
            quadType.getSubTypes().get(3)),
        family.nulls());

    // Wrap every tuple as (tuple, null) so it can be grouped by key.
    DoFn<Tuple4<V1, V2, V3, V4>, Pair<Tuple4<V1, V2, V3, V4>, Void>> toKey =
        new DoFn<Tuple4<V1, V2, V3, V4>, Pair<Tuple4<V1, V2, V3, V4>, Void>>() {

            @Override
            public void process(Tuple4<V1, V2, V3, V4> input, Emitter<Pair<Tuple4<V1, V2, V3, V4>, Void>> emitter) {
                emitter.emit(Pair.of(input, (Void) null));
            }
        };
    PTable<Tuple4<V1, V2, V3, V4>, Void> keyed = collection.parallelDo(toKey, keyOnlyType);

    Configuration configuration = collection.getPipeline().getConfiguration();
    GroupingOptions sortOptions = buildGroupingOptions(configuration, family, quadType, columnOrders);
    PTable<Tuple4<V1, V2, V3, V4>, Void> sorted = keyed.groupByKey(sortOptions).ungroup();

    // Drop the null value again and hand back just the tuples.
    DoFn<Pair<Tuple4<V1, V2, V3, V4>, Void>, Tuple4<V1, V2, V3, V4>> fromKey =
        new DoFn<Pair<Tuple4<V1, V2, V3, V4>, Void>, Tuple4<V1, V2, V3, V4>>() {

            @Override
            public void process(Pair<Tuple4<V1, V2, V3, V4>, Void> input, Emitter<Tuple4<V1, V2, V3, V4>> emitter) {
                emitter.emit(input.first());
            }
        };
    return sorted.parallelDo(fromKey, collection.getPType());
}
Also used : Configuration(org.apache.hadoop.conf.Configuration) Tuple4(org.apache.crunch.Tuple4) PTypeFamily(org.apache.crunch.types.PTypeFamily) GroupingOptions(org.apache.crunch.GroupingOptions) Pair(org.apache.crunch.Pair)

Example 10 with PTypeFamily

use of org.apache.crunch.types.PTypeFamily in project crunch by cloudera.

the class MapsideJoin method join.

/**
 * Performs an inner join of two tables on the map side. The right-side table
 * is loaded into memory in full, so only use this method when the contents of
 * the right-side table fit in the memory allocated to mappers.
 *
 * <p>Note that calling this method materializes the right-side table and runs
 * the pipeline before the join itself is set up.
 *
 * @param left
 *          The left-side table of the join
 * @param right
 *          The right-side table of the join, whose contents will be fully
 *          read into memory
 * @return A table keyed on the join key, containing pairs of joined values
 * @throws CrunchRuntimeException
 *           if the right-side table's pipeline is not an {@code MRPipeline},
 *           or if its materialized contents cannot be read from a path
 */
public static <K, U, V> PTable<K, Pair<U, V>> join(PTable<K, U> left, PTable<K, V> right) {
    if (!(right.getPipeline() instanceof MRPipeline)) {
        throw new CrunchRuntimeException("Map-side join is only supported within a MapReduce context");
    }
    MRPipeline mrPipeline = (MRPipeline) right.getPipeline();

    // Materialize the right side and run the pipeline so its contents exist before the join.
    mrPipeline.materialize(right);
    // TODO Move necessary logic to MRPipeline so that we can theoretically
    // optimize this by running the setup of multiple map-side joins concurrently
    mrPipeline.run();

    ReadableSourceTarget<Pair<K, V>> materializedRight = mrPipeline.getMaterializeSourceTarget(right);
    if (!(materializedRight instanceof SourcePathTargetImpl)) {
        throw new CrunchRuntimeException("Right-side contents can't be read from a path");
    }
    // The instanceof check above guards this cast; only the generic arguments are unchecked.
    @SuppressWarnings("unchecked")
    SourcePathTargetImpl<Pair<K, V>> pathBackedRight = (SourcePathTargetImpl<Pair<K, V>>) materializedRight;

    // Register the materialized right side as a distributed-cache file.
    Path rightPath = pathBackedRight.getPath();
    DistributedCache.addCacheFile(rightPath.toUri(), mrPipeline.getConfiguration());

    MapsideJoinDoFn<K, U, V> joinFn = new MapsideJoinDoFn<K, U, V>(rightPath.toString(), right.getPType());
    PTypeFamily family = left.getTypeFamily();
    return left.parallelDo(
        "mapjoin",
        joinFn,
        family.tableOf(
            left.getKeyType(),
            family.pairs(left.getValueType(), right.getValueType())));
}
Also used : Path(org.apache.hadoop.fs.Path) MRPipeline(org.apache.crunch.impl.mr.MRPipeline) PTypeFamily(org.apache.crunch.types.PTypeFamily) SourcePathTargetImpl(org.apache.crunch.io.impl.SourcePathTargetImpl) CrunchRuntimeException(org.apache.crunch.impl.mr.run.CrunchRuntimeException) Pair(org.apache.crunch.Pair)

Aggregations

PTypeFamily (org.apache.crunch.types.PTypeFamily): 26; Pair (org.apache.crunch.Pair): 15; GroupingOptions (org.apache.crunch.GroupingOptions): 7; MRPipeline (org.apache.crunch.impl.mr.MRPipeline): 7; Test (org.junit.Test): 7; Configuration (org.apache.hadoop.conf.Configuration): 6; Collection (java.util.Collection): 5; PCollection (org.apache.crunch.PCollection): 4; CombineFn (org.apache.crunch.CombineFn): 2; DoFn (org.apache.crunch.DoFn): 2; Emitter (org.apache.crunch.Emitter): 2; Tuple3 (org.apache.crunch.Tuple3): 2; File (java.io.File): 1; List (java.util.List): 1; Tuple4 (org.apache.crunch.Tuple4): 1; TupleN (org.apache.crunch.TupleN): 1; MemPipeline (org.apache.crunch.impl.mem.MemPipeline): 1; CrunchRuntimeException (org.apache.crunch.impl.mr.run.CrunchRuntimeException): 1; ReadableSourceTarget (org.apache.crunch.io.ReadableSourceTarget): 1; SourcePathTargetImpl (org.apache.crunch.io.impl.SourcePathTargetImpl): 1