Usage of org.apache.crunch.types.PTypeFamily in the crunch project by Cloudera.
Class Cartesian, method cross.
/**
 * Computes the full cross product of two {@link PTable}s, following the same strategy as Pig's CROSS operator.
 *
 * <p>Each input is re-keyed by a {@code GFCross} function into a table keyed on a
 * {@code Pair<Integer, Integer>}; the two re-keyed tables are co-grouped, and within every group each
 * left record is combined with each right record.
 *
 * @see <a href="http://en.wikipedia.org/wiki/Join_(SQL)#Cross_join">Cross Join</a>
 * @param left The first PTable taking part in the cross join.
 * @param right The second PTable taking part in the cross join.
 * @param parallelism The square root of the number of reducers to use. Increasing parallelism also increases copied data.
 * @param <K1> Key type of the left PTable.
 * @param <K2> Key type of the right PTable.
 * @param <U> Value type of the left {@link PTable}.
 * @param <V> Value type of the right {@link PTable}.
 * @return The joined result as tuples of ((K1,K2), (U,V)).
 */
public static <K1, K2, U, V> PTable<Pair<K1, K2>, Pair<U, V>> cross(
    PTable<K1, U> left,
    PTable<K2, V> right,
    int parallelism) {
  /*
   * This emulates the following PigLatin:
   *   A = foreach table1 generate flatten(GFCross(0, 2)), flatten(*);
   *   B = foreach table2 generate flatten(GFCross(1, 2)), flatten(*);
   *   C = cogroup A by ($0, $1), B by ($0, $1);
   *   result = foreach C generate flatten(A), flatten(B);
   */
  PTypeFamily leftFamily = left.getTypeFamily();
  PTypeFamily rightFamily = right.getTypeFamily();

  // Re-key the left side (GFCross index 0); each original (key, value) entry becomes the new value.
  GFCross<Pair<K1, U>> leftTagger = new GFCross<Pair<K1, U>>(0, parallelism);
  PTable<Pair<Integer, Integer>, Pair<K1, U>> taggedLeft = left.parallelDo(
      leftTagger,
      leftFamily.tableOf(
          leftFamily.pairs(leftFamily.ints(), leftFamily.ints()),
          leftFamily.pairs(left.getKeyType(), left.getValueType())));

  // Re-key the right side the same way (GFCross index 1), using the right table's own type family.
  GFCross<Pair<K2, V>> rightTagger = new GFCross<Pair<K2, V>>(1, parallelism);
  PTable<Pair<Integer, Integer>, Pair<K2, V>> taggedRight = right.parallelDo(
      rightTagger,
      rightFamily.tableOf(
          rightFamily.pairs(rightFamily.ints(), rightFamily.ints()),
          rightFamily.pairs(right.getKeyType(), right.getValueType())));

  // Bring together the left and right records that share the same integer-pair key.
  PTable<Pair<Integer, Integer>, Pair<Collection<Pair<K1, U>>, Collection<Pair<K2, V>>>> grouped =
      taggedLeft.cogroup(taggedRight);
  PTypeFamily groupedFamily = grouped.getTypeFamily();

  // Expands one co-grouped entry into every (left record, right record) combination.
  DoFn<Pair<Pair<Integer, Integer>, Pair<Collection<Pair<K1, U>>, Collection<Pair<K2, V>>>>,
      Pair<Pair<K1, K2>, Pair<U, V>>> expandGroup =
      new DoFn<Pair<Pair<Integer, Integer>, Pair<Collection<Pair<K1, U>>, Collection<Pair<K2, V>>>>,
          Pair<Pair<K1, K2>, Pair<U, V>>>() {
        @Override
        public void process(
            Pair<Pair<Integer, Integer>, Pair<Collection<Pair<K1, U>>, Collection<Pair<K2, V>>>> input,
            Emitter<Pair<Pair<K1, K2>, Pair<U, V>>> emitter) {
          Pair<Collection<Pair<K1, U>>, Collection<Pair<K2, V>>> sides = input.second();
          for (Pair<K1, U> leftRecord : sides.first()) {
            for (Pair<K2, V> rightRecord : sides.second()) {
              Pair<K1, K2> crossedKey = Pair.of(leftRecord.first(), rightRecord.first());
              Pair<U, V> crossedValue = Pair.of(leftRecord.second(), rightRecord.second());
              emitter.emit(Pair.of(crossedKey, crossedValue));
            }
          }
        }
      };

  // The output table type is built from the co-grouped table's type family.
  return grouped.parallelDo(
      expandGroup,
      groupedFamily.tableOf(
          groupedFamily.pairs(left.getKeyType(), right.getKeyType()),
          groupedFamily.pairs(left.getValueType(), right.getValueType())));
}
Usage of org.apache.crunch.types.PTypeFamily in the crunch project by Cloudera.
Class Cogroup, method cogroup.
/**
 * Co-groups the two {@link PTable} arguments on their shared key type.
 *
 * <p>Both tables are first mapped into a common {@code Pair<U, V>} value type (by {@code CogroupFn1}
 * for the left table and {@code CogroupFn2} for the right), unioned, grouped by key, and finally
 * converted by {@code PostGroupFn} into one pair of collections per key.
 *
 * @param left the table supplying values of type {@code U}; its type family and key type are used for
 *        all intermediate and output types
 * @param right the table supplying values of type {@code V}
 * @return a {@code PTable} representing the co-grouped tables.
 */
public static <K, U, V> PTable<K, Pair<Collection<U>, Collection<V>>> cogroup(
    PTable<K, U> left,
    PTable<K, V> right) {
  PTypeFamily family = left.getTypeFamily();
  PType<K> keyPType = left.getPTableType().getKeyType();
  PType<U> leftValuePType = left.getPTableType().getValueType();
  PType<V> rightValuePType = right.getPTableType().getValueType();

  // Common value type that lets records from both sides live in one table.
  PType<Pair<U, V>> taggedPType = family.pairs(leftValuePType, rightValuePType);

  PTable<K, Pair<U, V>> taggedLeft = left.parallelDo(
      "coGroupTag1",
      new CogroupFn1<K, U, V>(),
      family.tableOf(keyPType, taggedPType));
  PTable<K, Pair<U, V>> taggedRight = right.parallelDo(
      "coGroupTag2",
      new CogroupFn2<K, U, V>(),
      family.tableOf(keyPType, taggedPType));
  PTable<K, Pair<U, V>> merged = taggedLeft.union(taggedRight);

  // Output value type: one collection per side for every key.
  PType<Collection<U>> leftGroupPType = family.collections(leftValuePType);
  PType<Collection<V>> rightGroupPType = family.collections(rightValuePType);
  PType<Pair<Collection<U>, Collection<V>>> groupedPType = family.pairs(leftGroupPType, rightGroupPType);

  return merged.groupByKey().parallelDo(
      "cogroup",
      new PostGroupFn<K, U, V>(),
      family.tableOf(keyPType, groupedPType));
}
Usage of org.apache.crunch.types.PTypeFamily in the crunch project by Cloudera.
Class Sort, method sort.
/**
 * Sorts the {@link PTable} using the natural ordering of its keys
 * in the order specified.
 *
 * <p>The sort is carried out by grouping the table by key with grouping options built for the
 * requested order, and then ungrouping the result.
 *
 * @param table the table to sort
 * @param key the order to apply to the table's keys
 * @return a {@link PTable} representing the sorted collection.
 */
public static <K, V> PTable<K, V> sort(PTable<K, V> table, Order key) {
  PTypeFamily family = table.getTypeFamily();
  Configuration configuration = table.getPipeline().getConfiguration();

  // Grouping options carry the key ordering into the group-by step.
  GroupingOptions sortOptions = buildGroupingOptions(
      configuration,
      family,
      table.getKeyType(),
      key);

  return table
      .groupByKey(sortOptions)
      .ungroup();
}
Usage of org.apache.crunch.types.PTypeFamily in the crunch project by Cloudera.
Class Sort, method sortQuads.
/**
 * Sorts the {@link PCollection} of {@link Tuple4}s using the specified column
 * ordering.
 *
 * <p>Every tuple is turned into the key of a table entry with a {@code null} value, the table is
 * grouped by key with grouping options built from the column orders and ungrouped again, and the
 * keys are then extracted back into a plain collection.
 *
 * @param collection the tuples to sort
 * @param columnOrders the column orderings passed on to the grouping options
 * @return a {@link PCollection} representing the sorted collection.
 */
public static <V1, V2, V3, V4> PCollection<Tuple4<V1, V2, V3, V4>> sortQuads(
    PCollection<Tuple4<V1, V2, V3, V4>> collection,
    ColumnOrder... columnOrders) {
  PTypeFamily family = collection.getTypeFamily();
  PType<Tuple4<V1, V2, V3, V4>> quadPType = collection.getPType();

  // Table type whose key is the quad (rebuilt from its four sub-types) and whose value is always null.
  @SuppressWarnings("unchecked")
  PTableType<Tuple4<V1, V2, V3, V4>, Void> keyOnlyType = family.tableOf(
      family.quads(
          quadPType.getSubTypes().get(0),
          quadPType.getSubTypes().get(1),
          quadPType.getSubTypes().get(2),
          quadPType.getSubTypes().get(3)),
      family.nulls());

  // Wraps each tuple as a (tuple, null) table entry so it can be grouped by key.
  DoFn<Tuple4<V1, V2, V3, V4>, Pair<Tuple4<V1, V2, V3, V4>, Void>> toKeyOnlyEntry =
      new DoFn<Tuple4<V1, V2, V3, V4>, Pair<Tuple4<V1, V2, V3, V4>, Void>>() {
        @Override
        public void process(
            Tuple4<V1, V2, V3, V4> input,
            Emitter<Pair<Tuple4<V1, V2, V3, V4>, Void>> emitter) {
          Void noValue = null;
          emitter.emit(Pair.of(input, noValue));
        }
      };
  PTable<Tuple4<V1, V2, V3, V4>, Void> keyed = collection.parallelDo(toKeyOnlyEntry, keyOnlyType);

  Configuration configuration = collection.getPipeline().getConfiguration();
  GroupingOptions sortOptions = buildGroupingOptions(configuration, family, quadPType, columnOrders);
  PTable<Tuple4<V1, V2, V3, V4>, Void> sortedKeyed = keyed.groupByKey(sortOptions).ungroup();

  // Drops the null value again, leaving only the tuples.
  DoFn<Pair<Tuple4<V1, V2, V3, V4>, Void>, Tuple4<V1, V2, V3, V4>> toTuple =
      new DoFn<Pair<Tuple4<V1, V2, V3, V4>, Void>, Tuple4<V1, V2, V3, V4>>() {
        @Override
        public void process(
            Pair<Tuple4<V1, V2, V3, V4>, Void> input,
            Emitter<Tuple4<V1, V2, V3, V4>> emitter) {
          emitter.emit(input.first());
        }
      };
  return sortedKeyed.parallelDo(toTuple, collection.getPType());
}
Usage of org.apache.crunch.types.PTypeFamily in the crunch project by Cloudera.
Class MapsideJoin, method join.
/**
 * Joins two tables with a map-side join. The right-side table is loaded
 * fully into memory, so use this method only when the right side's contents
 * fit in the memory allocated to mappers. The join performed by this method
 * is an inner join.
 *
 * <p>Calling this method materializes the right-side table and runs its pipeline
 * before the join itself is set up.
 *
 * @param left
 *          The left-side table of the join
 * @param right
 *          The right-side table of the join, whose contents will be fully
 *          read into memory
 * @return A table keyed on the join key, containing pairs of joined values
 * @throws CrunchRuntimeException
 *           if the right-side table's pipeline is not an {@code MRPipeline}, or if
 *           its materialized contents are not backed by a path
 */
public static <K, U, V> PTable<K, Pair<U, V>> join(PTable<K, U> left, PTable<K, V> right) {
  if (!(right.getPipeline() instanceof MRPipeline)) {
    throw new CrunchRuntimeException("Map-side join is only supported within a MapReduce context");
  }
  MRPipeline mrPipeline = (MRPipeline) right.getPipeline();

  // Materialize the right side and run the pipeline so its contents are available to read.
  mrPipeline.materialize(right);
  // TODO Move necessary logic to MRPipeline so that we can theoretically
  // optimize this by running the setup of multiple map-side joins concurrently
  mrPipeline.run();

  ReadableSourceTarget<Pair<K, V>> materializedRight = mrPipeline.getMaterializeSourceTarget(right);
  if (!(materializedRight instanceof SourcePathTargetImpl)) {
    throw new CrunchRuntimeException("Right-side contents can't be read from a path");
  }

  // The instanceof check above guarantees the raw type; the generic arguments are
  // carried over from the declared type of the materialized target.
  @SuppressWarnings("unchecked")
  SourcePathTargetImpl<Pair<K, V>> pathBackedRight = (SourcePathTargetImpl<Pair<K, V>>) materializedRight;
  Path rightPath = pathBackedRight.getPath();

  // Register the right side's file with the distributed cache under the pipeline's configuration.
  DistributedCache.addCacheFile(rightPath.toUri(), mrPipeline.getConfiguration());

  MapsideJoinDoFn<K, U, V> joinFn = new MapsideJoinDoFn<K, U, V>(rightPath.toString(), right.getPType());
  PTypeFamily family = left.getTypeFamily();
  return left.parallelDo(
      "mapjoin",
      joinFn,
      family.tableOf(
          left.getKeyType(),
          family.pairs(left.getValueType(), right.getValueType())));
}
Aggregations