use of org.apache.crunch.Pair in project crunch by cloudera.
the class MemPipeline method write.
/**
 * Writes the materialized contents of a collection as text to a file named
 * "out" underneath the target's path.
 *
 * <p>Elements of a {@link PTable} are written as the string forms of the key
 * and the value separated by a tab; elements of any other collection are
 * written via {@code toString()}. Each record is terminated by a carriage
 * return / line feed pair.
 *
 * <p>Problems are logged instead of being thrown: an {@link IOException}
 * raised while creating, writing or closing the file, and a target that is not
 * a {@link PathTarget}, each produce an error log entry.
 *
 * @param collection the collection whose materialized elements are written
 * @param target the destination; only {@link PathTarget} instances are handled
 */
@Override
public void write(PCollection<?> collection, Target target) {
  if (!(target instanceof PathTarget)) {
    LOG.error("Target " + target + " is not a PathTarget instance");
    return;
  }
  Path path = ((PathTarget) target).getPath();
  try {
    FileSystem fs = FileSystem.get(conf);
    FSDataOutputStream os = fs.create(new Path(path, "out"));
    try {
      if (collection instanceof PTable) {
        for (Object o : collection.materialize()) {
          Pair<?, ?> p = (Pair<?, ?>) o;
          os.writeBytes(p.first().toString());
          os.writeBytes("\t");
          os.writeBytes(p.second().toString());
          os.writeBytes("\r\n");
        }
      } else {
        for (Object o : collection.materialize()) {
          os.writeBytes(o.toString() + "\r\n");
        }
      }
    } finally {
      // Close on every exit path so a failure part-way through the write
      // (checked or unchecked) does not leak the open stream.
      os.close();
    }
  } catch (IOException e) {
    LOG.error("Exception writing target: " + target, e);
  }
}
use of org.apache.crunch.Pair in project crunch by cloudera.
the class MapsideJoinTest method testMapsideJoin_RightSideIsEmpty.
/**
 * Map-side joins the customer table with an order table that has been passed
 * through {@code NegativeFilter}, and checks that the materialized result
 * contains no rows.
 */
@Test
public void testMapsideJoin_RightSideIsEmpty() throws IOException {
  MRPipeline mrPipeline = new MRPipeline(MapsideJoinTest.class);
  PTable<Integer, String> customers = readTable(mrPipeline, "customers.txt");
  PTable<Integer, String> orders = readTable(mrPipeline, "orders.txt");

  // Right side of the join: the orders after NegativeFilter has been applied
  // (presumably it rejects every record, per the test name -- confirm).
  PTable<Integer, String> rightSide = orders.parallelDo(new NegativeFilter(), orders.getPTableType());

  PTable<Integer, Pair<String, String>> joinResult = MapsideJoin.join(customers, rightSide);
  assertTrue(Lists.newArrayList(joinResult.materialize()).isEmpty());
}
use of org.apache.crunch.Pair in project crunch by cloudera.
the class AvrosTest method testPairs.
/**
 * Exercises the Avro pair type built from two string types: the Java-side
 * value is the pair ("a", "b") and the Avro-side value is a generic record
 * whose two fields hold the same strings as {@link Utf8}.
 */
@Test
public void testPairs() throws Exception {
  AvroType<Pair<String, String>> pairType = Avros.pairs(Avros.strings(), Avros.strings());
  Pair<String, String> javaValue = Pair.of("a", "b");

  // Build the Avro-side record against the schema of the pair type.
  GenericData.Record avroValue = new GenericData.Record(pairType.getSchema());
  avroValue.put(0, new Utf8("a"));
  avroValue.put(1, new Utf8("b"));

  testInputOutputFn(pairType, javaValue, avroValue);
}
use of org.apache.crunch.Pair in project crunch by cloudera.
the class Sort method sortQuads.
/**
 * Orders a {@link PCollection} of {@link Tuple4} values according to the
 * supplied column orderings.
 *
 * <p>Each quad is first turned into the key of a table with a null value, the
 * table is grouped by key using grouping options built from the column
 * orderings, and the keys are then extracted again from the ungrouped table.
 *
 * @param collection the quads to sort
 * @param columnOrders the column orderings used to build the grouping options
 * @return a {@link PCollection} representing the sorted collection.
 */
public static <V1, V2, V3, V4> PCollection<Tuple4<V1, V2, V3, V4>> sortQuads(
    PCollection<Tuple4<V1, V2, V3, V4>> collection, ColumnOrder... columnOrders) {
  PTypeFamily family = collection.getTypeFamily();
  PType<Tuple4<V1, V2, V3, V4>> quadType = collection.getPType();

  // Table type with the quad as key and a null value.
  @SuppressWarnings("unchecked")
  PTableType<Tuple4<V1, V2, V3, V4>, Void> keyOnlyType = family.tableOf(
      family.quads(
          quadType.getSubTypes().get(0),
          quadType.getSubTypes().get(1),
          quadType.getSubTypes().get(2),
          quadType.getSubTypes().get(3)),
      family.nulls());

  // Wraps every quad as the key of a (quad, null) pair.
  DoFn<Tuple4<V1, V2, V3, V4>, Pair<Tuple4<V1, V2, V3, V4>, Void>> toKeyOnlyPair =
      new DoFn<Tuple4<V1, V2, V3, V4>, Pair<Tuple4<V1, V2, V3, V4>, Void>>() {
        @Override
        public void process(Tuple4<V1, V2, V3, V4> input,
            Emitter<Pair<Tuple4<V1, V2, V3, V4>, Void>> emitter) {
          emitter.emit(Pair.of(input, (Void) null));
        }
      };
  PTable<Tuple4<V1, V2, V3, V4>, Void> keyed = collection.parallelDo(toKeyOnlyPair, keyOnlyType);

  Configuration configuration = collection.getPipeline().getConfiguration();
  GroupingOptions groupingOptions = buildGroupingOptions(configuration, family, quadType, columnOrders);
  PTable<Tuple4<V1, V2, V3, V4>, Void> sorted = keyed.groupByKey(groupingOptions).ungroup();

  // Unwraps the key of each pair, discarding the null value.
  DoFn<Pair<Tuple4<V1, V2, V3, V4>, Void>, Tuple4<V1, V2, V3, V4>> toQuad =
      new DoFn<Pair<Tuple4<V1, V2, V3, V4>, Void>, Tuple4<V1, V2, V3, V4>>() {
        @Override
        public void process(Pair<Tuple4<V1, V2, V3, V4>, Void> input,
            Emitter<Tuple4<V1, V2, V3, V4>> emitter) {
          emitter.emit(input.first());
        }
      };
  return sorted.parallelDo(toQuad, collection.getPType());
}
use of org.apache.crunch.Pair in project crunch by cloudera.
the class MapsideJoin method join.
/**
 * Performs an inner join of two tables on the map side.
 *
 * <p>The right-side table is materialized by running the pipeline, and the
 * path of the materialized output is registered with the distributed cache so
 * that the join function can load it. Because the right side is read fully
 * into memory, only use this method when the right-side table's contents fit
 * in the memory allocated to mappers.
 *
 * @param left
 *          The left-side table of the join
 * @param right
 *          The right-side table of the join, whose contents will be fully
 *          read into memory
 * @return A table keyed on the join key, containing pairs of joined values
 * @throws CrunchRuntimeException
 *           if the right-side table does not belong to an {@link MRPipeline},
 *           or if its materialized contents cannot be read from a path
 */
public static <K, U, V> PTable<K, Pair<U, V>> join(PTable<K, U> left, PTable<K, V> right) {
  if (!(right.getPipeline() instanceof MRPipeline)) {
    throw new CrunchRuntimeException("Map-side join is only supported within a MapReduce context");
  }
  MRPipeline mrPipeline = (MRPipeline) right.getPipeline();

  // Materialize the right side up front so its output can be shipped to the mappers.
  mrPipeline.materialize(right);
  // TODO Move necessary logic to MRPipeline so that we can theoretically
  // optimize this by running the setup of multiple map-side joins concurrently
  mrPipeline.run();

  ReadableSourceTarget<Pair<K, V>> materializedRight = mrPipeline.getMaterializeSourceTarget(right);
  if (!(materializedRight instanceof SourcePathTargetImpl)) {
    throw new CrunchRuntimeException("Right-side contents can't be read from a path");
  }

  // The instanceof check above guards this cast; only the generic parameters are unchecked.
  @SuppressWarnings("unchecked")
  SourcePathTargetImpl<Pair<K, V>> rightPathTarget = (SourcePathTargetImpl<Pair<K, V>>) materializedRight;
  Path rightPath = rightPathTarget.getPath();
  DistributedCache.addCacheFile(rightPath.toUri(), mrPipeline.getConfiguration());

  MapsideJoinDoFn<K, U, V> joinFn = new MapsideJoinDoFn<K, U, V>(rightPath.toString(), right.getPType());
  PTypeFamily leftFamily = left.getTypeFamily();
  return left.parallelDo(
      "mapjoin",
      joinFn,
      leftFamily.tableOf(
          left.getKeyType(),
          leftFamily.pairs(left.getValueType(), right.getValueType())));
}
Aggregations