use of org.apache.crunch.TupleN in project crunch by cloudera.
the class SortTest method runTupleN.
/**
 * Reads a temp copy of "docs.txt", splits every line on runs of tab
 * characters into a {@link TupleN}, sorts the tuples with the given column
 * orders, and asserts that the first tuple of the sorted output matches
 * {@code fields} position by position.
 *
 * @param pipeline the pipeline used to read, sort and materialize the data
 * @param typeFamily the type family supplying the string and tuple types
 * @param orders the column orders handed to {@link Sort#sortTuples}
 * @param fields the expected leading values of the first sorted tuple
 * @throws IOException declared for the file/pipeline helpers used here
 */
private void runTupleN(Pipeline pipeline, PTypeFamily typeFamily, ColumnOrder[] orders, String[] fields) throws IOException {
    String docsPath = FileHelper.createTempCopyOf("docs.txt");
    PCollection<String> rawLines = pipeline.readTextFile(docsPath);

    // One string-typed column per requested column order.
    PType[] columnTypes = new PType[orders.length];
    Arrays.fill(columnTypes, typeFamily.strings());

    DoFn<String, TupleN> splitOnTabs = new DoFn<String, TupleN>() {
        @Override
        public void process(String line, Emitter<TupleN> emitter) {
            emitter.emit(new TupleN(line.split("[\t]+")));
        }
    };
    PCollection<TupleN> tuples = rawLines.parallelDo(splitOnTabs, typeFamily.tuples(columnTypes));

    // Only the first element of the sorted output is inspected.
    TupleN firstSorted = Sort.sortTuples(tuples, orders).materialize().iterator().next();
    for (int col = 0; col < fields.length; col++) {
        assertEquals(fields[col], firstSorted.get(col));
    }
    pipeline.done();
}
use of org.apache.crunch.TupleN in project crunch by cloudera.
the class WritablesTest method testTupleN.
/**
 * Hands a five-string {@link TupleN} and a {@link TupleWritable} holding the
 * same five values (all positions marked as written) to
 * {@code testInputOutputFn} together with the matching five-string tuple type.
 */
@Test
public void testTupleN() throws Exception {
    TupleN tuple = new TupleN("a", "b", "c", "d", "e");

    Text[] texts = { new Text("a"), new Text("b"), new Text("c"), new Text("d"), new Text("e") };
    TupleWritable writable = new TupleWritable(texts);
    // Mark every position (0..4) as written.
    for (int pos = 0; pos < texts.length; pos++) {
        writable.setWritten(pos);
    }

    WritableType<?, ?> tupleType = Writables.tuples(
        Writables.strings(),
        Writables.strings(),
        Writables.strings(),
        Writables.strings(),
        Writables.strings());
    testInputOutputFn(tupleType, tuple, writable);
}
use of org.apache.crunch.TupleN in project crunch by cloudera.
the class AvrosTest method testTupleN.
/**
 * Hands a five-string {@link TupleN} and a {@link GenericData.Record} holding
 * the same five values as {@link Utf8} to {@code testInputOutputFn} together
 * with the matching five-string Avro tuple type.
 */
@Test
@SuppressWarnings("rawtypes")
public void testTupleN() throws Exception {
    AvroType tupleType = Avros.tuples(
        Avros.strings(),
        Avros.strings(),
        Avros.strings(),
        Avros.strings(),
        Avros.strings());
    TupleN tuple = new TupleN("a", "b", "c", "d", "e");

    // Fill record positions 0..4 with the Utf8 form of each expected value.
    GenericData.Record avroRecord = new GenericData.Record(tupleType.getSchema());
    String[] letters = { "a", "b", "c", "d", "e" };
    for (int pos = 0; pos < letters.length; pos++) {
        avroRecord.put(pos, new Utf8(letters[pos]));
    }

    testInputOutputFn(tupleType, tuple, avroRecord);
}
use of org.apache.crunch.TupleN in project crunch by cloudera.
the class Sort method sortTuples.
/**
 * Orders a {@link PCollection} of {@link TupleN}s according to the supplied
 * column ordering.
 *
 * <p>Each tuple is paired with a null value to form a table keyed by the
 * tuple, the table is grouped by key using grouping options built from
 * {@code columnOrders} and then ungrouped, and finally the keys are emitted
 * again as a plain collection with the input collection's type.
 *
 * @param collection the tuples to sort
 * @param columnOrders the column ordering passed on to the grouping options
 * @return a {@link PCollection} representing the sorted collection.
 */
public static PCollection<TupleN> sortTuples(PCollection<TupleN> collection, ColumnOrder... columnOrders) {
    PTypeFamily family = collection.getTypeFamily();
    PType<TupleN> tupleType = collection.getPType();

    // Table type keyed by a tuple built from the input's sub-types, with null values.
    PType[] columnTypes = tupleType.getSubTypes().toArray(new PType[0]);
    PTableType<TupleN, Void> keyedType = family.tableOf(family.tuples(columnTypes), family.nulls());

    DoFn<TupleN, Pair<TupleN, Void>> tupleToKey = new DoFn<TupleN, Pair<TupleN, Void>>() {
        @Override
        public void process(TupleN tuple, Emitter<Pair<TupleN, Void>> out) {
            Void noValue = null;
            out.emit(Pair.of(tuple, noValue));
        }
    };
    PTable<TupleN, Void> keyed = collection.parallelDo(tupleToKey, keyedType);

    Configuration configuration = collection.getPipeline().getConfiguration();
    GroupingOptions sortOptions = buildGroupingOptions(configuration, family, tupleType, columnOrders);
    PTable<TupleN, Void> ordered = keyed.groupByKey(sortOptions).ungroup();

    // Drop the placeholder value and hand back just the tuples.
    DoFn<Pair<TupleN, Void>, TupleN> keyToTuple = new DoFn<Pair<TupleN, Void>, TupleN>() {
        @Override
        public void process(Pair<TupleN, Void> entry, Emitter<TupleN> out) {
            out.emit(entry.first());
        }
    };
    return ordered.parallelDo(keyToTuple, collection.getPType());
}
Aggregations