Use of org.apache.crunch.impl.mr.MRPipeline in project crunch by Cloudera.
The class AggregateTest, method testWritables.
@Test
public void testWritables() throws Exception {
  Pipeline pipeline = new MRPipeline(AggregateTest.class);
  String shakesInputPath = FileHelper.createTempCopyOf("shakes.txt");
  PCollection<String> shakes = pipeline.readTextFile(shakesInputPath);
  runMinMax(shakes, WritableTypeFamily.getInstance());
  pipeline.done();
}
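The runMinMax helper is not shown on this page. A minimal sketch of what such a helper could look like, assuming Crunch's org.apache.crunch.lib.Aggregate and Guava's Iterables, with each line mapped to its length; the body is illustrative, not the project's actual implementation:

private static void runMinMax(PCollection<String> lines, PTypeFamily family) {
  // Map each line to its length so there is an ordered value to aggregate.
  PCollection<Integer> lengths = lines.parallelDo(new MapFn<String, Integer>() {
    @Override
    public Integer map(String input) {
      return input.length();
    }
  }, family.ints());
  // Aggregate.max and Aggregate.min each yield a single-element PCollection.
  Integer max = Iterables.getOnlyElement(Aggregate.max(lengths).materialize());
  Integer min = Iterables.getOnlyElement(Aggregate.min(lengths).materialize());
  assertTrue(min <= max);
}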
Use of org.apache.crunch.impl.mr.MRPipeline in project crunch by Cloudera.
The class WordCount, method run.
public int run(String[] args) throws Exception {
  if (args.length != 2) {
    System.err.println();
    System.err.println("Usage: " + this.getClass().getName() + " [generic options] input output");
    System.err.println();
    GenericOptionsParser.printGenericCommandUsage(System.err);
    return 1;
  }
  // Create an object to coordinate pipeline creation and execution.
  Pipeline pipeline = new MRPipeline(WordCount.class, getConf());
  // Reference a given text file as a collection of Strings.
  PCollection<String> lines = pipeline.readTextFile(args[0]);
  // Define a function that splits each line in a PCollection of Strings into a
  // PCollection made up of the individual words in the file.
  PCollection<String> words = lines.parallelDo(new DoFn<String, String>() {
    public void process(String line, Emitter<String> emitter) {
      for (String word : line.split("\\s+")) {
        emitter.emit(word);
      }
    }
  }, // Indicates the serialization format
  Writables.strings());
  // The count method applies a series of Crunch primitives and returns
  // a PTable mapping each unique word in the input PCollection to its count.
  // Best of all, the count() method doesn't need to know anything about
  // the kind of data stored in the input PCollection.
  PTable<String, Long> counts = words.count();
  // Instruct the pipeline to write the resulting counts to a text file.
  pipeline.writeTextFile(counts, args[1]);
  // Execute the pipeline as a MapReduce job.
  pipeline.done();
  return 0;
}
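run is designed to be invoked through Hadoop's ToolRunner, which consumes the generic options before run sees the arguments. A minimal sketch of a matching entry point, assuming WordCount extends Configured implements Tool (as the getConf() call suggests):

public static void main(String[] args) throws Exception {
  // ToolRunner parses generic options (-D, -conf, -fs, ...) into the
  // Configuration and passes only the remaining arguments to run().
  int exitCode = ToolRunner.run(new Configuration(), new WordCount(), args);
  System.exit(exitCode);
}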
Use of org.apache.crunch.impl.mr.MRPipeline in project crunch by Cloudera.
The class SpecificAvroGroupByTest, method testGrouByWithSpecificAvroType.
@Test
public void testGrouByWithSpecificAvroType() throws Exception {
  MRPipeline pipeline = new MRPipeline(SpecificAvroGroupByTest.class);
  testSpecificAvro(pipeline);
}
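The delegated testSpecificAvro helper is not shown here. A rough sketch of the shape such a test could take; Person and inputPath are hypothetical stand-ins (a generated Avro specific record class and a test file path), not names from the project:

private void testSpecificAvro(MRPipeline pipeline) throws Exception {
  // Read specific Avro records, then key them by name and group by key.
  PCollection<Person> people = pipeline.read(
      At.avroFile(new Path(inputPath), Avros.records(Person.class)));
  PTable<String, Person> byName = people.by(new MapFn<Person, String>() {
    @Override
    public String map(Person p) {
      return p.getName().toString(); // Person is a hypothetical Avro record
    }
  }, Avros.strings());
  // A real test would assert on the grouped contents before finishing.
  Iterable<Pair<String, Iterable<Person>>> grouped = byName.groupByKey().materialize();
  pipeline.done();
}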
Use of org.apache.crunch.impl.mr.MRPipeline in project crunch by Cloudera.
The class MapsideJoinTest, method testMapsideJoin_RightSideIsEmpty.
@Test
public void testMapsideJoin_RightSideIsEmpty() throws IOException {
  MRPipeline pipeline = new MRPipeline(MapsideJoinTest.class);
  PTable<Integer, String> customerTable = readTable(pipeline, "customers.txt");
  PTable<Integer, String> orderTable = readTable(pipeline, "orders.txt");
  PTable<Integer, String> filteredOrderTable = orderTable.parallelDo(new NegativeFilter(), orderTable.getPTableType());
  PTable<Integer, Pair<String, String>> joined = MapsideJoin.join(customerTable, filteredOrderTable);
  List<Pair<Integer, Pair<String, String>>> materializedJoin = Lists.newArrayList(joined.materialize());
  assertTrue(materializedJoin.isEmpty());
}
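NegativeFilter is plausibly a FilterFn that rejects every record, which is what leaves the right side of the join empty. A sketch consistent with that usage; the project's actual class may differ:

// A filter that accepts nothing, turning any table into an empty one.
static class NegativeFilter extends FilterFn<Pair<Integer, String>> {
  @Override
  public boolean accept(Pair<Integer, String> input) {
    return false;
  }
}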
Use of org.apache.crunch.impl.mr.MRPipeline in project crunch by Cloudera.
The class MapsTest, method run.
public static void run(PTypeFamily typeFamily) throws Exception {
  Pipeline pipeline = new MRPipeline(MapsTest.class);
  String shakesInputPath = FileHelper.createTempCopyOf("shakes.txt");
  PCollection<String> shakespeare = pipeline.readTextFile(shakesInputPath);
  // For each pair of adjacent words, emit the first letter of the left word
  // mapped to {first letter of the right word: 1}.
  Iterable<Pair<String, Map<String, Long>>> output = shakespeare.parallelDo(
      new DoFn<String, Pair<String, Map<String, Long>>>() {
        @Override
        public void process(String input, Emitter<Pair<String, Map<String, Long>>> emitter) {
          String last = null;
          for (String word : input.toLowerCase().split("\\W+")) {
            if (!word.isEmpty()) {
              String firstChar = word.substring(0, 1);
              if (last != null) {
                Map<String, Long> cc = ImmutableMap.of(firstChar, 1L);
                emitter.emit(Pair.of(last, cc));
              }
              last = firstChar;
            }
          }
        }
      }, typeFamily.tableOf(typeFamily.strings(), typeFamily.maps(typeFamily.longs())))
      .groupByKey()
      .combineValues(new CombineFn<String, Map<String, Long>>() {
        @Override
        public void process(Pair<String, Iterable<Map<String, Long>>> input,
            Emitter<Pair<String, Map<String, Long>>> emitter) {
          // Merge the per-pair maps by summing the counts for each key.
          Map<String, Long> agg = Maps.newHashMap();
          for (Map<String, Long> in : input.second()) {
            for (Map.Entry<String, Long> e : in.entrySet()) {
              if (!agg.containsKey(e.getKey())) {
                agg.put(e.getKey(), e.getValue());
              } else {
                agg.put(e.getKey(), e.getValue() + agg.get(e.getKey()));
              }
            }
          }
          emitter.emit(Pair.of(input.first(), agg));
        }
      }).materialize();
  boolean passed = false;
  for (Pair<String, Map<String, Long>> v : output) {
    // Compare with equals(), not ==: materialized strings are not interned.
    if ("k".equals(v.first()) && Long.valueOf(8L).equals(v.second().get("n"))) {
      passed = true;
      break;
    }
  }
  pipeline.done();
  assertTrue(passed);
}
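run is parameterized by PTypeFamily so the same pipeline logic can be exercised under both serialization frameworks. A sketch of the per-family test methods that would drive it; the method names are assumed, not taken from the project:

@Test
public void testWritables() throws Exception {
  run(WritableTypeFamily.getInstance());
}

@Test
public void testAvro() throws Exception {
  run(AvroTypeFamily.getInstance());
}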