Use of org.apache.crunch.Pipeline in project crunch by cloudera.
The class AverageBytesByIP, method run:
public int run(String[] args) throws Exception {
  if (args.length != 2) {
    System.err.println();
    System.err.println("Two and only two arguments are accepted.");
    System.err.println("Usage: " + this.getClass().getName() + " [generic options] input output");
    System.err.println();
    GenericOptionsParser.printGenericCommandUsage(System.err);
    return 1;
  }
  // Create an object to coordinate pipeline creation and execution.
  Pipeline pipeline = new MRPipeline(AverageBytesByIP.class, getConf());
  // Reference a given text file as a collection of Strings.
  PCollection<String> lines = pipeline.readTextFile(args[0]);
  // Combiner used for summing up response size and count.
  CombineFn<String, Pair<Long, Long>> stringPairOfLongsSumCombiner =
      CombineFn.pairAggregator(CombineFn.SUM_LONGS, CombineFn.SUM_LONGS);
  // Table of (ip, (sum(response size), count)).
  PTable<String, Pair<Long, Long>> remoteAddrResponseSize = lines
      .parallelDo(extractResponseSize,
          Writables.tableOf(Writables.strings(), Writables.pairs(Writables.longs(), Writables.longs())))
      .groupByKey()
      .combineValues(stringPairOfLongsSumCombiner);
  // Calculate the average response size by IP address.
  PTable<String, Double> avgs = remoteAddrResponseSize.parallelDo(calulateAverage,
      Writables.tableOf(Writables.strings(), Writables.doubles()));
  // Write the result to a text file.
  pipeline.writeTextFile(avgs, args[1]);
  // Execute the pipeline as a MapReduce job.
  pipeline.done();
  return 0;
}
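The run method above refers to two functions, extractResponseSize and calulateAverage, that are defined elsewhere in the AverageBytesByIP class and are not shown in this snippet. The following is only a rough sketch of what such functions could look like, assuming space-delimited access-log lines with the client IP in the first field and the response size in the last field; it is an illustration, not the project's actual code.

// Hypothetical sketch, not the code from AverageBytesByIP.
// Requires org.apache.crunch.DoFn, org.apache.crunch.MapFn, org.apache.crunch.Emitter, org.apache.crunch.Pair.
// Emits (remote IP, (response size, 1)) for each parseable line.
DoFn<String, Pair<String, Pair<Long, Long>>> extractResponseSizeSketch =
    new DoFn<String, Pair<String, Pair<Long, Long>>>() {
      @Override
      public void process(String line, Emitter<Pair<String, Pair<Long, Long>>> emitter) {
        String[] fields = line.split(" ");
        try {
          long size = Long.parseLong(fields[fields.length - 1]);
          emitter.emit(Pair.of(fields[0], Pair.of(size, 1L)));
        } catch (NumberFormatException e) {
          // Skip lines without a numeric response size (e.g. a "-" placeholder).
        }
      }
    };

// Divides the summed response size by the count to get the average per IP.
MapFn<Pair<String, Pair<Long, Long>>, Pair<String, Double>> calculateAverageSketch =
    new MapFn<Pair<String, Pair<Long, Long>>, Pair<String, Double>>() {
      @Override
      public Pair<String, Double> map(Pair<String, Pair<Long, Long>> input) {
        Pair<Long, Long> sumAndCount = input.second();
        return Pair.of(input.first(), (double) sumAndCount.first() / sumAndCount.second());
      }
    };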
Use of org.apache.crunch.Pipeline in project crunch by cloudera.
The class TotalBytesByIP, method run:
public int run(String[] args) throws Exception {
  if (args.length != 2) {
    System.err.println();
    System.err.println("Two and only two arguments are accepted.");
    System.err.println("Usage: " + this.getClass().getName() + " [generic options] input output");
    System.err.println();
    GenericOptionsParser.printGenericCommandUsage(System.err);
    return 1;
  }
  // Create an object to coordinate pipeline creation and execution.
  Pipeline pipeline = new MRPipeline(TotalBytesByIP.class, getConf());
  // Reference a given text file as a collection of Strings.
  PCollection<String> lines = pipeline.readTextFile(args[0]);
  // Combiner used for summing up response sizes.
  CombineFn<String, Long> longSumCombiner = CombineFn.SUM_LONGS();
  // Table of (ip, sum(response size)).
  PTable<String, Long> ipAddrResponseSize = lines
      .parallelDo(extractIPResponseSize, Writables.tableOf(Writables.strings(), Writables.longs()))
      .groupByKey()
      .combineValues(longSumCombiner);
  pipeline.writeTextFile(ipAddrResponseSize, args[1]);
  // Execute the pipeline as a MapReduce job.
  pipeline.done();
  return 0;
}
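As above, extractIPResponseSize is a DoFn defined elsewhere in TotalBytesByIP and not shown here. A minimal sketch, under the same assumed log-line layout (IP in the first field, response size in the last), might look like the following; it is not the project's actual implementation.

// Hypothetical sketch, not the code from TotalBytesByIP.
// Requires org.apache.crunch.DoFn, org.apache.crunch.Emitter, org.apache.crunch.Pair.
// Emits (remote IP, response size) for each parseable line.
DoFn<String, Pair<String, Long>> extractIPResponseSizeSketch =
    new DoFn<String, Pair<String, Long>>() {
      @Override
      public void process(String line, Emitter<Pair<String, Long>> emitter) {
        String[] fields = line.split(" ");
        try {
          emitter.emit(Pair.of(fields[0], Long.parseLong(fields[fields.length - 1])));
        } catch (NumberFormatException e) {
          // Skip lines without a numeric response size.
        }
      }
    };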
Use of org.apache.crunch.Pipeline in project crunch by cloudera.
The class MultiAvroSchemaJoinTest, method testJoin:
@Test
public void testJoin() throws Exception {
  Pipeline p = new MRPipeline(MultiAvroSchemaJoinTest.class);
  PCollection<Person> people = p.read(From.avroFile(personFile.getAbsolutePath(), records(Person.class)));
  PCollection<Employee> employees = p.read(From.avroFile(employeeFile.getAbsolutePath(), records(Employee.class)));
  Iterable<Pair<Person, Employee>> result = people.by(new NameFn<Person>(), strings())
      .join(employees.by(new NameFn<Employee>(), strings()))
      .values()
      .materialize();
  List<Pair<Person, Employee>> v = Lists.newArrayList(result);
  assertEquals(1, v.size());
  assertEquals("Kate", v.get(0).first().getName().toString());
  assertEquals("Kate", v.get(0).second().getName().toString());
}
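NameFn is not shown in the snippet. Because Person and Employee are different Avro record types, the key-extraction function has to work generically; one way to sketch it, assuming both schemas contain a string "name" field, is via the SpecificRecord interface. This is an illustration, not necessarily the test's own definition.

// Hypothetical sketch; assumes Person and Employee are Avro SpecificRecords whose schemas have a "name" field.
// Requires org.apache.avro.Schema, org.apache.avro.specific.SpecificRecord, org.apache.crunch.MapFn.
static class NameFn<T extends SpecificRecord> extends MapFn<T, String> {
  @Override
  public String map(T input) {
    // Look up the "name" field by position so the same MapFn works for both record types.
    Schema.Field nameField = input.getSchema().getField("name");
    return input.get(nameField.pos()).toString();
  }
}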
Use of org.apache.crunch.Pipeline in project crunch by cloudera.
The class AggregateTest, method testCollectValues_Avro:
@Test
public void testCollectValues_Avro() throws IOException {
  MapStringToEmployeePair mapFn = new MapStringToEmployeePair();
  Pipeline pipeline = new MRPipeline(AggregateTest.class);
  Map<Integer, Collection<Employee>> collectionMap = pipeline
      .readTextFile(FileHelper.createTempCopyOf("set2.txt"))
      .parallelDo(mapFn, Avros.tableOf(Avros.ints(), Avros.records(Employee.class)))
      .collectValues()
      .materializeToMap();
  assertEquals(1, collectionMap.size());
  Employee empC = mapFn.map("c").second();
  Employee empD = mapFn.map("d").second();
  Employee empA = mapFn.map("a").second();
  assertEquals(Lists.newArrayList(empC, empD, empA), collectionMap.get(1));
}
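MapStringToEmployeePair is also defined outside this snippet. From the assertion on collectionMap.get(1) one can infer that it maps every input line to the fixed key 1 paired with an Employee built from that line; the sketch below illustrates that shape, assuming the Avro-generated Employee class exposes a settable name field. It is not the test's actual helper.

// Hypothetical sketch, not the test's actual MapStringToEmployeePair.
// Requires org.apache.crunch.MapFn and org.apache.crunch.Pair.
static class MapStringToEmployeePairSketch extends MapFn<String, Pair<Integer, Employee>> {
  @Override
  public Pair<Integer, Employee> map(String input) {
    // Assumption: Employee has a settable "name" field; any other required
    // schema fields would need values as well.
    Employee emp = new Employee();
    emp.setName(input);
    // Fixed key of 1 so that collectValues() groups everything under one entry.
    return Pair.of(1, emp);
  }
}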
Use of org.apache.crunch.Pipeline in project crunch by cloudera.
The class AggregateTest, method testAvro:
@Test
public void testAvro() throws Exception {
  Pipeline pipeline = new MRPipeline(AggregateTest.class);
  String shakesInputPath = FileHelper.createTempCopyOf("shakes.txt");
  PCollection<String> shakes = pipeline.readTextFile(shakesInputPath);
  runMinMax(shakes, AvroTypeFamily.getInstance());
  pipeline.done();
}
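runMinMax is a helper defined elsewhere in AggregateTest; it takes a PTypeFamily so the same logic can be parameterized by type family. The sketch below shows one way such a helper could be structured: map each line to its length using the supplied family's int type, then materialize the result and check min/max on the client. It is an assumption-laden illustration, not the test's real body; Crunch also provides distributed min/max helpers in org.apache.crunch.lib.Aggregate.

// Hypothetical sketch of a runMinMax-style helper, not the test's actual implementation.
// Requires org.apache.crunch.MapFn, org.apache.crunch.PCollection, org.apache.crunch.types.PTypeFamily,
// and JUnit's assertTrue.
private static void runMinMaxSketch(PCollection<String> lines, PTypeFamily family) {
  // Map each line to its length, using the int type from the supplied type family.
  PCollection<Integer> lengths = lines.parallelDo(new MapFn<String, Integer>() {
    @Override
    public Integer map(String input) {
      return input.length();
    }
  }, family.ints());
  // Materialize the lengths and compute min/max on the client side.
  int max = Integer.MIN_VALUE;
  int min = Integer.MAX_VALUE;
  for (Integer len : lengths.materialize()) {
    max = Math.max(max, len);
    min = Math.min(min, len);
  }
  assertTrue(min <= max);
}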