Use of org.apache.crunch.impl.mr.MRPipeline in project crunch by cloudera: the run method of the class AverageBytesByIP.
public int run(String[] args) throws Exception {
  if (args.length != 2) {
    System.err.println();
    System.err.println("Two and only two arguments are accepted.");
    System.err.println("Usage: " + this.getClass().getName() + " [generic options] input output");
    System.err.println();
    GenericOptionsParser.printGenericCommandUsage(System.err);
    return 1;
  }
  // Create an object to coordinate pipeline creation and execution.
  Pipeline pipeline = new MRPipeline(AverageBytesByIP.class, getConf());
  // Reference a given text file as a collection of Strings.
  PCollection<String> lines = pipeline.readTextFile(args[0]);
  // Combiner used for summing up response size and count
  CombineFn<String, Pair<Long, Long>> stringPairOfLongsSumCombiner =
      CombineFn.pairAggregator(CombineFn.SUM_LONGS, CombineFn.SUM_LONGS);
  // Table of (ip, sum(response size), count)
  PTable<String, Pair<Long, Long>> remoteAddrResponseSize = lines
      .parallelDo(extractResponseSize,
          Writables.tableOf(Writables.strings(), Writables.pairs(Writables.longs(), Writables.longs())))
      .groupByKey()
      .combineValues(stringPairOfLongsSumCombiner);
  // Calculate average response size by ip address
  PTable<String, Double> avgs = remoteAddrResponseSize.parallelDo(calulateAverage,
      Writables.tableOf(Writables.strings(), Writables.doubles()));
  // write the result to a text file
  pipeline.writeTextFile(avgs, args[1]);
  // Execute the pipeline as a MapReduce.
  pipeline.done();
  return 0;
}
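The DoFn fields extractResponseSize and calulateAverage are defined elsewhere in AverageBytesByIP and are not shown in this snippet. Purely as an illustration, here is a minimal sketch of what a response-size extractor could look like, assuming Apache Common Log Format input; the regex, the capture-group indices, and the imports noted in the comment are assumptions, not the project's actual parser.

// Illustrative sketch only: emits (remoteAddr, (responseSize, 1)) for each parseable log line
// so the pair combiner above can sum both the byte total and the line count.
// (imports assumed: org.apache.crunch.DoFn, Emitter, Pair; java.util.regex.Pattern, Matcher)
DoFn<String, Pair<String, Pair<Long, Long>>> extractResponseSize =
    new DoFn<String, Pair<String, Pair<Long, Long>>>() {
      private transient Pattern pattern;

      @Override
      public void initialize() {
        // Assumed Common Log Format: group 1 = remote address, group 7 = response size in bytes
        pattern = Pattern.compile("^(\\S+) (\\S+) (\\S+) \\[([^\\]]+)\\] \"([^\"]*)\" (\\d{3}) (\\S+)");
      }

      @Override
      public void process(String line, Emitter<Pair<String, Pair<Long, Long>>> emitter) {
        Matcher matcher = pattern.matcher(line);
        if (matcher.find() && !"-".equals(matcher.group(7))) {
          emitter.emit(Pair.of(matcher.group(1), Pair.of(Long.parseLong(matcher.group(7)), 1L)));
        }
      }
    };

A matching calulateAverage function would then divide the summed byte count by the summed line count for each key to produce the per-IP average.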
Use of org.apache.crunch.impl.mr.MRPipeline in project crunch by cloudera: the run method of the class TotalBytesByIP.
public int run(String[] args) throws Exception {
  if (args.length != 2) {
    System.err.println();
    System.err.println("Two and only two arguments are accepted.");
    System.err.println("Usage: " + this.getClass().getName() + " [generic options] input output");
    System.err.println();
    GenericOptionsParser.printGenericCommandUsage(System.err);
    return 1;
  }
  // Create an object to coordinate pipeline creation and execution.
  Pipeline pipeline = new MRPipeline(TotalBytesByIP.class, getConf());
  // Reference a given text file as a collection of Strings.
  PCollection<String> lines = pipeline.readTextFile(args[0]);
  // Combiner used for summing up response size
  CombineFn<String, Long> longSumCombiner = CombineFn.SUM_LONGS();
  // Table of (ip, sum(response size))
  PTable<String, Long> ipAddrResponseSize = lines
      .parallelDo(extractIPResponseSize, Writables.tableOf(Writables.strings(), Writables.longs()))
      .groupByKey()
      .combineValues(longSumCombiner);
  pipeline.writeTextFile(ipAddrResponseSize, args[1]);
  // Execute the pipeline as a MapReduce.
  pipeline.done();
  return 0;
}
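Both run methods follow the Hadoop Tool contract (they call getConf() and advertise [generic options] in their usage message), so they would normally be launched through ToolRunner. A minimal sketch of such an entry point, assuming TotalBytesByIP extends Configured and implements Tool (imports assumed: org.apache.hadoop.conf.Configuration, org.apache.hadoop.util.ToolRunner):

// Hypothetical main method; ToolRunner parses the generic Hadoop options
// before handing the remaining input/output arguments to run().
public static void main(String[] args) throws Exception {
  int exitCode = ToolRunner.run(new Configuration(), new TotalBytesByIP(), args);
  System.exit(exitCode);
}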
Use of org.apache.crunch.impl.mr.MRPipeline in project crunch by cloudera: the testJoin method of the class MultiAvroSchemaJoinTest.
@Test
public void testJoin() throws Exception {
  Pipeline p = new MRPipeline(MultiAvroSchemaJoinTest.class);
  PCollection<Person> people = p.read(From.avroFile(personFile.getAbsolutePath(), records(Person.class)));
  PCollection<Employee> employees = p.read(From.avroFile(employeeFile.getAbsolutePath(), records(Employee.class)));
  Iterable<Pair<Person, Employee>> result = people.by(new NameFn<Person>(), strings())
      .join(employees.by(new NameFn<Employee>(), strings()))
      .values()
      .materialize();
  List<Pair<Person, Employee>> v = Lists.newArrayList(result);
  assertEquals(1, v.size());
  assertEquals("Kate", v.get(0).first().getName().toString());
  assertEquals("Kate", v.get(0).second().getName().toString());
}
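The NameFn used to key both collections is a helper defined in the test class and is not shown here. A minimal sketch of how such a key extractor could be written, assuming Person and Employee are Avro specific records that each carry a name field; the schema lookup below is an illustrative assumption, not necessarily the test's implementation.

// Illustrative sketch: keys any Avro specific record by its "name" field,
// so the same MapFn type works for both Person and Employee.
// (imports assumed: org.apache.avro.Schema, org.apache.avro.specific.SpecificRecord, org.apache.crunch.MapFn)
private static class NameFn<T extends SpecificRecord> extends MapFn<T, String> {
  @Override
  public String map(T input) {
    Schema.Field nameField = input.getSchema().getField("name");
    return input.get(nameField.pos()).toString();
  }
}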
Use of org.apache.crunch.impl.mr.MRPipeline in project crunch by cloudera: the testGrouByOnSpecificAvroButReflectionDatumReader method of the class SpecificAvroGroupByTest.
@Test
public void testGrouByOnSpecificAvroButReflectionDatumReader() throws Exception {
  MRPipeline pipeline = new MRPipeline(SpecificAvroGroupByTest.class);
  // https://issues.apache.org/jira/browse/AVRO-1046 resolves the ClassCastException
  // when reading specific Avro types with ReflectDatumReader
  pipeline.getConfiguration().setBoolean(AvroJob.MAP_OUTPUT_IS_REFLECT, true);
  testSpecificAvro(pipeline);
}
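The helper testSpecificAvro(pipeline) holds the actual group-by logic and is not shown in this snippet. Purely to illustrate the pattern being exercised, here is a hedged sketch of grouping Avro specific records by a string key, reusing a key function like the NameFn sketched above; the record type and file path are placeholders, not the test's real inputs (imports assumed: org.apache.crunch.io.From, org.apache.crunch.types.avro.Avros).

// Hypothetical sketch, not the body of testSpecificAvro: read specific Avro records,
// key them by name, and group by that key.
PCollection<Person> people = pipeline.read(
    From.avroFile("/tmp/people.avro", Avros.records(Person.class)));
PGroupedTable<String, Person> groupedByName =
    people.by(new NameFn<Person>(), Avros.strings()).groupByKey();
pipeline.done();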