Use of org.apache.crunch.MapFn in project crunch by cloudera.
From the class TextFileReaderFactory, the read method:
@Override
public Iterator<T> read(FileSystem fs, Path path) {
  MapFn mapFn = null;
  if (String.class.equals(ptype.getTypeClass())) {
    mapFn = IdentityFn.getInstance();
  } else {
    // Check for a composite MapFn for the PType.
    // Note that this won't work for Avro -- need to solve that.
    MapFn input = ptype.getInputMapFn();
    if (input instanceof CompositeMapFn) {
      mapFn = ((CompositeMapFn) input).getSecond();
    }
  }
  mapFn.setConfigurationForTest(conf);
  mapFn.initialize();
  FSDataInputStream is = null;
  try {
    is = fs.open(path);
  } catch (IOException e) {
    LOG.info("Could not read path: " + path, e);
    return Iterators.emptyIterator();
  }
  final BufferedReader reader = new BufferedReader(new InputStreamReader(is));
  final MapFn<String, T> iterMapFn = mapFn;
  return new UnmodifiableIterator<T>() {

    private String nextLine;

    @Override
    public boolean hasNext() {
      try {
        return (nextLine = reader.readLine()) != null;
      } catch (IOException e) {
        LOG.info("Exception reading text file stream", e);
        return false;
      }
    }

    @Override
    public T next() {
      return iterMapFn.map(nextLine);
    }
  };
}
Use of org.apache.crunch.MapFn in project crunch by cloudera.
From the class Writables, the derived method:
public static <S, T> PType<T> derived(Class<T> clazz, MapFn<S, T> inputFn, MapFn<T, S> outputFn, PType<S> base) {
  WritableType<S, ?> wt = (WritableType<S, ?>) base;
  MapFn input = new CompositeMapFn(wt.getInputMapFn(), inputFn);
  MapFn output = new CompositeMapFn(outputFn, wt.getOutputMapFn());
  return new WritableType(clazz, wt.getSerializationClass(), input, output,
      base.getSubTypes().toArray(new PType[0]));
}
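derived composes the caller's conversion functions with the base type's own MapFns, so the derived PType still serializes through the base type's Writable representation. A hedged usage sketch (UserId is a hypothetical value class, not part of Crunch):

// Hypothetical: a UserId wrapper serialized via the built-in strings() PType.
PType<UserId> userIdType = Writables.derived(UserId.class,
    new MapFn<String, UserId>() {   // input: deserialized String -> UserId

      @Override
      public UserId map(String s) {
        return new UserId(s);
      }
    },
    new MapFn<UserId, String>() {   // output: UserId -> String for serialization

      @Override
      public String map(UserId id) {
        return id.toString();
      }
    },
    Writables.strings());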
Use of org.apache.crunch.MapFn in project crunch by cloudera.
From the class AvroTypeSortTest, the testSortAvroTypesBySelectedFields method:
@Test
public void testSortAvroTypesBySelectedFields() throws Exception {
  MRPipeline pipeline = new MRPipeline(AvroTypeSortTest.class);
  Person ccc10 = createPerson("CCC", 10);
  Person bbb20 = createPerson("BBB", 20);
  Person aaa30 = createPerson("AAA", 30);
  writeAvroFile(Lists.newArrayList(ccc10, bbb20, aaa30), avroFile);
  PCollection<Person> unsorted = pipeline.read(At.avroFile(avroFile.getAbsolutePath(), records(Person.class)));
  // Sort by Name
  MapFn<Person, String> nameExtractor = new MapFn<Person, String>() {

    @Override
    public String map(Person input) {
      return input.getName().toString();
    }
  };
  PCollection<Person> sortedByName = unsorted.by(nameExtractor, strings()).groupByKey().ungroup().values();
  List<Person> sortedByNameList = Lists.newArrayList(sortedByName.materialize());
  assertEquals(3, sortedByNameList.size());
  assertEquals(aaa30, sortedByNameList.get(0));
  assertEquals(bbb20, sortedByNameList.get(1));
  assertEquals(ccc10, sortedByNameList.get(2));
  // Sort by Age
  MapFn<Person, Integer> ageExtractor = new MapFn<Person, Integer>() {

    @Override
    public Integer map(Person input) {
      return input.getAge();
    }
  };
  PCollection<Person> sortedByAge = unsorted.by(ageExtractor, ints()).groupByKey().ungroup().values();
  List<Person> sortedByAgeList = Lists.newArrayList(sortedByAge.materialize());
  assertEquals(3, sortedByAgeList.size());
  assertEquals(ccc10, sortedByAgeList.get(0));
  assertEquals(bbb20, sortedByAgeList.get(1));
  assertEquals(aaa30, sortedByAgeList.get(2));
  pipeline.done();
}
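The test sorts by keying the collection on the field of interest, grouping (the shuffle orders records by key), and then discarding the key again. A generic helper capturing that idiom might look like this (sortBy is a hypothetical name, not a Crunch API; a total order across the whole output assumes a single reducer, as in this local test):

import org.apache.crunch.MapFn;
import org.apache.crunch.PCollection;
import org.apache.crunch.types.PType;

// Hypothetical helper: sort a PCollection by an extracted key,
// relying on the shuffle to order the grouped keys.
static <K, V> PCollection<V> sortBy(PCollection<V> in, MapFn<V, K> keyFn, PType<K> keyType) {
  return in.by(keyFn, keyType).groupByKey().ungroup().values();
}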
Use of org.apache.crunch.MapFn in project cdk-examples by cloudera.
From the class CreateSessions, the run method:
@Override
public int run(String[] args) throws Exception {
  // Construct an HDFS-backed dataset repository rooted at /tmp/data
  DatasetRepository fsRepo = DatasetRepositories.open("repo:hdfs:/tmp/data");
  // Construct an HCatalog dataset repository using external Hive tables
  DatasetRepository hcatRepo = DatasetRepositories.open("repo:hive:/tmp/data");
  // Turn debug on while in development.
  getPipeline().enableDebug();
  getPipeline().getConfiguration().set("crunch.log.job.progress", "true");
  // Load the events dataset and get the correct partition to sessionize
  Dataset<StandardEvent> eventsDataset = fsRepo.load("events");
  Dataset<StandardEvent> partition;
  if (args.length == 0 || (args.length == 1 && args[0].equals("LATEST"))) {
    partition = getLatestPartition(eventsDataset);
  } else {
    partition = getPartitionForURI(eventsDataset, args[0]);
  }
  // Create a parallel collection from the working partition
  PCollection<StandardEvent> events = read(CrunchDatasets.asSource(partition, StandardEvent.class));
  // Process the events into sessions, using a combiner
  PCollection<Session> sessions = events
      // Wrap each event in a single-event Session
      .parallelDo(new DoFn<StandardEvent, Session>() {

        @Override
        public void process(StandardEvent event, Emitter<Session> emitter) {
          emitter.emit(Session.newBuilder()
              .setUserId(event.getUserId())
              .setSessionId(event.getSessionId())
              .setIp(event.getIp())
              .setStartTimestamp(event.getTimestamp())
              .setDuration(0)
              .setSessionEventCount(1)
              .build());
        }
      }, Avros.specifics(Session.class))
      // Key each session by (userId, sessionId)
      .by(new MapFn<Session, Pair<Long, String>>() {

        @Override
        public Pair<Long, String> map(Session session) {
          return Pair.of(session.getUserId(), session.getSessionId());
        }
      }, Avros.pairs(Avros.longs(), Avros.strings()))
      .groupByKey()
      // Merge the per-event sessions for each key into one Session
      .combineValues(new CombineFn<Pair<Long, String>, Session>() {

        @Override
        public void process(Pair<Pair<Long, String>, Iterable<Session>> pairIterable,
            Emitter<Pair<Pair<Long, String>, Session>> emitter) {
          String ip = null;
          long startTimestamp = Long.MAX_VALUE;
          long endTimestamp = Long.MIN_VALUE;
          int sessionEventCount = 0;
          for (Session s : pairIterable.second()) {
            ip = s.getIp();
            startTimestamp = Math.min(startTimestamp, s.getStartTimestamp());
            endTimestamp = Math.max(endTimestamp, s.getStartTimestamp() + s.getDuration());
            sessionEventCount += s.getSessionEventCount();
          }
          emitter.emit(Pair.of(pairIterable.first(), Session.newBuilder()
              .setUserId(pairIterable.first().first())
              .setSessionId(pairIterable.first().second())
              .setIp(ip)
              .setStartTimestamp(startTimestamp)
              .setDuration(endTimestamp - startTimestamp)
              .setSessionEventCount(sessionEventCount)
              .build()));
        }
      })
      // Drop the key, keeping only the merged Session
      .parallelDo(new DoFn<Pair<Pair<Long, String>, Session>, Session>() {

        @Override
        public void process(Pair<Pair<Long, String>, Session> pairSession, Emitter<Session> emitter) {
          emitter.emit(pairSession.second());
        }
      }, Avros.specifics(Session.class));
  // Write the sessions to the "sessions" Dataset
  getPipeline().write(sessions, CrunchDatasets.asTarget(hcatRepo.load("sessions")), Target.WriteMode.APPEND);
  return run().succeeded() ? 0 : 1;
}
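Since the snippet calls getPipeline() and read(...), the class presumably extends Crunch's CrunchTool, which implements Hadoop's Tool interface. A minimal sketch of how such a tool might be launched (this main method is an assumption, not shown in the source):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.util.ToolRunner;

public static void main(String[] args) throws Exception {
  // ToolRunner parses generic Hadoop options, then invokes run(String[]).
  int rc = ToolRunner.run(new Configuration(), new CreateSessions(), args);
  System.exit(rc);
}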