Use of org.apache.crunch.CombineFn in project crunch by Cloudera.
Class Aggregate, method min.
* Returns the smallest numerical element from the input collection.
public static <S> PCollection<S> min(PCollection<S> collect) {
Class<S> clazz = collect.getPType().getTypeClass();
if (!clazz.isPrimitive() && !Comparable.class.isAssignableFrom(clazz)) {
throw new IllegalArgumentException("Can only get min for Comparable elements, not for: " + collect.getPType().getTypeClass());
PTypeFamily tf = collect.getTypeFamily();
return PTables.values(collect.parallelDo("min", new DoFn<S, Pair<Boolean, S>>() {
private transient S min = null;
public void process(S input, Emitter<Pair<Boolean, S>> emitter) {
if (min == null || ((Comparable<S>) min).compareTo(input) > 0) {
min = input;
public void cleanup(Emitter<Pair<Boolean, S>> emitter) {
if (min != null) {
emitter.emit(Pair.of(false, min));
}, tf.tableOf(tf.booleans(), collect.getPType())).groupByKey().combineValues(new CombineFn<Boolean, S>() {
public void process(Pair<Boolean, Iterable<S>> input, Emitter<Pair<Boolean, S>> emitter) {
S min = null;
for (S v : input.second()) {
if (min == null || ((Comparable<S>) min).compareTo(v) > 0) {
min = v;
emitter.emit(Pair.of(input.first(), min));
Use of org.apache.crunch.CombineFn in project crunch by Cloudera.
Class Aggregate, method max.
* Returns the largest numerical element from the input collection.
public static <S> PCollection<S> max(PCollection<S> collect) {
Class<S> clazz = collect.getPType().getTypeClass();
if (!clazz.isPrimitive() && !Comparable.class.isAssignableFrom(clazz)) {
throw new IllegalArgumentException("Can only get max for Comparable elements, not for: " + collect.getPType().getTypeClass());
PTypeFamily tf = collect.getTypeFamily();
return PTables.values(collect.parallelDo("max", new DoFn<S, Pair<Boolean, S>>() {
private transient S max = null;
public void process(S input, Emitter<Pair<Boolean, S>> emitter) {
if (max == null || ((Comparable<S>) max).compareTo(input) < 0) {
max = input;
public void cleanup(Emitter<Pair<Boolean, S>> emitter) {
if (max != null) {
emitter.emit(Pair.of(true, max));
}, tf.tableOf(tf.booleans(), collect.getPType())).groupByKey(1).combineValues(new CombineFn<Boolean, S>() {
public void process(Pair<Boolean, Iterable<S>> input, Emitter<Pair<Boolean, S>> emitter) {
S max = null;
for (S v : input.second()) {
if (max == null || ((Comparable<S>) max).compareTo(v) < 0) {
max = v;
emitter.emit(Pair.of(input.first(), max));
Use of org.apache.crunch.CombineFn in project cdk-examples by Cloudera.
Class CreateSessions, method run.
public int run(String[] args) throws Exception {
// Construct a local filesystem dataset repository rooted at /tmp/data
DatasetRepository fsRepo ="repo:hdfs:/tmp/data");
// Construct an HCatalog dataset repository using external Hive tables
DatasetRepository hcatRepo ="repo:hive:/tmp/data");
// Turn debug on while in development.
getPipeline().getConfiguration().set("crunch.log.job.progress", "true");
// Load the events dataset and get the correct partition to sessionize
Dataset<StandardEvent> eventsDataset = fsRepo.load("events");
Dataset<StandardEvent> partition;
if (args.length == 0 || (args.length == 1 && args[0].equals("LATEST"))) {
partition = getLatestPartition(eventsDataset);
} else {
partition = getPartitionForURI(eventsDataset, args[0]);
// Create a parallel collection from the working partition
PCollection<StandardEvent> events = read(CrunchDatasets.asSource(partition, StandardEvent.class));
// Process the events into sessions, using a combiner
PCollection<Session> sessions = events.parallelDo(new DoFn<StandardEvent, Session>() {
public void process(StandardEvent event, Emitter<Session> emitter) {
}, Avros.specifics(Session.class)).by(new MapFn<Session, Pair<Long, String>>() {
public Pair<Long, String> map(Session session) {
return Pair.of(session.getUserId(), session.getSessionId());
}, Avros.pairs(Avros.longs(), Avros.strings())).groupByKey().combineValues(new CombineFn<Pair<Long, String>, Session>() {
public void process(Pair<Pair<Long, String>, Iterable<Session>> pairIterable, Emitter<Pair<Pair<Long, String>, Session>> emitter) {
String ip = null;
long startTimestamp = Long.MAX_VALUE;
long endTimestamp = Long.MIN_VALUE;
int sessionEventCount = 0;
for (Session s : pairIterable.second()) {
ip = s.getIp();
startTimestamp = Math.min(startTimestamp, s.getStartTimestamp());
endTimestamp = Math.max(endTimestamp, s.getStartTimestamp() + s.getDuration());
sessionEventCount += s.getSessionEventCount();
emitter.emit(Pair.of(pairIterable.first(), Session.newBuilder().setUserId(pairIterable.first().first()).setSessionId(pairIterable.first().second()).setIp(ip).setStartTimestamp(startTimestamp).setDuration(endTimestamp - startTimestamp).setSessionEventCount(sessionEventCount).build()));
}).parallelDo(new DoFn<Pair<Pair<Long, String>, Session>, Session>() {
public void process(Pair<Pair<Long, String>, Session> pairSession, Emitter<Session> emitter) {
}, Avros.specifics(Session.class));
// Write the sessions to the "sessions" Dataset
getPipeline().write(sessions, CrunchDatasets.asTarget(hcatRepo.load("sessions")), Target.WriteMode.APPEND);
return run().succeeded() ? 0 : 1;