Search in sources :

Example 1 with Emitter

use of org.apache.crunch.Emitter in project crunch by cloudera.

In class Aggregate, method min.

/**
 * Returns the smallest element of the input collection, as ordered by the
 * elements' own {@link Comparable#compareTo(Object)} implementation.
 *
 * <p>The first function keeps a running minimum while it processes input and
 * emits it once, under a constant key, from {@code cleanup}. The emitted
 * candidates are then grouped and reduced to a single value by the combiner.
 *
 * @param collect the collection to scan; its element type must be primitive
 *        or implement {@link Comparable}
 * @return the values of the combined table, i.e. the minimum candidate that
 *         survives the combine step
 * @throws IllegalArgumentException if the element type is neither primitive
 *         nor {@code Comparable}
 */
public static <S> PCollection<S> min(PCollection<S> collect) {
    Class<S> clazz = collect.getPType().getTypeClass();
    if (!clazz.isPrimitive() && !Comparable.class.isAssignableFrom(clazz)) {
        throw new IllegalArgumentException("Can only get min for Comparable elements, not for: " + collect.getPType().getTypeClass());
    }
    PTypeFamily tf = collect.getTypeFamily();
    return PTables.values(collect.parallelDo("min", new DoFn<S, Pair<Boolean, S>>() {

        // Smallest element seen so far by this function instance; null until the first input.
        private transient S min = null;

        @Override
        // Safe: the type check at the top of min() rejects non-Comparable element types.
        @SuppressWarnings("unchecked")
        public void process(S input, Emitter<Pair<Boolean, S>> emitter) {
            // Nothing is emitted per element; only the running minimum is tracked.
            if (min == null || ((Comparable<S>) min).compareTo(input) > 0) {
                min = input;
            }
        }

        @Override
        public void cleanup(Emitter<Pair<Boolean, S>> emitter) {
            // Emit the local minimum once, under a constant key, if any input was seen.
            if (min != null) {
                emitter.emit(Pair.of(false, min));
            }
        }
        // Every candidate shares one key, so request a single partition, as max() does.
    }, tf.tableOf(tf.booleans(), collect.getPType())).groupByKey(1).combineValues(new CombineFn<Boolean, S>() {

        @Override
        // Safe: the type check at the top of min() rejects non-Comparable element types.
        @SuppressWarnings("unchecked")
        public void process(Pair<Boolean, Iterable<S>> input, Emitter<Pair<Boolean, S>> emitter) {
            // Reduce the grouped candidates to the smallest one.
            S min = null;
            for (S v : input.second()) {
                if (min == null || ((Comparable<S>) min).compareTo(v) > 0) {
                    min = v;
                }
            }
            emitter.emit(Pair.of(input.first(), min));
        }
    }));
}
Also used : PTypeFamily(org.apache.crunch.types.PTypeFamily) Emitter(org.apache.crunch.Emitter) DoFn(org.apache.crunch.DoFn) CombineFn(org.apache.crunch.CombineFn) Pair(org.apache.crunch.Pair)

Example 2 with Emitter

use of org.apache.crunch.Emitter in project cdk-examples by cloudera.

In class CreateSessions, method run.

/**
 * Builds and runs the sessionization pipeline: reads one partition of the
 * "events" dataset, collapses the events of each (user id, session id) pair
 * into a single Session record, and appends the result to the "sessions"
 * dataset.
 *
 * @param args with no arguments, or the single argument "LATEST", the
 *        partition is chosen by getLatestPartition; otherwise args[0] is
 *        passed to getPartitionForURI
 * @return 0 when the pipeline run reports success, 1 otherwise
 */
@Override
public int run(String[] args) throws Exception {
    // Repository addressed by the repo:hdfs:/tmp/data URI (source of the events)
    DatasetRepository fsRepo = DatasetRepositories.open("repo:hdfs:/tmp/data");
    // Repository addressed by the repo:hive:/tmp/data URI (destination of the sessions)
    DatasetRepository hcatRepo = DatasetRepositories.open("repo:hive:/tmp/data");
    // Debug mode and job-progress logging are switched on while in development.
    getPipeline().enableDebug();
    getPipeline().getConfiguration().set("crunch.log.job.progress", "true");
    // Decide which partition of the events dataset gets sessionized
    Dataset<StandardEvent> eventsDataset = fsRepo.load("events");
    final boolean useLatest = args.length == 0 || (args.length == 1 && args[0].equals("LATEST"));
    final Dataset<StandardEvent> partition;
    if (useLatest) {
        partition = getLatestPartition(eventsDataset);
    } else {
        partition = getPartitionForURI(eventsDataset, args[0]);
    }
    // Expose the chosen partition as a parallel collection
    PCollection<StandardEvent> events = read(CrunchDatasets.asSource(partition, StandardEvent.class));
    // Event -> one-event session -> keyed by (user id, session id) -> merged per key -> key dropped
    PCollection<Session> sessions = events.parallelDo(new DoFn<StandardEvent, Session>() {

        @Override
        public void process(StandardEvent event, Emitter<Session> emitter) {
            // Each event starts out as a zero-length session holding a single event.
            Session single = Session.newBuilder()
                    .setUserId(event.getUserId())
                    .setSessionId(event.getSessionId())
                    .setIp(event.getIp())
                    .setStartTimestamp(event.getTimestamp())
                    .setDuration(0)
                    .setSessionEventCount(1)
                    .build();
            emitter.emit(single);
        }
    }, Avros.specifics(Session.class)).by(new MapFn<Session, Pair<Long, String>>() {

        @Override
        public Pair<Long, String> map(Session session) {
            return Pair.of(session.getUserId(), session.getSessionId());
        }
    }, Avros.pairs(Avros.longs(), Avros.strings())).groupByKey().combineValues(new CombineFn<Pair<Long, String>, Session>() {

        @Override
        public void process(Pair<Pair<Long, String>, Iterable<Session>> pairIterable, Emitter<Pair<Pair<Long, String>, Session>> emitter) {
            Pair<Long, String> key = pairIterable.first();
            String lastIp = null;
            long earliestStart = Long.MAX_VALUE;
            long latestEnd = Long.MIN_VALUE;
            int eventTotal = 0;
            for (Session part : pairIterable.second()) {
                long partStart = part.getStartTimestamp();
                // The IP of the last session iterated is the one that is kept.
                lastIp = part.getIp();
                earliestStart = Math.min(earliestStart, partStart);
                latestEnd = Math.max(latestEnd, partStart + part.getDuration());
                eventTotal += part.getSessionEventCount();
            }
            Session merged = Session.newBuilder()
                    .setUserId(key.first())
                    .setSessionId(key.second())
                    .setIp(lastIp)
                    .setStartTimestamp(earliestStart)
                    .setDuration(latestEnd - earliestStart)
                    .setSessionEventCount(eventTotal)
                    .build();
            emitter.emit(Pair.of(key, merged));
        }
    }).parallelDo(new DoFn<Pair<Pair<Long, String>, Session>, Session>() {

        @Override
        public void process(Pair<Pair<Long, String>, Session> pairSession, Emitter<Session> emitter) {
            // Only the merged session is kept; the grouping key is discarded.
            emitter.emit(pairSession.second());
        }
    }, Avros.specifics(Session.class));
    // Append the merged sessions to the "sessions" dataset
    getPipeline().write(sessions, CrunchDatasets.asTarget(hcatRepo.load("sessions")), Target.WriteMode.APPEND);
    if (run().succeeded()) {
        return 0;
    }
    return 1;
}
Also used : Emitter(org.apache.crunch.Emitter) DatasetRepository(com.cloudera.cdk.data.DatasetRepository) FileSystemDatasetRepository(com.cloudera.cdk.data.filesystem.FileSystemDatasetRepository) MapFn(org.apache.crunch.MapFn) CombineFn(org.apache.crunch.CombineFn) StandardEvent(com.cloudera.cdk.data.event.StandardEvent) Session(com.cloudera.cdk.examples.demo.event.Session) Pair(org.apache.crunch.Pair)

Example 3 with Emitter

use of org.apache.crunch.Emitter in project crunch by cloudera.

In class Aggregate, method max.

/**
 * Finds the greatest element of the input collection, comparing elements
 * through their {@code Comparable} implementation.
 *
 * Throws an IllegalArgumentException when the element type is neither
 * primitive nor Comparable.
 */
public static <S> PCollection<S> max(PCollection<S> collect) {
    Class<S> elementType = collect.getPType().getTypeClass();
    boolean orderable = elementType.isPrimitive() || Comparable.class.isAssignableFrom(elementType);
    if (!orderable) {
        throw new IllegalArgumentException("Can only get max for Comparable elements, not for: " + collect.getPType().getTypeClass());
    }
    PTypeFamily family = collect.getTypeFamily();
    return PTables.values(collect.parallelDo("max", new DoFn<S, Pair<Boolean, S>>() {

        // Greatest element seen so far by this function instance; null until the first input.
        private transient S best = null;

        public void process(S input, Emitter<Pair<Boolean, S>> emitter) {
            // Keep the current holder unless the new input is strictly greater.
            if (best != null && ((Comparable<S>) best).compareTo(input) >= 0) {
                return;
            }
            best = input;
        }

        public void cleanup(Emitter<Pair<Boolean, S>> emitter) {
            // Nothing to report when no input was ever recorded.
            if (best == null) {
                return;
            }
            emitter.emit(Pair.of(true, best));
        }
    }, family.tableOf(family.booleans(), collect.getPType())).groupByKey(1).combineValues(new CombineFn<Boolean, S>() {

        public void process(Pair<Boolean, Iterable<S>> input, Emitter<Pair<Boolean, S>> emitter) {
            S best = null;
            for (S candidate : input.second()) {
                if (best != null && ((Comparable<S>) best).compareTo(candidate) >= 0) {
                    continue;
                }
                best = candidate;
            }
            emitter.emit(Pair.of(input.first(), best));
        }
    }));
}
Also used : PTypeFamily(org.apache.crunch.types.PTypeFamily) Emitter(org.apache.crunch.Emitter) DoFn(org.apache.crunch.DoFn) CombineFn(org.apache.crunch.CombineFn) Pair(org.apache.crunch.Pair)

Aggregations

CombineFn (org.apache.crunch.CombineFn): 3, Emitter (org.apache.crunch.Emitter): 3, Pair (org.apache.crunch.Pair): 3, DoFn (org.apache.crunch.DoFn): 2, PTypeFamily (org.apache.crunch.types.PTypeFamily): 2, DatasetRepository (com.cloudera.cdk.data.DatasetRepository): 1, StandardEvent (com.cloudera.cdk.data.event.StandardEvent): 1, FileSystemDatasetRepository (com.cloudera.cdk.data.filesystem.FileSystemDatasetRepository): 1, Session (com.cloudera.cdk.examples.demo.event.Session): 1, MapFn (org.apache.crunch.MapFn): 1