Search in sources :

Example 1 with Session

use of com.cloudera.cdk.examples.demo.event.Session in project cdk-examples by cloudera.

the class CreateSessions method run.

@Override
public int run(String[] args) throws Exception {
    // Construct a local filesystem dataset repository rooted at /tmp/data
    DatasetRepository fsRepo = DatasetRepositories.open("repo:hdfs:/tmp/data");
    // Construct an HCatalog dataset repository using external Hive tables
    DatasetRepository hcatRepo = DatasetRepositories.open("repo:hive:/tmp/data");
    // Turn debug on while in development.
    getPipeline().enableDebug();
    getPipeline().getConfiguration().set("crunch.log.job.progress", "true");
    // Load the events dataset and get the correct partition to sessionize
    Dataset<StandardEvent> eventsDataset = fsRepo.load("events");
    Dataset<StandardEvent> partition;
    if (args.length == 0 || (args.length == 1 && args[0].equals("LATEST"))) {
        partition = getLatestPartition(eventsDataset);
    } else {
        partition = getPartitionForURI(eventsDataset, args[0]);
    }
    // Create a parallel collection from the working partition
    PCollection<StandardEvent> events = read(CrunchDatasets.asSource(partition, StandardEvent.class));
    // Process the events into sessions, using a combiner
    PCollection<Session> sessions = events.parallelDo(new DoFn<StandardEvent, Session>() {

        @Override
        public void process(StandardEvent event, Emitter<Session> emitter) {
            emitter.emit(Session.newBuilder().setUserId(event.getUserId()).setSessionId(event.getSessionId()).setIp(event.getIp()).setStartTimestamp(event.getTimestamp()).setDuration(0).setSessionEventCount(1).build());
        }
    }, Avros.specifics(Session.class)).by(new MapFn<Session, Pair<Long, String>>() {

        @Override
        public Pair<Long, String> map(Session session) {
            return Pair.of(session.getUserId(), session.getSessionId());
        }
    }, Avros.pairs(Avros.longs(), Avros.strings())).groupByKey().combineValues(new CombineFn<Pair<Long, String>, Session>() {

        @Override
        public void process(Pair<Pair<Long, String>, Iterable<Session>> pairIterable, Emitter<Pair<Pair<Long, String>, Session>> emitter) {
            String ip = null;
            long startTimestamp = Long.MAX_VALUE;
            long endTimestamp = Long.MIN_VALUE;
            int sessionEventCount = 0;
            for (Session s : pairIterable.second()) {
                ip = s.getIp();
                startTimestamp = Math.min(startTimestamp, s.getStartTimestamp());
                endTimestamp = Math.max(endTimestamp, s.getStartTimestamp() + s.getDuration());
                sessionEventCount += s.getSessionEventCount();
            }
            emitter.emit(Pair.of(pairIterable.first(), Session.newBuilder().setUserId(pairIterable.first().first()).setSessionId(pairIterable.first().second()).setIp(ip).setStartTimestamp(startTimestamp).setDuration(endTimestamp - startTimestamp).setSessionEventCount(sessionEventCount).build()));
        }
    }).parallelDo(new DoFn<Pair<Pair<Long, String>, Session>, Session>() {

        @Override
        public void process(Pair<Pair<Long, String>, Session> pairSession, Emitter<Session> emitter) {
            emitter.emit(pairSession.second());
        }
    }, Avros.specifics(Session.class));
    // Write the sessions to the "sessions" Dataset
    getPipeline().write(sessions, CrunchDatasets.asTarget(hcatRepo.load("sessions")), Target.WriteMode.APPEND);
    return run().succeeded() ? 0 : 1;
}
Also used : Emitter(org.apache.crunch.Emitter) DatasetRepository(com.cloudera.cdk.data.DatasetRepository) FileSystemDatasetRepository(com.cloudera.cdk.data.filesystem.FileSystemDatasetRepository) MapFn(org.apache.crunch.MapFn) CombineFn(org.apache.crunch.CombineFn) StandardEvent(com.cloudera.cdk.data.event.StandardEvent) Session(com.cloudera.cdk.examples.demo.event.Session) Pair(org.apache.crunch.Pair)

Aggregations

DatasetRepository (com.cloudera.cdk.data.DatasetRepository)1 StandardEvent (com.cloudera.cdk.data.event.StandardEvent)1 FileSystemDatasetRepository (com.cloudera.cdk.data.filesystem.FileSystemDatasetRepository)1 Session (com.cloudera.cdk.examples.demo.event.Session)1 CombineFn (org.apache.crunch.CombineFn)1 Emitter (org.apache.crunch.Emitter)1 MapFn (org.apache.crunch.MapFn)1 Pair (org.apache.crunch.Pair)1