Search in sources :

Example 16 with Event

Use of org.apache.beam.sdk.nexmark.model.Event in the Apache Beam project.

The method expand of the class Query0.

/**
 * Passes every event through a full encode/decode cycle using the input collection's coder.
 *
 * <p>For each element the encoded size in bytes is added to the {@code "bytes"} counter, and the
 * event emitted downstream is the one obtained by decoding those bytes again.
 *
 * @param events the input events; their coder is the one used for the round trip
 * @return a collection containing the decoded copy of each input event
 */
@Override
public PCollection<Event> expand(PCollection<Event> events) {
    final Coder<Event> eventCoder = events.getCoder();
    final DoFn<Event, Event> roundTripFn = new DoFn<Event, Event>() {

        private final Counter bytesMetric = Metrics.counter(name, "bytes");

        @ProcessElement
        public void processElement(ProcessContext ctx) throws CoderException, IOException {
            // Serialize the element into an in-memory buffer and record how large it is.
            ByteArrayOutputStream encoded = new ByteArrayOutputStream();
            eventCoder.encode(ctx.element(), encoded, Coder.Context.OUTER);
            byte[] payload = encoded.toByteArray();
            bytesMetric.inc(payload.length);
            // Emit the event rebuilt from the serialized bytes rather than the original instance.
            ctx.output(eventCoder.decode(new ByteArrayInputStream(payload), Coder.Context.OUTER));
        }
    };
    return events.apply(name + ".Serialize", ParDo.of(roundTripFn));
}
Also used : DoFn(org.apache.beam.sdk.transforms.DoFn) Counter(org.apache.beam.sdk.metrics.Counter) ByteArrayInputStream(java.io.ByteArrayInputStream) Event(org.apache.beam.sdk.nexmark.model.Event) ByteArrayOutputStream(java.io.ByteArrayOutputStream)

Example 17 with Event

Use of org.apache.beam.sdk.nexmark.model.Event in the Apache Beam project.

The method expand of the class Query10.

/**
 * Shards the incoming events, groups each shard per fixed window, writes every fired pane to a
 * file and finally writes one index per window listing the files that were produced.
 *
 * <p>The stages, in order:
 *
 * <ol>
 *   <li>{@code .ShardEvents}: keys each event by a shard name derived from its hash code and
 *       counts events with / without the {@code "LATE"} annotation.
 *   <li>{@code .WindowEvents} + {@code .GroupByKey}: fixed windows of
 *       {@code configuration.windowSizeSec} seconds with element-count based early firings, a
 *       watermark firing, and batched late firings; panes are discarded after firing.
 *   <li>{@code .CheckForLateEvents}: sanity-checks the contents of each pane against its timing
 *       and passes the pane through unchanged.
 *   <li>{@code .UploadEvents}: encodes the events of a pane with {@code Event.CODER} into the
 *       channel returned by {@code openWritableGcsFile} (skipped when the output file has no
 *       filename).
 *   <li>{@code .WindowLogFiles} + {@code .GroupByKey2} + {@code .Index}: collects all output
 *       files of a window under a single {@code null} key and writes them to the index file
 *       named by {@code indexPathFor}.
 * </ol>
 *
 * @param events the events to log
 * @return one {@link Done} per window whose index pane was the expected first ON_TIME pane
 */
@Override
public PCollection<Done> expand(PCollection<Event> events) {
    final int numLogShards = maxNumWorkers * NUM_SHARDS_PER_WORKER;
    return events
        .apply(name + ".ShardEvents", ParDo.of(new DoFn<Event, KV<String, Event>>() {

            private final Counter lateCounter = Metrics.counter(name, "actuallyLateEvent");

            private final Counter onTimeCounter = Metrics.counter(name, "onTimeCounter");

            @ProcessElement
            public void processElement(ProcessContext c) {
                if (c.element().hasAnnotation("LATE")) {
                    lateCounter.inc();
                    LOG.debug("Observed late: {}", c.element());
                } else {
                    onTimeCounter.inc();
                }
                // The hash is widened to long before the remainder/abs so that the absolute
                // value is taken in long arithmetic; the result is always in [0, numLogShards).
                int shardNum = (int) Math.abs((long) c.element().hashCode() % numLogShards);
                String shard = String.format("shard-%05d-of-%05d", shardNum, numLogShards);
                c.output(KV.of(shard, c.element()));
            }
        }))
        .apply(name + ".WindowEvents",
            Window.<KV<String, Event>>into(
                    FixedWindows.of(Duration.standardSeconds(configuration.windowSizeSec)))
                .triggering(AfterEach.inOrder(
                    // First: fire every maxLogEvents elements until the watermark passes the
                    // end of the window.
                    Repeatedly.forever(AfterPane.elementCountAtLeast(configuration.maxLogEvents))
                        .orFinally(AfterWatermark.pastEndOfWindow()),
                    // Afterwards: fire on whichever comes first, maxLogEvents elements or
                    // LATE_BATCHING_PERIOD of processing time after the first element in the pane.
                    Repeatedly.forever(AfterFirst.of(
                        AfterPane.elementCountAtLeast(configuration.maxLogEvents),
                        AfterProcessingTime.pastFirstElementInPane()
                            .plusDelayOf(LATE_BATCHING_PERIOD)))))
                .discardingFiredPanes()
                .withAllowedLateness(Duration.standardDays(1)))
        .apply(name + ".GroupByKey", GroupByKey.create())
        .apply(name + ".CheckForLateEvents",
            ParDo.of(new DoFn<KV<String, Iterable<Event>>, KV<String, Iterable<Event>>>() {

            private final Counter earlyCounter = Metrics.counter(name, "earlyShard");

            private final Counter onTimeCounter = Metrics.counter(name, "onTimeShard");

            private final Counter lateCounter = Metrics.counter(name, "lateShard");

            private final Counter unexpectedLatePaneCounter = Metrics.counter(name, "ERROR_unexpectedLatePane");

            private final Counter unexpectedOnTimeElementCounter = Metrics.counter(name, "ERROR_unexpectedOnTimeElement");

            @ProcessElement
            public void processElement(ProcessContext c, BoundedWindow window) {
                // Tally the pane's events by whether they carry the "LATE" annotation.
                int numLate = 0;
                int numOnTime = 0;
                for (Event event : c.element().getValue()) {
                    if (event.hasAnnotation("LATE")) {
                        numLate++;
                    } else {
                        numOnTime++;
                    }
                }
                String shard = c.element().getKey();
                // Parameterized logging: the message is only rendered when DEBUG is enabled.
                LOG.debug("{} with timestamp {} has {} actually late and {} on-time elements in pane {} for window {}", shard, c.timestamp(), numLate, numOnTime, c.pane(), window.maxTimestamp());
                if (c.pane().getTiming() == PaneInfo.Timing.LATE) {
                    // A late pane should contain only late-annotated events, and at least one.
                    if (numLate == 0) {
                        LOG.error("ERROR! No late events in late pane for {}", shard);
                        unexpectedLatePaneCounter.inc();
                    }
                    if (numOnTime > 0) {
                        LOG.error("ERROR! Have {} on-time events in late pane for {}", numOnTime, shard);
                        unexpectedOnTimeElementCounter.inc();
                    }
                    lateCounter.inc();
                } else if (c.pane().getTiming() == PaneInfo.Timing.EARLY) {
                    // Early panes are fired by element count, so a smaller pane is unexpected.
                    if (numOnTime + numLate < configuration.maxLogEvents) {
                        LOG.error("ERROR! Only have {} events in early pane for {}", numOnTime + numLate, shard);
                    }
                    earlyCounter.inc();
                } else {
                    onTimeCounter.inc();
                }
                c.output(c.element());
            }
        }))
        .apply(name + ".UploadEvents",
            ParDo.of(new DoFn<KV<String, Iterable<Event>>, KV<Void, OutputFile>>() {

            private final Counter savedFileCounter = Metrics.counter(name, "savedFile");

            private final Counter writtenRecordsCounter = Metrics.counter(name, "writtenRecords");

            @ProcessElement
            public void processElement(ProcessContext c, BoundedWindow window) throws IOException {
                String shard = c.element().getKey();
                GcsOptions options = c.getPipelineOptions().as(GcsOptions.class);
                OutputFile outputFile = outputFileFor(window, shard, c.pane());
                // Parameterized logging: the message is only rendered when DEBUG is enabled.
                LOG.debug("Writing {} with record timestamp {}, window timestamp {}, pane {}", shard, c.timestamp(), window.maxTimestamp(), c.pane());
                // A null filename means nothing is written; the OutputFile is still emitted.
                if (outputFile.filename != null) {
                    LOG.info("Beginning write to '{}'", outputFile.filename);
                    int n = 0;
                    try (OutputStream output = Channels.newOutputStream(openWritableGcsFile(options, outputFile.filename))) {
                        for (Event event : c.element().getValue()) {
                            Event.CODER.encode(event, output, Coder.Context.OUTER);
                            writtenRecordsCounter.inc();
                            // Progress message every 10000 records.
                            if (++n % 10000 == 0) {
                                LOG.info("So far written {} records to '{}'", n, outputFile.filename);
                            }
                        }
                    }
                    LOG.info("Written all {} records to '{}'", n, outputFile.filename);
                }
                savedFileCounter.inc();
                // All output files share the null key so the next GroupByKey gathers them
                // into a single group per window.
                c.output(KV.of(null, outputFile));
            }
        }))
        .apply(name + ".WindowLogFiles",
            Window.<KV<Void, OutputFile>>into(
                    FixedWindows.of(Duration.standardSeconds(configuration.windowSizeSec)))
                .triggering(DefaultTrigger.of())
                .withAllowedLateness(Duration.standardDays(1))
                .discardingFiredPanes())
        .apply(name + ".GroupByKey2", GroupByKey.create())
        .apply(name + ".Index", ParDo.of(new DoFn<KV<Void, Iterable<OutputFile>>, Done>() {

            private final Counter unexpectedLateCounter = Metrics.counter(name, "ERROR_unexpectedLate");

            private final Counter unexpectedEarlyCounter = Metrics.counter(name, "ERROR_unexpectedEarly");

            private final Counter unexpectedIndexCounter = Metrics.counter(name, "ERROR_unexpectedIndex");

            private final Counter finalizedCounter = Metrics.counter(name, "indexed");

            @ProcessElement
            public void processElement(ProcessContext c, BoundedWindow window) throws IOException {
                // Only the first ON_TIME pane of a window is indexed; any other pane is counted
                // and logged as an error and produces no output.
                if (c.pane().getTiming() == Timing.LATE) {
                    unexpectedLateCounter.inc();
                    LOG.error("ERROR! Unexpected LATE pane: {}", c.pane());
                } else if (c.pane().getTiming() == Timing.EARLY) {
                    unexpectedEarlyCounter.inc();
                    LOG.error("ERROR! Unexpected EARLY pane: {}", c.pane());
                } else if (c.pane().getTiming() == Timing.ON_TIME && c.pane().getIndex() != 0) {
                    unexpectedIndexCounter.inc();
                    LOG.error("ERROR! Unexpected ON_TIME pane index: {}", c.pane());
                } else {
                    GcsOptions options = c.getPipelineOptions().as(GcsOptions.class);
                    LOG.debug("Index with record timestamp {}, window timestamp {}, pane {}", c.timestamp(), window.maxTimestamp(), c.pane());
                    @Nullable String filename = indexPathFor(window);
                    // A null index path skips the write; Done is still emitted below.
                    if (filename != null) {
                        LOG.info("Beginning write to '{}'", filename);
                        int n = 0;
                        try (OutputStream output = Channels.newOutputStream(openWritableGcsFile(options, filename))) {
                            for (OutputFile outputFile : c.element().getValue()) {
                                output.write(outputFile.toString().getBytes(StandardCharsets.UTF_8));
                                n++;
                            }
                        }
                        LOG.info("Written all {} lines to '{}'", n, filename);
                    }
                    c.output(new Done("written for timestamp " + window.maxTimestamp()));
                    finalizedCounter.inc();
                }
            }
        }));
}
Also used : Done(org.apache.beam.sdk.nexmark.model.Done) OutputStream(java.io.OutputStream) KV(org.apache.beam.sdk.values.KV) IOException(java.io.IOException) Counter(org.apache.beam.sdk.metrics.Counter) DoFn(org.apache.beam.sdk.transforms.DoFn) Event(org.apache.beam.sdk.nexmark.model.Event) BoundedWindow(org.apache.beam.sdk.transforms.windowing.BoundedWindow) GcsOptions(org.apache.beam.sdk.extensions.gcp.options.GcsOptions) Nullable(org.checkerframework.checker.nullness.qual.Nullable)

Aggregations

Event (org.apache.beam.sdk.nexmark.model.Event)17 KV (org.apache.beam.sdk.values.KV)8 PCollection (org.apache.beam.sdk.values.PCollection)8 TimestampedValue (org.apache.beam.sdk.values.TimestampedValue)6 Random (java.util.Random)5 DoFn (org.apache.beam.sdk.transforms.DoFn)4 Test (org.junit.Test)4 PipelineResult (org.apache.beam.sdk.PipelineResult)3 Counter (org.apache.beam.sdk.metrics.Counter)3 NexmarkConfiguration (org.apache.beam.sdk.nexmark.NexmarkConfiguration)3 Bid (org.apache.beam.sdk.nexmark.model.Bid)3 Pipeline (org.apache.beam.sdk.Pipeline)2 ResourceId (org.apache.beam.sdk.io.fs.ResourceId)2 NameCityStateId (org.apache.beam.sdk.nexmark.model.NameCityStateId)2 SelectEvent (org.apache.beam.sdk.nexmark.model.sql.SelectEvent)2 PCollectionTuple (org.apache.beam.sdk.values.PCollectionTuple)2 Row (org.apache.beam.sdk.values.Row)2 TupleTag (org.apache.beam.sdk.values.TupleTag)2 Nullable (org.checkerframework.checker.nullness.qual.Nullable)2 Category (org.junit.experimental.categories.Category)2