Example 1 with SparkPipelineResult

use of org.apache.beam.runners.spark.SparkPipelineResult in project beam by apache.

the class ResumeFromCheckpointStreamingTest method testWithResume.

@Test
@Category(UsesCheckpointRecovery.class)
public void testWithResume() throws Exception {
    // write to Kafka
    produce(ImmutableMap.of("k1", new Instant(100), "k2", new Instant(200), "k3", new Instant(300), "k4", new Instant(400)));
    MetricsFilter metricsFilter = MetricsFilter.builder().addNameFilter(MetricNameFilter.inNamespace(ResumeFromCheckpointStreamingTest.class)).build();
    // the first run should stop once the watermark reaches the timestamp of the last injected element (400).
    SparkPipelineResult res = run(Optional.of(new Instant(400)), 0);
    assertThat(res.metrics().queryMetrics(metricsFilter).counters(), hasItem(attemptedMetricsResult(ResumeFromCheckpointStreamingTest.class.getName(), "allMessages", "EOFShallNotPassFn", 4L)));
    assertThat(res.metrics().queryMetrics(metricsFilter).counters(), hasItem(attemptedMetricsResult(ResumeFromCheckpointStreamingTest.class.getName(), "processedMessages", "EOFShallNotPassFn", 4L)));
    //--- between executions:
    //- clear state.
    clean();
    //- write a bit more.
    produce(ImmutableMap.of("k5", new Instant(499), // to be dropped from [0, 500).
    "EOF", // to be dropped from [0, 500).
    new Instant(500)));
    // recovery should resume from last read offset, and read the second batch of input.
    res = runAgain(1);
    // assertions 2:
    assertThat(res.metrics().queryMetrics(metricsFilter).counters(), hasItem(attemptedMetricsResult(ResumeFromCheckpointStreamingTest.class.getName(), "processedMessages", "EOFShallNotPassFn", 5L)));
    assertThat(res.metrics().queryMetrics(metricsFilter).counters(), hasItem(attemptedMetricsResult(ResumeFromCheckpointStreamingTest.class.getName(), "allMessages", "EOFShallNotPassFn", 6L)));
    long successAssertions = 0;
    Iterable<MetricResult<Long>> counterResults = res.metrics().queryMetrics(MetricsFilter.builder().addNameFilter(MetricNameFilter.named(PAssertWithoutFlatten.class, PAssert.SUCCESS_COUNTER)).build()).counters();
    for (MetricResult<Long> counter : counterResults) {
        if (counter.attempted().longValue() > 0) {
            successAssertions++;
        }
    }
    assertThat(String.format("Expected %d successful assertions, but found %d.", 1L, successAssertions), successAssertions, is(1L));
    // validate assertion didn't fail.
    long failedAssertions = 0;
    Iterable<MetricResult<Long>> failCounterResults = res.metrics().queryMetrics(MetricsFilter.builder().addNameFilter(MetricNameFilter.named(PAssertWithoutFlatten.class, PAssert.FAILURE_COUNTER)).build()).counters();
    for (MetricResult<Long> counter : failCounterResults) {
        if (counter.attempted().longValue() > 0) {
            failedAssertions++;
        }
    }
    assertThat(String.format("Found %d failed assertions.", failedAssertions), failedAssertions, is(0L));
}
Also used : SparkPipelineResult(org.apache.beam.runners.spark.SparkPipelineResult) Instant(org.joda.time.Instant) MetricResult(org.apache.beam.sdk.metrics.MetricResult) MetricsFilter(org.apache.beam.sdk.metrics.MetricsFilter) Category(org.junit.experimental.categories.Category) Test(org.junit.Test)
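The produce(...) and clean() helpers the test calls are defined elsewhere in the test class and are not shown on this page. Below is a minimal sketch of what such a produce helper could look like, assuming the embedded Kafka broker from the example, Kafka's StringSerializer for keys and Beam's InstantSerializer for values; the class name, the broker/topic parameters and the producer settings are illustrative assumptions, not the test's actual code.

import java.util.Map;
import java.util.Properties;
import org.apache.beam.sdk.io.kafka.serialization.InstantSerializer;
import org.apache.kafka.clients.producer.KafkaProducer;
import org.apache.kafka.clients.producer.ProducerRecord;
import org.apache.kafka.common.serialization.StringSerializer;
import org.joda.time.Instant;

// Hypothetical helper (not part of the Beam test): writes each (key, timestamp) pair to the
// given topic and blocks until the broker acknowledges it, so the pipeline run that follows
// is guaranteed to see the records.
class KafkaTestProducer {

    static void produce(String brokerList, String topic, Map<String, Instant> messages) {
        Properties props = new Properties();
        props.put("bootstrap.servers", brokerList);
        props.put("acks", "all");
        try (KafkaProducer<String, Instant> producer =
                new KafkaProducer<>(props, new StringSerializer(), new InstantSerializer())) {
            messages.forEach((key, value) -> {
                try {
                    // synchronous send: wait for the broker's acknowledgement before continuing.
                    producer.send(new ProducerRecord<>(topic, key, value)).get();
                } catch (Exception e) {
                    throw new RuntimeException(e);
                }
            });
        }
    }
}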

Example 2 with SparkPipelineResult

use of org.apache.beam.runners.spark.SparkPipelineResult in project beam by apache.

the class ResumeFromCheckpointStreamingTest method run.

@SuppressWarnings("OptionalUsedAsFieldOrParameterType")
private SparkPipelineResult run(Optional<Instant> stopWatermarkOption, int expectedAssertions) {
    KafkaIO.Read<String, Instant> read = KafkaIO.<String, Instant>read()
        .withBootstrapServers(EMBEDDED_KAFKA_CLUSTER.getBrokerList())
        .withTopics(Collections.singletonList(TOPIC))
        .withKeyDeserializer(StringDeserializer.class)
        .withValueDeserializer(InstantDeserializer.class)
        .updateConsumerProperties(ImmutableMap.<String, Object>of("auto.offset.reset", "earliest"))
        .withTimestampFn(new SerializableFunction<KV<String, Instant>, Instant>() {

            @Override
            public Instant apply(KV<String, Instant> kv) {
                return kv.getValue();
            }
        })
        .withWatermarkFn(new SerializableFunction<KV<String, Instant>, Instant>() {

            @Override
            public Instant apply(KV<String, Instant> kv) {
                // at EOF move WM to infinity.
                String key = kv.getKey();
                Instant instant = kv.getValue();
                return key.equals("EOF") ? BoundedWindow.TIMESTAMP_MAX_VALUE : instant;
            }
        });
    TestSparkPipelineOptions options = PipelineOptionsFactory.create().as(TestSparkPipelineOptions.class);
    options.setSparkMaster("local[*]");
    options.setCheckpointDurationMillis(options.getBatchIntervalMillis());
    options.setExpectedAssertions(expectedAssertions);
    options.setRunner(TestSparkRunner.class);
    options.setEnableSparkMetricSinks(false);
    options.setForceStreaming(true);
    options.setCheckpointDir(temporaryFolder.getRoot().getPath());
    // the stop watermark acts as a per-execution timeout, so it is injected by the caller.
    if (stopWatermarkOption.isPresent()) {
        options.setStopPipelineWatermark(stopWatermarkOption.get().getMillis());
    }
    Pipeline p = Pipeline.create(options);
    PCollection<String> expectedCol = p.apply(Create.of(ImmutableList.of("side1", "side2")).withCoder(StringUtf8Coder.of()));
    PCollectionView<List<String>> view = expectedCol.apply(View.<String>asList());
    PCollection<KV<String, Instant>> kafkaStream = p.apply(read.withoutMetadata());
    PCollection<Iterable<String>> grouped = kafkaStream
        .apply(Keys.<String>create())
        .apply("EOFShallNotPassFn", ParDo.of(new EOFShallNotPassFn(view)).withSideInputs(view))
        .apply(Window.<String>into(FixedWindows.of(Duration.millis(500)))
            .triggering(AfterWatermark.pastEndOfWindow())
            .accumulatingFiredPanes()
            .withAllowedLateness(Duration.ZERO))
        .apply(WithKeys.<Integer, String>of(1))
        .apply(GroupByKey.<Integer, String>create())
        .apply(Values.<Iterable<String>>create());
    grouped.apply(new PAssertWithoutFlatten<>("k1", "k2", "k3", "k4", "k5"));
    return (SparkPipelineResult) p.run();
}
Also used : SerializableFunction(org.apache.beam.sdk.transforms.SerializableFunction) KafkaIO(org.apache.beam.sdk.io.kafka.KafkaIO) Instant(org.joda.time.Instant) KV(org.apache.beam.sdk.values.KV) Pipeline(org.apache.beam.sdk.Pipeline) SparkPipelineResult(org.apache.beam.runners.spark.SparkPipelineResult) InstantDeserializer(org.apache.beam.sdk.io.kafka.serialization.InstantDeserializer) TestSparkPipelineOptions(org.apache.beam.runners.spark.TestSparkPipelineOptions) List(java.util.List) ImmutableList(com.google.common.collect.ImmutableList)
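The DoFn behind the "EOFShallNotPassFn" step is likewise defined elsewhere in the test class. Based only on what the assertions above require — every element increments the allMessages counter, only non-EOF elements increment processedMessages and are forwarded, and the side input must still be readable after recovery — a hedged reconstruction might look as follows; field and method names are assumptions.

import java.util.List;
import org.apache.beam.sdk.metrics.Counter;
import org.apache.beam.sdk.metrics.Metrics;
import org.apache.beam.sdk.transforms.DoFn;
import org.apache.beam.sdk.values.PCollectionView;

// Hypothetical reconstruction: counts every element it sees, but only forwards (and counts
// as processed) elements other than the "EOF" marker, which is consistent with
// allMessages == 6 and processedMessages == 5 after the second run.
class EOFShallNotPassFn extends DoFn<String, String> {

    private final PCollectionView<List<String>> sideView;
    private final Counter allMessages =
        Metrics.counter(ResumeFromCheckpointStreamingTest.class, "allMessages");
    private final Counter processedMessages =
        Metrics.counter(ResumeFromCheckpointStreamingTest.class, "processedMessages");

    EOFShallNotPassFn(PCollectionView<List<String>> sideView) {
        this.sideView = sideView;
    }

    @ProcessElement
    public void processElement(ProcessContext c) {
        // reading the side input here checks that it survives resuming from the checkpoint.
        List<String> sides = c.sideInput(sideView);
        allMessages.inc();
        if (!"EOF".equals(c.element())) {
            processedMessages.inc();
            c.output(c.element());
        }
    }
}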

Aggregations

SparkPipelineResult (org.apache.beam.runners.spark.SparkPipelineResult) 2
Instant (org.joda.time.Instant) 2
ImmutableList (com.google.common.collect.ImmutableList) 1
List (java.util.List) 1
TestSparkPipelineOptions (org.apache.beam.runners.spark.TestSparkPipelineOptions) 1
Pipeline (org.apache.beam.sdk.Pipeline) 1
KafkaIO (org.apache.beam.sdk.io.kafka.KafkaIO) 1
InstantDeserializer (org.apache.beam.sdk.io.kafka.serialization.InstantDeserializer) 1
MetricResult (org.apache.beam.sdk.metrics.MetricResult) 1
MetricsFilter (org.apache.beam.sdk.metrics.MetricsFilter) 1
SerializableFunction (org.apache.beam.sdk.transforms.SerializableFunction) 1
KV (org.apache.beam.sdk.values.KV) 1
Test (org.junit.Test) 1
Category (org.junit.experimental.categories.Category) 1
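Both examples ultimately consume the SparkPipelineResult through Beam's generic PipelineResult interface: run the pipeline, wait for it to terminate, then query metrics. A minimal sketch of that pattern outside the test harness follows; the 30-second timeout and the print statements are illustrative choices, not taken from the examples above.

import org.apache.beam.runners.spark.SparkPipelineResult;
import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.PipelineResult;
import org.apache.beam.sdk.metrics.MetricResult;
import org.apache.beam.sdk.metrics.MetricsFilter;
import org.joda.time.Duration;

class SparkResultUsage {

    // Run the pipeline, block until it finishes (or the timeout elapses), then dump all
    // attempted counter values. The TestSparkRunner in the examples above instead stops
    // the pipeline at the configured stop watermark.
    static void runAndInspect(Pipeline p) {
        SparkPipelineResult result = (SparkPipelineResult) p.run();
        PipelineResult.State state = result.waitUntilFinish(Duration.standardSeconds(30));
        System.out.println("Pipeline finished in state: " + state);
        for (MetricResult<Long> counter :
                result.metrics().queryMetrics(MetricsFilter.builder().build()).counters()) {
            System.out.println(counter.name() + " = " + counter.attempted());
        }
    }
}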