
Example 1 with KafkaIO

Use of org.apache.beam.sdk.io.kafka.KafkaIO in the Apache Beam project.

From the class ResumeFromCheckpointStreamingTest, method run:

private SparkPipelineResult run(Optional<Instant> stopWatermarkOption, int expectedAssertions) {
    KafkaIO.Read<String, Instant> read =
        KafkaIO.<String, Instant>read()
            .withBootstrapServers(EMBEDDED_KAFKA_CLUSTER.getBrokerList())
            .withTopics(Collections.singletonList(TOPIC))
            .withKeyDeserializer(StringDeserializer.class)
            .withValueDeserializer(InstantDeserializer.class)
            .withConsumerConfigUpdates(ImmutableMap.of("auto.offset.reset", "earliest"))
            .withTimestampFn(KV::getValue)
            .withWatermarkFn(kv -> {
                // At EOF, move the watermark to infinity so all windows can fire.
                String key = kv.getKey();
                Instant instant = kv.getValue();
                return "EOF".equals(key) ? BoundedWindow.TIMESTAMP_MAX_VALUE : instant;
            });
    TestSparkPipelineOptions options = PipelineOptionsFactory.create().as(TestSparkPipelineOptions.class);
    options.setSparkMaster("local[*]");
    options.setCheckpointDurationMillis(options.getBatchIntervalMillis());
    options.setExpectedAssertions(expectedAssertions);
    options.setRunner(TestSparkRunner.class);
    options.setEnableSparkMetricSinks(false);
    options.setForceStreaming(true);
    options.setCheckpointDir(temporaryFolder.getRoot().getPath());
    // timeout is per execution so it can be injected by the caller.
    if (stopWatermarkOption.isPresent()) {
        options.setStopPipelineWatermark(stopWatermarkOption.get().getMillis());
    }
    Pipeline p = Pipeline.create(options);
    PCollection<String> expectedCol = p.apply(Create.of(ImmutableList.of("side1", "side2")).withCoder(StringUtf8Coder.of()));
    PCollectionView<List<String>> view = expectedCol.apply(View.asList());
    PCollection<KV<String, Instant>> kafkaStream = p.apply(read.withoutMetadata());
    PCollection<Iterable<String>> grouped =
        kafkaStream
            .apply(Keys.create())
            .apply("EOFShallNotPassFn", ParDo.of(new EOFShallNotPassFn(view)).withSideInputs(view))
            .apply(
                Window.<String>into(FixedWindows.of(Duration.millis(500)))
                    .triggering(AfterWatermark.pastEndOfWindow())
                    .accumulatingFiredPanes()
                    .withAllowedLateness(Duration.ZERO))
            .apply(WithKeys.of(1))
            .apply(GroupByKey.create())
            .apply(Values.create());
    grouped.apply(new PAssertWithoutFlatten<>("k1", "k2", "k3", "k4", "k5"));
    return (SparkPipelineResult) p.run();
}
Also used: KafkaIO (org.apache.beam.sdk.io.kafka.KafkaIO), Instant (org.joda.time.Instant), KV (org.apache.beam.sdk.values.KV), Pipeline (org.apache.beam.sdk.Pipeline), SparkPipelineResult (org.apache.beam.runners.spark.SparkPipelineResult), InstantDeserializer (org.apache.beam.sdk.io.kafka.serialization.InstantDeserializer), TestSparkPipelineOptions (org.apache.beam.runners.spark.TestSparkPipelineOptions), List (java.util.List), ImmutableList (org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableList)
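The part of this test most worth isolating is how event time is derived from the records themselves: the record value (an Instant) becomes the element timestamp, and a sentinel "EOF" key pushes the watermark to the end of time so every window can fire. A minimal sketch of just that configuration, assuming a placeholder broker address and topic name (localhost:9092 and my-topic are illustrative, not from the test):

KafkaIO.Read<String, Instant> eofAwareRead =
    KafkaIO.<String, Instant>read()
        .withBootstrapServers("localhost:9092")               // placeholder broker
        .withTopics(Collections.singletonList("my-topic"))    // placeholder topic
        .withKeyDeserializer(StringDeserializer.class)
        .withValueDeserializer(InstantDeserializer.class)
        // Use the record value (an Instant) as the element's event timestamp.
        .withTimestampFn(KV::getValue)
        // Track the watermark off the same value, but jump it to TIMESTAMP_MAX_VALUE
        // once the "EOF" sentinel key is seen, closing all open windows.
        .withWatermarkFn(kv ->
            "EOF".equals(kv.getKey()) ? BoundedWindow.TIMESTAMP_MAX_VALUE : kv.getValue());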

Example 2 with KafkaIO

Use of org.apache.beam.sdk.io.kafka.KafkaIO in the Apache Beam project.

From the class NexmarkLauncher, method sourceEventsFromKafka:

/**
 * Return source of events from Kafka.
 */
private PCollection<Event> sourceEventsFromKafka(Pipeline p, final Instant start) {
    checkArgument((options.getBootstrapServers() != null), "Missing --bootstrapServers");
    NexmarkUtils.console("Reading events from Kafka Topic %s", options.getKafkaTopic());
    KafkaIO.Read<byte[], byte[]> read =
        KafkaIO.<byte[], byte[]>read()
            .withBootstrapServers(options.getBootstrapServers())
            .withKeyDeserializer(ByteArrayDeserializer.class)
            .withValueDeserializer(ByteArrayDeserializer.class)
            .withStartReadTime(start)
            .withMaxNumRecords(options.getNumEvents() != null ? options.getNumEvents() : Long.MAX_VALUE);
    if (options.getKafkaTopicCreateTimeMaxDelaySec() >= 0) {
        read = read.withCreateTime(Duration.standardSeconds(options.getKafkaTopicCreateTimeMaxDelaySec()));
    }
    if (options.getNumKafkaTopicPartitions() > 0) {
        ArrayList<TopicPartition> partitionArrayList = new ArrayList<>();
        for (int i = 0; i < options.getNumKafkaTopicPartitions(); ++i) {
            partitionArrayList.add(new TopicPartition(options.getKafkaTopic(), i));
        }
        read = read.withTopicPartitions(partitionArrayList);
    } else {
        read = read.withTopic(options.getKafkaTopic());
    }
    return p.apply(queryName + ".ReadKafkaEvents", read.withoutMetadata())
        .apply(queryName + ".KafkaToEvents", ParDo.of(BYTEARRAY_TO_EVENT));
}
Also used: KafkaIO (org.apache.beam.sdk.io.kafka.KafkaIO), TopicPartition (org.apache.kafka.common.TopicPartition), ArrayList (java.util.ArrayList), ByteArrayDeserializer (org.apache.kafka.common.serialization.ByteArrayDeserializer)
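Two details in this method are easy to miss in the chained builder: withMaxNumRecords turns the unbounded Kafka source into a bounded one (useful for benchmark runs), and the topic can be given either by name or as an explicit list of TopicPartitions. A minimal sketch of that choice, with placeholder broker, topic, and partition count (localhost:9092, nexmark-events, and 4 are illustrative values, not taken from NexmarkLauncher):

KafkaIO.Read<byte[], byte[]> boundedRead =
    KafkaIO.<byte[], byte[]>read()
        .withBootstrapServers("localhost:9092")           // placeholder broker
        .withKeyDeserializer(ByteArrayDeserializer.class)
        .withValueDeserializer(ByteArrayDeserializer.class)
        .withMaxNumRecords(100_000L);                     // bound the otherwise unbounded source

boolean useExplicitPartitions = true;                     // illustrative flag
if (useExplicitPartitions) {
    // Pin the read to specific partitions of the topic.
    List<TopicPartition> partitions = new ArrayList<>();
    for (int i = 0; i < 4; i++) {                         // placeholder partition count
        partitions.add(new TopicPartition("nexmark-events", i));
    }
    boundedRead = boundedRead.withTopicPartitions(partitions);
} else {
    // Or simply read the whole topic by name.
    boundedRead = boundedRead.withTopic("nexmark-events");
}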

Example 3 with KafkaIO

Use of org.apache.beam.sdk.io.kafka.KafkaIO in the Apache Beam project.

From the class SparkRunnerDebuggerTest, method debugStreamingPipeline:

@Test
public void debugStreamingPipeline() {
    TestSparkPipelineOptions options = PipelineOptionsFactory.create().as(TestSparkPipelineOptions.class);
    options.setForceStreaming(true);
    options.setRunner(SparkRunnerDebugger.class);
    Pipeline pipeline = Pipeline.create(options);
    KafkaIO.Read<String, String> read =
        KafkaIO.<String, String>read()
            .withBootstrapServers("mykafka:9092")
            .withTopics(Collections.singletonList("my_input_topic"))
            .withKeyDeserializer(StringDeserializer.class)
            .withValueDeserializer(StringDeserializer.class);
    KafkaIO.Write<String, String> write =
        KafkaIO.<String, String>write()
            .withBootstrapServers("myotherkafka:9092")
            .withTopic("my_output_topic")
            .withKeySerializer(StringSerializer.class)
            .withValueSerializer(StringSerializer.class);
    KvCoder<String, String> stringKvCoder = KvCoder.of(StringUtf8Coder.of(), StringUtf8Coder.of());
    pipeline
        .apply(read.withoutMetadata())
        .setCoder(stringKvCoder)
        .apply(Window.into(FixedWindows.of(Duration.standardSeconds(5))))
        .apply(ParDo.of(new SparkRunnerDebuggerTest.FormatKVFn()))
        .apply(Distinct.create())
        .apply(WithKeys.of(new SparkRunnerDebuggerTest.ArbitraryKeyFunction()))
        .apply(write);
    final String expectedPipeline =
        "KafkaUtils.createDirectStream(...)\n"
            + "_.map(new org.apache.beam.sdk.transforms.windowing.FixedWindows())\n"
            + "_.mapPartitions(new org.apache.beam.runners.spark."
            + "SparkRunnerDebuggerTest$FormatKVFn())\n"
            + "_.mapPartitions(new org.apache.beam.sdk.transforms.Contextful())\n"
            + "_.groupByKey()\n"
            + "_.map(new org.apache.beam.sdk.transforms.Combine$IterableCombineFn())\n"
            + "_.mapPartitions(new org.apache.beam.sdk.transforms.Distinct$3())\n"
            + "_.mapPartitions(new org.apache.beam.sdk.transforms.Contextful())\n"
            + "_.<org.apache.beam.sdk.io.kafka.AutoValue_KafkaIO_Write>";
    SparkRunnerDebugger.DebugSparkPipelineResult result = (SparkRunnerDebugger.DebugSparkPipelineResult) pipeline.run();
    assertThat("Debug pipeline did not equal expected", result.getDebugString(), Matchers.equalTo(expectedPipeline));
}
Also used: KafkaIO (org.apache.beam.sdk.io.kafka.KafkaIO), StringDeserializer (org.apache.kafka.common.serialization.StringDeserializer), Pipeline (org.apache.beam.sdk.Pipeline), StringSerializer (org.apache.kafka.common.serialization.StringSerializer), Test (org.junit.Test)
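Set aside the SparkRunnerDebugger assertion and what remains is a read-then-write Kafka round trip. A minimal sketch of that wiring, reusing the broker and topic values from the test above; the pass-through pipeline itself (no intermediate transforms) is an illustration, not part of the test:

Pipeline passThrough = Pipeline.create(options);          // options as configured above

passThrough
    .apply(KafkaIO.<String, String>read()
        .withBootstrapServers("mykafka:9092")
        .withTopics(Collections.singletonList("my_input_topic"))
        .withKeyDeserializer(StringDeserializer.class)
        .withValueDeserializer(StringDeserializer.class)
        .withoutMetadata())                                // drop Kafka metadata, keep KV<String, String>
    .setCoder(KvCoder.of(StringUtf8Coder.of(), StringUtf8Coder.of()))
    .apply(KafkaIO.<String, String>write()
        .withBootstrapServers("myotherkafka:9092")
        .withTopic("my_output_topic")
        .withKeySerializer(StringSerializer.class)
        .withValueSerializer(StringSerializer.class));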

Aggregations

KafkaIO (org.apache.beam.sdk.io.kafka.KafkaIO): 3 uses
Pipeline (org.apache.beam.sdk.Pipeline): 2 uses
ArrayList (java.util.ArrayList): 1 use
List (java.util.List): 1 use
SparkPipelineResult (org.apache.beam.runners.spark.SparkPipelineResult): 1 use
TestSparkPipelineOptions (org.apache.beam.runners.spark.TestSparkPipelineOptions): 1 use
InstantDeserializer (org.apache.beam.sdk.io.kafka.serialization.InstantDeserializer): 1 use
KV (org.apache.beam.sdk.values.KV): 1 use
ImmutableList (org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableList): 1 use
TopicPartition (org.apache.kafka.common.TopicPartition): 1 use
ByteArrayDeserializer (org.apache.kafka.common.serialization.ByteArrayDeserializer): 1 use
StringDeserializer (org.apache.kafka.common.serialization.StringDeserializer): 1 use
StringSerializer (org.apache.kafka.common.serialization.StringSerializer): 1 use
Instant (org.joda.time.Instant): 1 use
Test (org.junit.Test): 1 use