Use of org.apache.beam.sdk.io.kafka.KafkaIO in project beam by apache.
From the class ResumeFromCheckpointStreamingTest, method run.
private SparkPipelineResult run(Optional<Instant> stopWatermarkOption, int expectedAssertions) {
  KafkaIO.Read<String, Instant> read =
      KafkaIO.<String, Instant>read()
          .withBootstrapServers(EMBEDDED_KAFKA_CLUSTER.getBrokerList())
          .withTopics(Collections.singletonList(TOPIC))
          .withKeyDeserializer(StringDeserializer.class)
          .withValueDeserializer(InstantDeserializer.class)
          .withConsumerConfigUpdates(ImmutableMap.of("auto.offset.reset", "earliest"))
          .withTimestampFn(KV::getValue)
          .withWatermarkFn(
              kv -> {
                // at EOF move WM to infinity.
                String key = kv.getKey();
                Instant instant = kv.getValue();
                return "EOF".equals(key) ? BoundedWindow.TIMESTAMP_MAX_VALUE : instant;
              });

  TestSparkPipelineOptions options =
      PipelineOptionsFactory.create().as(TestSparkPipelineOptions.class);
  options.setSparkMaster("local[*]");
  options.setCheckpointDurationMillis(options.getBatchIntervalMillis());
  options.setExpectedAssertions(expectedAssertions);
  options.setRunner(TestSparkRunner.class);
  options.setEnableSparkMetricSinks(false);
  options.setForceStreaming(true);
  options.setCheckpointDir(temporaryFolder.getRoot().getPath());
  // timeout is per execution so it can be injected by the caller.
  if (stopWatermarkOption.isPresent()) {
    options.setStopPipelineWatermark(stopWatermarkOption.get().getMillis());
  }

  Pipeline p = Pipeline.create(options);
  PCollection<String> expectedCol =
      p.apply(Create.of(ImmutableList.of("side1", "side2")).withCoder(StringUtf8Coder.of()));
  PCollectionView<List<String>> view = expectedCol.apply(View.asList());
  PCollection<KV<String, Instant>> kafkaStream = p.apply(read.withoutMetadata());
  PCollection<Iterable<String>> grouped =
      kafkaStream
          .apply(Keys.create())
          .apply("EOFShallNotPassFn", ParDo.of(new EOFShallNotPassFn(view)).withSideInputs(view))
          .apply(
              Window.<String>into(FixedWindows.of(Duration.millis(500)))
                  .triggering(AfterWatermark.pastEndOfWindow())
                  .accumulatingFiredPanes()
                  .withAllowedLateness(Duration.ZERO))
          .apply(WithKeys.of(1))
          .apply(GroupByKey.create())
          .apply(Values.create());
  grouped.apply(new PAssertWithoutFlatten<>("k1", "k2", "k3", "k4", "k5"));
  return (SparkPipelineResult) p.run();
}
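The value deserializer referenced above, InstantDeserializer, ships with the Kafka IO module (org.apache.beam.sdk.io.kafka.serialization) and decodes each payload back into a Joda Instant. As a rough illustration of what a Deserializer<Instant> involves, here is a minimal, hypothetical stand-in that assumes the producer wrote each value as a big-endian long of epoch milliseconds; the Beam-provided class decodes with Beam's InstantCoder instead, so treat this as a sketch, not the actual implementation.

import java.nio.ByteBuffer;
import java.util.Map;
import org.apache.kafka.common.serialization.Deserializer;
import org.joda.time.Instant;

// Hypothetical stand-in for InstantDeserializer, assuming each Kafka value is
// a big-endian long holding epoch milliseconds. Not the Beam implementation.
public class EpochMillisInstantDeserializer implements Deserializer<Instant> {

  @Override
  public void configure(Map<String, ?> configs, boolean isKey) {}

  @Override
  public Instant deserialize(String topic, byte[] data) {
    return data == null ? null : new Instant(ByteBuffer.wrap(data).getLong());
  }

  @Override
  public void close() {}
}

A class like this could be passed to withValueDeserializer(EpochMillisInstantDeserializer.class), provided the producer writes values in the matching format.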
Use of org.apache.beam.sdk.io.kafka.KafkaIO in project beam by apache.
From the class NexmarkLauncher, method sourceEventsFromKafka.
/**
* Return source of events from Kafka.
*/
private PCollection<Event> sourceEventsFromKafka(Pipeline p, final Instant start) {
  checkArgument(options.getBootstrapServers() != null, "Missing --bootstrapServers");
  NexmarkUtils.console("Reading events from Kafka Topic %s", options.getKafkaTopic());

  KafkaIO.Read<byte[], byte[]> read =
      KafkaIO.<byte[], byte[]>read()
          .withBootstrapServers(options.getBootstrapServers())
          .withKeyDeserializer(ByteArrayDeserializer.class)
          .withValueDeserializer(ByteArrayDeserializer.class)
          .withStartReadTime(start)
          .withMaxNumRecords(
              options.getNumEvents() != null ? options.getNumEvents() : Long.MAX_VALUE);

  if (options.getKafkaTopicCreateTimeMaxDelaySec() >= 0) {
    read =
        read.withCreateTime(Duration.standardSeconds(options.getKafkaTopicCreateTimeMaxDelaySec()));
  }

  if (options.getNumKafkaTopicPartitions() > 0) {
    ArrayList<TopicPartition> partitionArrayList = new ArrayList<>();
    for (int i = 0; i < options.getNumKafkaTopicPartitions(); ++i) {
      partitionArrayList.add(new TopicPartition(options.getKafkaTopic(), i));
    }
    read = read.withTopicPartitions(partitionArrayList);
  } else {
    read = read.withTopic(options.getKafkaTopic());
  }

  return p
      .apply(queryName + ".ReadKafkaEvents", read.withoutMetadata())
      .apply(queryName + ".KafkaToEvents", ParDo.of(BYTEARRAY_TO_EVENT));
}
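The BYTEARRAY_TO_EVENT DoFn used in the final apply is defined elsewhere in NexmarkLauncher and is not part of this excerpt. A minimal sketch of such a byte[]-to-Event conversion, assuming the events were serialized with the Nexmark Event coder (Event.CODER) and that org.apache.beam.sdk.util.CoderUtils is imported, could look like the following; the actual Nexmark implementation may differ.

// Hypothetical sketch: decode each Kafka value back into a Nexmark Event.
// Assumes values were encoded with Event.CODER; not the actual Nexmark DoFn.
private static final DoFn<KV<byte[], byte[]>, Event> BYTEARRAY_TO_EVENT =
    new DoFn<KV<byte[], byte[]>, Event>() {
      @ProcessElement
      public void processElement(ProcessContext c) throws CoderException {
        byte[] encodedEvent = c.element().getValue();
        c.output(CoderUtils.decodeFromByteArray(Event.CODER, encodedEvent));
      }
    };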
Use of org.apache.beam.sdk.io.kafka.KafkaIO in project beam by apache.
From the class SparkRunnerDebuggerTest, method debugStreamingPipeline.
@Test
public void debugStreamingPipeline() {
  TestSparkPipelineOptions options =
      PipelineOptionsFactory.create().as(TestSparkPipelineOptions.class);
  options.setForceStreaming(true);
  options.setRunner(SparkRunnerDebugger.class);
  Pipeline pipeline = Pipeline.create(options);

  KafkaIO.Read<String, String> read =
      KafkaIO.<String, String>read()
          .withBootstrapServers("mykafka:9092")
          .withTopics(Collections.singletonList("my_input_topic"))
          .withKeyDeserializer(StringDeserializer.class)
          .withValueDeserializer(StringDeserializer.class);
  KafkaIO.Write<String, String> write =
      KafkaIO.<String, String>write()
          .withBootstrapServers("myotherkafka:9092")
          .withTopic("my_output_topic")
          .withKeySerializer(StringSerializer.class)
          .withValueSerializer(StringSerializer.class);
  KvCoder<String, String> stringKvCoder = KvCoder.of(StringUtf8Coder.of(), StringUtf8Coder.of());

  pipeline
      .apply(read.withoutMetadata())
      .setCoder(stringKvCoder)
      .apply(Window.into(FixedWindows.of(Duration.standardSeconds(5))))
      .apply(ParDo.of(new SparkRunnerDebuggerTest.FormatKVFn()))
      .apply(Distinct.create())
      .apply(WithKeys.of(new SparkRunnerDebuggerTest.ArbitraryKeyFunction()))
      .apply(write);

  final String expectedPipeline =
      "KafkaUtils.createDirectStream(...)\n"
          + "_.map(new org.apache.beam.sdk.transforms.windowing.FixedWindows())\n"
          + "_.mapPartitions(new org.apache.beam.runners.spark.SparkRunnerDebuggerTest$FormatKVFn())\n"
          + "_.mapPartitions(new org.apache.beam.sdk.transforms.Contextful())\n"
          + "_.groupByKey()\n"
          + "_.map(new org.apache.beam.sdk.transforms.Combine$IterableCombineFn())\n"
          + "_.mapPartitions(new org.apache.beam.sdk.transforms.Distinct$3())\n"
          + "_.mapPartitions(new org.apache.beam.sdk.transforms.Contextful())\n"
          + "_.<org.apache.beam.sdk.io.kafka.AutoValue_KafkaIO_Write>";

  SparkRunnerDebugger.DebugSparkPipelineResult result =
      (SparkRunnerDebugger.DebugSparkPipelineResult) pipeline.run();
  assertThat("Debug pipeline did not equal expected", result.getDebugString(), Matchers.equalTo(expectedPipeline));
}
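FormatKVFn and ArbitraryKeyFunction are inner classes of SparkRunnerDebuggerTest that are not reproduced in this excerpt. Hypothetical sketches consistent with how they are used above, namely a DoFn that flattens each KV into a single string and a SerializableFunction that assigns a fixed key before the KafkaIO write, might look like this (the real test classes may differ):

// Hypothetical sketches only; not the actual test helpers.
private static class FormatKVFn extends DoFn<KV<String, String>, String> {
  @ProcessElement
  public void processElement(ProcessContext c) {
    c.output(c.element().getKey() + "," + c.element().getValue());
  }
}

private static class ArbitraryKeyFunction implements SerializableFunction<String, String> {
  @Override
  public String apply(String input) {
    return "arbitrary-key";
  }
}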