Use of org.apache.kafka.streams.kstream.KStream in project kafka-streams-examples by confluentinc.
Class EventDeduplicationLambdaIntegrationTest, method shouldRemoveDuplicatesFromTheInput.
@Test
public void shouldRemoveDuplicatesFromTheInput() throws Exception {
// e.g. "4ff3cb44-abcb-46e3-8f9a-afb7cc74fbb8"
String firstId = UUID.randomUUID().toString();
String secondId = UUID.randomUUID().toString();
String thirdId = UUID.randomUUID().toString();
List<String> inputValues = Arrays.asList(firstId, secondId, firstId, firstId, secondId, thirdId, thirdId, firstId, secondId);
List<String> expectedValues = Arrays.asList(firstId, secondId, thirdId);
//
// Step 1: Configure and start the processor topology.
//
StreamsBuilder builder = new StreamsBuilder();
Properties streamsConfiguration = new Properties();
streamsConfiguration.put(StreamsConfig.APPLICATION_ID_CONFIG, "deduplication-lambda-integration-test");
streamsConfiguration.put(StreamsConfig.BOOTSTRAP_SERVERS_CONFIG, CLUSTER.bootstrapServers());
streamsConfiguration.put(StreamsConfig.DEFAULT_KEY_SERDE_CLASS_CONFIG, Serdes.ByteArray().getClass().getName());
streamsConfiguration.put(StreamsConfig.DEFAULT_VALUE_SERDE_CLASS_CONFIG, Serdes.String().getClass().getName());
// The commit interval for flushing records to state stores and downstream must be lower than
// this integration test's timeout (30 secs) to ensure we observe the expected processing results.
streamsConfiguration.put(StreamsConfig.COMMIT_INTERVAL_MS_CONFIG, TimeUnit.SECONDS.toMillis(10));
streamsConfiguration.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "earliest");
// Use a temporary directory for storing state, which will be automatically removed after the test.
streamsConfiguration.put(StreamsConfig.STATE_DIR_CONFIG, TestUtils.tempDirectory().getAbsolutePath());
// How long we "remember" an event. During this time, any incoming duplicates of the event
// will be, well, dropped, thereby de-duplicating the input data.
//
// The actual value depends on your use case. To reduce memory and disk usage, you could
// decrease the size to purge old windows more frequently at the cost of potentially missing out
// on de-duplicating late-arriving records.
long maintainDurationPerEventInMs = TimeUnit.MINUTES.toMillis(10);
// The number of segments has no impact on "correctness".
// Using more segments implies larger overhead but allows for more fine-grained record expiration.
// Note: the specified retention time is a _minimum_ time span, not a strict upper bound.
int numberOfSegments = 3;
// The retention period must be at least the window size. For this use case we don't need a longer
// retention period, so we simply use the window size as the retention time.
long retentionPeriod = maintainDurationPerEventInMs;
StoreBuilder<WindowStore<String, Long>> dedupStoreBuilder = Stores.windowStoreBuilder(
    Stores.persistentWindowStore(storeName, retentionPeriod, numberOfSegments, maintainDurationPerEventInMs, false),
    Serdes.String(),
    Serdes.Long());
builder.addStateStore(dedupStoreBuilder);
KStream<byte[], String> input = builder.stream(inputTopic);
KStream<byte[], String> deduplicated = input.transform(
    // In this example the record value itself serves as the event ID by which we de-duplicate;
    // adapt the ID-extractor function as needed if your records look different.
    () -> new DeduplicationTransformer<>(maintainDurationPerEventInMs, (key, value) -> value), storeName);
deduplicated.to(outputTopic);
KafkaStreams streams = new KafkaStreams(builder.build(), streamsConfiguration);
streams.start();
//
// Step 2: Produce some input data to the input topic.
//
Properties producerConfig = new Properties();
producerConfig.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, CLUSTER.bootstrapServers());
producerConfig.put(ProducerConfig.ACKS_CONFIG, "all");
producerConfig.put(ProducerConfig.RETRIES_CONFIG, 0);
producerConfig.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, ByteArraySerializer.class);
producerConfig.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, StringSerializer.class);
IntegrationTestUtils.produceValuesSynchronously(inputTopic, inputValues, producerConfig);
//
// Step 3: Verify the application's output data.
//
Properties consumerConfig = new Properties();
consumerConfig.put(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, CLUSTER.bootstrapServers());
consumerConfig.put(ConsumerConfig.GROUP_ID_CONFIG, "deduplication-integration-test-standard-consumer");
consumerConfig.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "earliest");
consumerConfig.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, ByteArrayDeserializer.class);
consumerConfig.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class);
List<String> actualValues = IntegrationTestUtils.waitUntilMinValuesRecordsReceived(consumerConfig, outputTopic, expectedValues.size());
streams.close();
assertThat(actualValues).containsExactlyElementsOf(expectedValues);
}
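The DeduplicationTransformer used in input.transform(...) above is defined elsewhere in the project and not shown in this snippet. The following is a rough sketch, not the project's actual implementation, of how such a transformer can be built on top of the window store registered via builder.addStateStore(dedupStoreBuilder); the hard-coded store name "eventId-store", the class layout, and the idExtractor parameter are assumptions for illustration:

import org.apache.kafka.streams.KeyValue;
import org.apache.kafka.streams.kstream.KeyValueMapper;
import org.apache.kafka.streams.kstream.Transformer;
import org.apache.kafka.streams.processor.ProcessorContext;
import org.apache.kafka.streams.state.WindowStore;
import org.apache.kafka.streams.state.WindowStoreIterator;

// Illustrative sketch only: store name and field names are assumptions, not the project's exact code.
class DeduplicationTransformer<K, V, E> implements Transformer<K, V, KeyValue<K, V>> {

    private final long leftDurationMs;
    private final long rightDurationMs;
    private final KeyValueMapper<K, V, E> idExtractor;
    private ProcessorContext context;
    private WindowStore<E, Long> eventIdStore;

    DeduplicationTransformer(final long maintainDurationMs, final KeyValueMapper<K, V, E> idExtractor) {
        // Look half the maintain duration into the past and half into the future of the record timestamp.
        this.leftDurationMs = maintainDurationMs / 2;
        this.rightDurationMs = maintainDurationMs - leftDurationMs;
        this.idExtractor = idExtractor;
    }

    @Override
    @SuppressWarnings("unchecked")
    public void init(final ProcessorContext context) {
        this.context = context;
        this.eventIdStore = (WindowStore<E, Long>) context.getStateStore("eventId-store");
    }

    @Override
    public KeyValue<K, V> transform(final K key, final V value) {
        final E eventId = idExtractor.apply(key, value);
        if (eventId == null) {
            return KeyValue.pair(key, value);
        }
        final long eventTime = context.timestamp();
        // Has this event ID already been seen within the surrounding time window?
        try (final WindowStoreIterator<Long> previous =
                 eventIdStore.fetch(eventId, eventTime - leftDurationMs, eventTime + rightDurationMs)) {
            final boolean isDuplicate = previous.hasNext();
            // Remember (or refresh) this event ID at the record's timestamp.
            eventIdStore.put(eventId, eventTime, eventTime);
            // Returning null drops the record, which is what removes duplicates from the output stream.
            return isDuplicate ? null : KeyValue.pair(key, value);
        }
    }

    @Override
    public void close() {
        // Nothing to do; the state store is managed by Kafka Streams.
    }
}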
Use of org.apache.kafka.streams.kstream.KStream in project kafka-streams-examples by confluentinc.
Class HandlingCorruptedInputRecordsIntegrationTest, method shouldIgnoreCorruptInputRecords.
@Test
public void shouldIgnoreCorruptInputRecords() throws Exception {
List<Long> inputValues = Arrays.asList(1L, 2L, 3L);
List<Long> expectedValues = inputValues.stream().map(x -> 2 * x).collect(Collectors.toList());
//
// Step 1: Configure and start the processor topology.
//
StreamsBuilder builder = new StreamsBuilder();
Properties streamsConfiguration = new Properties();
streamsConfiguration.put(StreamsConfig.APPLICATION_ID_CONFIG, "failure-handling-integration-test");
streamsConfiguration.put(StreamsConfig.BOOTSTRAP_SERVERS_CONFIG, CLUSTER.bootstrapServers());
streamsConfiguration.put(StreamsConfig.DEFAULT_KEY_SERDE_CLASS_CONFIG, Serdes.ByteArray().getClass().getName());
streamsConfiguration.put(StreamsConfig.DEFAULT_VALUE_SERDE_CLASS_CONFIG, Serdes.ByteArray().getClass().getName());
streamsConfiguration.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "earliest");
Serde<String> stringSerde = Serdes.String();
Serde<Long> longSerde = Serdes.Long();
KStream<byte[], byte[]> input = builder.stream(inputTopic);
// Note how the returned stream is of type `KStream<String, Long>`.
KStream<String, Long> doubled = input.flatMap((k, v) -> {
try {
// Attempt deserialization
String key = stringSerde.deserializer().deserialize("input-topic", k);
long value = longSerde.deserializer().deserialize("input-topic", v);
// The record deserialized successfully, so process it here (double the value) rather than
// paying the deserialization cost solely for validity checking.
return Collections.singletonList(KeyValue.pair(key, 2 * value));
} catch (SerializationException e) {
// Ignore/skip the corrupted record by catching the exception.
// Optionally, we can log the fact that we did so:
System.err.println("Could not deserialize record: " + e.getMessage());
}
return Collections.emptyList();
});
// Write the processing results (which were generated from valid records only) to Kafka.
doubled.to(outputTopic, Produced.with(stringSerde, longSerde));
KafkaStreams streams = new KafkaStreams(builder.build(), streamsConfiguration);
streams.start();
//
// Step 2: Produce some corrupt input data to the input topic.
//
Properties producerConfigForCorruptRecords = new Properties();
producerConfigForCorruptRecords.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, CLUSTER.bootstrapServers());
producerConfigForCorruptRecords.put(ProducerConfig.ACKS_CONFIG, "all");
producerConfigForCorruptRecords.put(ProducerConfig.RETRIES_CONFIG, 0);
producerConfigForCorruptRecords.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, ByteArraySerializer.class);
producerConfigForCorruptRecords.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, StringSerializer.class);
IntegrationTestUtils.produceValuesSynchronously(inputTopic, Collections.singletonList("corrupt"), producerConfigForCorruptRecords);
//
// Step 3: Produce some (valid) input data to the input topic.
//
Properties producerConfig = new Properties();
producerConfig.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, CLUSTER.bootstrapServers());
producerConfig.put(ProducerConfig.ACKS_CONFIG, "all");
producerConfig.put(ProducerConfig.RETRIES_CONFIG, 0);
producerConfig.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, ByteArraySerializer.class);
producerConfig.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, LongSerializer.class);
IntegrationTestUtils.produceValuesSynchronously(inputTopic, inputValues, producerConfig);
//
// Step 4: Verify the application's output data.
//
Properties consumerConfig = new Properties();
consumerConfig.put(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, CLUSTER.bootstrapServers());
consumerConfig.put(ConsumerConfig.GROUP_ID_CONFIG, "map-function-lambda-integration-test-standard-consumer");
consumerConfig.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "earliest");
consumerConfig.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, ByteArrayDeserializer.class);
consumerConfig.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, LongDeserializer.class);
List<Long> actualValues = IntegrationTestUtils.waitUntilMinValuesRecordsReceived(consumerConfig, outputTopic, expectedValues.size());
streams.close();
assertThat(actualValues).isEqualTo(expectedValues);
}
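Catching the SerializationException manually, as above, keeps the decision inside the topology. Kafka Streams also offers a configuration-level alternative: a DeserializationExceptionHandler that handles records the configured serdes cannot deserialize. A minimal sketch, assuming the built-in log-and-skip handler is acceptable for the use case and that the stream would then be consumed with Consumed.with(stringSerde, longSerde) instead of raw byte arrays:

// Sketch: an additional entry for the streamsConfiguration above. With this handler in place,
// records that cannot be deserialized are logged and skipped before they reach the topology,
// so the manual try/catch around the deserializers becomes unnecessary.
// Requires: import org.apache.kafka.streams.errors.LogAndContinueExceptionHandler;
streamsConfiguration.put(StreamsConfig.DEFAULT_DESERIALIZATION_EXCEPTION_HANDLER_CLASS_CONFIG,
    LogAndContinueExceptionHandler.class);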
Use of org.apache.kafka.streams.kstream.KStream in project kafka by apache.
Class KStreamMapTest, method testMap.
@Test
public void testMap() {
final StreamsBuilder builder = new StreamsBuilder();
final String topicName = "topic";
final int[] expectedKeys = new int[] { 0, 1, 2, 3 };
final MockApiProcessorSupplier<String, Integer, Void, Void> supplier = new MockApiProcessorSupplier<>();
final KStream<Integer, String> stream = builder.stream(topicName, Consumed.with(Serdes.Integer(), Serdes.String()));
stream.map((key, value) -> KeyValue.pair(value, key)).process(supplier);
try (final TopologyTestDriver driver = new TopologyTestDriver(builder.build(), props)) {
for (final int expectedKey : expectedKeys) {
final TestInputTopic<Integer, String> inputTopic = driver.createInputTopic(topicName, new IntegerSerializer(), new StringSerializer(), Instant.ofEpochMilli(0L), Duration.ZERO);
inputTopic.pipeInput(expectedKey, "V" + expectedKey, 10L - expectedKey);
}
}
final KeyValueTimestamp[] expected = new KeyValueTimestamp[] { new KeyValueTimestamp<>("V0", 0, 10), new KeyValueTimestamp<>("V1", 1, 9), new KeyValueTimestamp<>("V2", 2, 8), new KeyValueTimestamp<>("V3", 3, 7) };
assertEquals(4, supplier.theCapturedProcessor().processed().size());
for (int i = 0; i < expected.length; i++) {
assertEquals(expected[i], supplier.theCapturedProcessor().processed().get(i));
}
}
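MockApiProcessorSupplier is an internal Kafka test utility, so outside the Kafka code base the same key/value swap is typically verified by reading from a TestOutputTopic instead. Below is a self-contained sketch under that assumption; the topic names "topic" and "output" and the class name are illustrative:

import java.util.List;
import java.util.Properties;
import org.apache.kafka.common.serialization.IntegerDeserializer;
import org.apache.kafka.common.serialization.IntegerSerializer;
import org.apache.kafka.common.serialization.Serdes;
import org.apache.kafka.common.serialization.StringDeserializer;
import org.apache.kafka.common.serialization.StringSerializer;
import org.apache.kafka.streams.KeyValue;
import org.apache.kafka.streams.StreamsBuilder;
import org.apache.kafka.streams.StreamsConfig;
import org.apache.kafka.streams.TestInputTopic;
import org.apache.kafka.streams.TestOutputTopic;
import org.apache.kafka.streams.TopologyTestDriver;
import org.apache.kafka.streams.kstream.Consumed;
import org.apache.kafka.streams.kstream.KStream;
import org.apache.kafka.streams.kstream.Produced;

public class KStreamMapOutputTopicExample {

    public static void main(final String[] args) {
        final StreamsBuilder builder = new StreamsBuilder();
        final KStream<Integer, String> stream =
            builder.stream("topic", Consumed.with(Serdes.Integer(), Serdes.String()));
        // Swap key and value, as in the test above, and write to an assumed output topic.
        stream.map((key, value) -> KeyValue.pair(value, key))
              .to("output", Produced.with(Serdes.String(), Serdes.Integer()));

        final Properties props = new Properties();
        props.put(StreamsConfig.APPLICATION_ID_CONFIG, "kstream-map-output-topic-example");
        props.put(StreamsConfig.BOOTSTRAP_SERVERS_CONFIG, "dummy:1234");

        try (final TopologyTestDriver driver = new TopologyTestDriver(builder.build(), props)) {
            final TestInputTopic<Integer, String> input =
                driver.createInputTopic("topic", new IntegerSerializer(), new StringSerializer());
            final TestOutputTopic<String, Integer> output =
                driver.createOutputTopic("output", new StringDeserializer(), new IntegerDeserializer());
            input.pipeInput(1, "V1");
            // The key/value swap turns (1, "V1") into ("V1", 1).
            final List<KeyValue<String, Integer>> records = output.readKeyValuesToList();
            System.out.println(records);
        }
    }
}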
Use of org.apache.kafka.streams.kstream.KStream in project kafka by apache.
Class KStreamMapValuesTest, method testMapValuesWithKeys.
@Test
public void testMapValuesWithKeys() {
final StreamsBuilder builder = new StreamsBuilder();
final ValueMapperWithKey<Integer, CharSequence, Integer> mapper = (readOnlyKey, value) -> value.length() + readOnlyKey;
final int[] expectedKeys = { 1, 10, 100, 1000 };
final KStream<Integer, String> stream = builder.stream(topicName, Consumed.with(Serdes.Integer(), Serdes.String()));
stream.mapValues(mapper).process(supplier);
try (final TopologyTestDriver driver = new TopologyTestDriver(builder.build(), props)) {
final TestInputTopic<Integer, String> inputTopic = driver.createInputTopic(topicName, new IntegerSerializer(), new StringSerializer());
for (final int expectedKey : expectedKeys) {
inputTopic.pipeInput(expectedKey, Integer.toString(expectedKey), expectedKey / 2L);
}
}
final KeyValueTimestamp[] expected = { new KeyValueTimestamp<>(1, 2, 0), new KeyValueTimestamp<>(10, 12, 5), new KeyValueTimestamp<>(100, 103, 50), new KeyValueTimestamp<>(1000, 1004, 500) };
assertArrayEquals(expected, supplier.theCapturedProcessor().processed().toArray());
}
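For comparison with the ValueMapperWithKey used above, mapValues also accepts a plain ValueMapper when the key is not needed; in both forms the key is left untouched, so the stream is not flagged for repartitioning. A small sketch with an illustrative topic name:

import org.apache.kafka.common.serialization.Serdes;
import org.apache.kafka.streams.StreamsBuilder;
import org.apache.kafka.streams.kstream.Consumed;
import org.apache.kafka.streams.kstream.KStream;
import org.apache.kafka.streams.kstream.ValueMapper;
import org.apache.kafka.streams.kstream.ValueMapperWithKey;

public class MapValuesFlavorsExample {

    public static void main(final String[] args) {
        final StreamsBuilder builder = new StreamsBuilder();
        final KStream<Integer, String> stream =
            builder.stream("words", Consumed.with(Serdes.Integer(), Serdes.String()));

        // Value-only mapper: the key is not visible to the mapping function.
        final ValueMapper<String, Integer> length = value -> value.length();
        final KStream<Integer, Integer> lengths = stream.mapValues(length);

        // Value mapper with read-only access to the key, as used in the test above.
        final ValueMapperWithKey<Integer, String, Integer> lengthPlusKey =
            (readOnlyKey, value) -> value.length() + readOnlyKey;
        final KStream<Integer, Integer> lengthsPlusKeys = stream.mapValues(lengthPlusKey);

        System.out.println(builder.build().describe());
    }
}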
Use of org.apache.kafka.streams.kstream.KStream in project kafka by apache.
Class CogroupedKStreamImplTest, method shouldInsertRepartitionsTopicForCogroupsUsedTwice.
@Test
public void shouldInsertRepartitionsTopicForCogroupsUsedTwice() {
final StreamsBuilder builder = new StreamsBuilder();
final Properties properties = new Properties();
final KStream<String, String> stream1 = builder.stream("one", stringConsumed);
final KGroupedStream<String, String> groupedOne = stream1.map((k, v) -> new KeyValue<>(v, k)).groupByKey(Grouped.as("foo"));
final CogroupedKStream<String, String> one = groupedOne.cogroup(STRING_AGGREGATOR);
one.aggregate(STRING_INITIALIZER);
one.aggregate(STRING_INITIALIZER);
final String topologyDescription = builder.build(properties).describe().toString();
assertThat(topologyDescription, equalTo(
    "Topologies:\n"
        + " Sub-topology: 0\n"
        + " Source: KSTREAM-SOURCE-0000000000 (topics: [one])\n"
        + " --> KSTREAM-MAP-0000000001\n"
        + " Processor: KSTREAM-MAP-0000000001 (stores: [])\n"
        + " --> foo-repartition-filter\n"
        + " <-- KSTREAM-SOURCE-0000000000\n"
        + " Processor: foo-repartition-filter (stores: [])\n"
        + " --> foo-repartition-sink\n"
        + " <-- KSTREAM-MAP-0000000001\n"
        + " Sink: foo-repartition-sink (topic: foo-repartition)\n"
        + " <-- foo-repartition-filter\n\n"
        + " Sub-topology: 1\n"
        + " Source: foo-repartition-source (topics: [foo-repartition])\n"
        + " --> COGROUPKSTREAM-AGGREGATE-0000000006, COGROUPKSTREAM-AGGREGATE-0000000012\n"
        + " Processor: COGROUPKSTREAM-AGGREGATE-0000000006 (stores: [COGROUPKSTREAM-AGGREGATE-STATE-STORE-0000000002])\n"
        + " --> COGROUPKSTREAM-MERGE-0000000007\n"
        + " <-- foo-repartition-source\n"
        + " Processor: COGROUPKSTREAM-AGGREGATE-0000000012 (stores: [COGROUPKSTREAM-AGGREGATE-STATE-STORE-0000000008])\n"
        + " --> COGROUPKSTREAM-MERGE-0000000013\n"
        + " <-- foo-repartition-source\n"
        + " Processor: COGROUPKSTREAM-MERGE-0000000007 (stores: [])\n"
        + " --> none\n"
        + " <-- COGROUPKSTREAM-AGGREGATE-0000000006\n"
        + " Processor: COGROUPKSTREAM-MERGE-0000000013 (stores: [])\n"
        + " --> none\n"
        + " <-- COGROUPKSTREAM-AGGREGATE-0000000012\n\n"));
}
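The foo-repartition sub-topology in the expected description exists because map((k, v) -> new KeyValue<>(v, k)) may change the key before groupByKey. As a contrasting, hedged sketch: grouping on the unchanged key inserts no repartition topic even though the cogroup is aggregated twice (the initializer and aggregator below are illustrative stand-ins for the test's STRING_INITIALIZER and STRING_AGGREGATOR constants):

import org.apache.kafka.common.serialization.Serdes;
import org.apache.kafka.streams.StreamsBuilder;
import org.apache.kafka.streams.kstream.Aggregator;
import org.apache.kafka.streams.kstream.CogroupedKStream;
import org.apache.kafka.streams.kstream.Consumed;
import org.apache.kafka.streams.kstream.Grouped;
import org.apache.kafka.streams.kstream.Initializer;
import org.apache.kafka.streams.kstream.KGroupedStream;
import org.apache.kafka.streams.kstream.KStream;

public class CogroupWithoutRepartitionExample {

    public static void main(final String[] args) {
        final StreamsBuilder builder = new StreamsBuilder();
        // Stand-ins for the test's STRING_INITIALIZER and STRING_AGGREGATOR constants.
        final Initializer<String> initializer = () -> "";
        final Aggregator<String, String, String> aggregator = (key, value, aggregate) -> aggregate + value;

        final KStream<String, String> stream =
            builder.stream("one", Consumed.with(Serdes.String(), Serdes.String()));
        // No key-changing operation before groupByKey, so no repartition topic is needed
        // even though the cogrouped stream is aggregated twice.
        final KGroupedStream<String, String> grouped = stream.groupByKey(Grouped.as("foo"));
        final CogroupedKStream<String, String> cogrouped = grouped.cogroup(aggregator);
        cogrouped.aggregate(initializer);
        cogrouped.aggregate(initializer);

        System.out.println(builder.build().describe());
    }
}

Printing the described topology for this variant should show a single sub-topology with two aggregate state stores and no repartition topic.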