Use of org.apache.kafka.streams.state.Stores in project kafka-streams-examples by confluentinc.
The class EventDeduplicationLambdaIntegrationTest, method shouldRemoveDuplicatesFromTheInput.
@Test
public void shouldRemoveDuplicatesFromTheInput() throws Exception {
// e.g. "4ff3cb44-abcb-46e3-8f9a-afb7cc74fbb8"
String firstId = UUID.randomUUID().toString();
String secondId = UUID.randomUUID().toString();
String thirdId = UUID.randomUUID().toString();
List<String> inputValues = Arrays.asList(firstId, secondId, firstId, firstId, secondId, thirdId, thirdId, firstId, secondId);
List<String> expectedValues = Arrays.asList(firstId, secondId, thirdId);
//
// Step 1: Configure and start the processor topology.
//
StreamsBuilder builder = new StreamsBuilder();
Properties streamsConfiguration = new Properties();
streamsConfiguration.put(StreamsConfig.APPLICATION_ID_CONFIG, "deduplication-lambda-integration-test");
streamsConfiguration.put(StreamsConfig.BOOTSTRAP_SERVERS_CONFIG, CLUSTER.bootstrapServers());
streamsConfiguration.put(StreamsConfig.DEFAULT_KEY_SERDE_CLASS_CONFIG, Serdes.ByteArray().getClass().getName());
streamsConfiguration.put(StreamsConfig.DEFAULT_VALUE_SERDE_CLASS_CONFIG, Serdes.String().getClass().getName());
// The commit interval for flushing records to state stores and downstream must be lower than
// this integration test's timeout (30 secs) to ensure we observe the expected processing results.
streamsConfiguration.put(StreamsConfig.COMMIT_INTERVAL_MS_CONFIG, TimeUnit.SECONDS.toMillis(10));
streamsConfiguration.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "earliest");
// Use a temporary directory for storing state, which will be automatically removed after the test.
streamsConfiguration.put(StreamsConfig.STATE_DIR_CONFIG, TestUtils.tempDirectory().getAbsolutePath());
// How long we "remember" an event. During this time, any incoming duplicates of the event
// will be, well, dropped, thereby de-duplicating the input data.
//
// The actual value depends on your use case. To reduce memory and disk usage, you could
// decrease the size to purge old windows more frequently at the cost of potentially missing out
// on de-duplicating late-arriving records.
long maintainDurationPerEventInMs = TimeUnit.MINUTES.toMillis(10);
// The number of segments has no impact on "correctness".
// Using more segments implies larger overhead but allows for more fine-grained record expiration.
// Note: the specified retention time is a _minimum_ time span, not a strict upper bound.
int numberOfSegments = 3;
// retention period must be at least window size -- for this use case, we don't need a longer retention period
// and thus just use the window size as retention time
long retentionPeriod = maintainDurationPerEventInMs;
StoreBuilder<WindowStore<String, Long>> dedupStoreBuilder = Stores.windowStoreBuilder(
    Stores.persistentWindowStore(storeName, retentionPeriod, numberOfSegments, maintainDurationPerEventInMs, false),
    Serdes.String(),
    Serdes.Long());
builder.addStateStore(dedupStoreBuilder);
KStream<byte[], String> input = builder.stream(inputTopic);
KStream<byte[], String> deduplicated = input.transform(
    // In this example, the record value itself serves as the unique event ID used for
    // de-duplication; adapt this ID-extractor function as needed for your own records.
    () -> new DeduplicationTransformer<>(maintainDurationPerEventInMs, (key, value) -> value),
    storeName);
deduplicated.to(outputTopic);
KafkaStreams streams = new KafkaStreams(builder.build(), streamsConfiguration);
streams.start();
//
// Step 2: Produce some input data to the input topic.
//
Properties producerConfig = new Properties();
producerConfig.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, CLUSTER.bootstrapServers());
producerConfig.put(ProducerConfig.ACKS_CONFIG, "all");
producerConfig.put(ProducerConfig.RETRIES_CONFIG, 0);
producerConfig.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, ByteArraySerializer.class);
producerConfig.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, StringSerializer.class);
IntegrationTestUtils.produceValuesSynchronously(inputTopic, inputValues, producerConfig);
//
// Step 3: Verify the application's output data.
//
Properties consumerConfig = new Properties();
consumerConfig.put(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, CLUSTER.bootstrapServers());
consumerConfig.put(ConsumerConfig.GROUP_ID_CONFIG, "deduplication-integration-test-standard-consumer");
consumerConfig.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "earliest");
consumerConfig.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, ByteArrayDeserializer.class);
consumerConfig.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class);
List<String> actualValues = IntegrationTestUtils.waitUntilMinValuesRecordsReceived(consumerConfig, outputTopic, expectedValues.size());
streams.close();
assertThat(actualValues).containsExactlyElementsOf(expectedValues);
}
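The DeduplicationTransformer used above, together with the storeName, inputTopic, and outputTopic identifiers, are members of the test class and are not shown in this snippet. Below is a minimal sketch of what such a window-store-backed transformer could look like, using the older Transformer API that transform() builds on; the class body, the STORE_NAME constant, and the exact field layout are illustrative assumptions, not the project's actual implementation.

import org.apache.kafka.streams.KeyValue;
import org.apache.kafka.streams.kstream.KeyValueMapper;
import org.apache.kafka.streams.kstream.Transformer;
import org.apache.kafka.streams.processor.ProcessorContext;
import org.apache.kafka.streams.state.WindowStore;
import org.apache.kafka.streams.state.WindowStoreIterator;

// Sketch of a de-duplication transformer: a record is dropped if an event with the same ID
// was already seen within the configured "remember" window; otherwise it is forwarded and
// its ID is recorded in the window store.
class DeduplicationTransformer<K, V, E> implements Transformer<K, V, KeyValue<K, V>> {

  // Hypothetical constant; in the test above the store name is a field of the enclosing test
  // class and must match the name used in builder.addStateStore(...) and transform(...).
  private static final String STORE_NAME = "eventId-store";

  private final long leftDurationMs;
  private final long rightDurationMs;
  private final KeyValueMapper<K, V, E> idExtractor;

  private ProcessorContext context;
  private WindowStore<E, Long> eventIdStore;

  DeduplicationTransformer(final long maintainDurationPerEventInMs, final KeyValueMapper<K, V, E> idExtractor) {
    // Look half of the maintain duration into the past and half into the future of the
    // record's timestamp when checking for duplicates.
    this.leftDurationMs = maintainDurationPerEventInMs / 2;
    this.rightDurationMs = maintainDurationPerEventInMs - leftDurationMs;
    this.idExtractor = idExtractor;
  }

  @Override
  @SuppressWarnings("unchecked")
  public void init(final ProcessorContext context) {
    this.context = context;
    this.eventIdStore = (WindowStore<E, Long>) context.getStateStore(STORE_NAME);
  }

  @Override
  public KeyValue<K, V> transform(final K key, final V value) {
    final E eventId = idExtractor.apply(key, value);
    if (eventId == null) {
      // No ID could be extracted; forward the record unchanged.
      return KeyValue.pair(key, value);
    }
    final long eventTime = context.timestamp();
    final boolean isDuplicate;
    try (final WindowStoreIterator<Long> timeIterator =
             eventIdStore.fetch(eventId, eventTime - leftDurationMs, eventTime + rightDurationMs)) {
      isDuplicate = timeIterator.hasNext();
    }
    // Remember (or refresh) the event ID so later duplicates within the window are dropped.
    eventIdStore.put(eventId, eventTime, eventTime);
    // Returning null drops the record.
    return isDuplicate ? null : KeyValue.pair(key, value);
  }

  @Override
  public void close() {
    // Nothing to clean up; the state store is managed by Kafka Streams.
  }
}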
Use of org.apache.kafka.streams.state.Stores in project kafka by apache.
The class KStreamImplTest, method shouldSupportKeyChangeKTableFromKStream.
@Test
public void shouldSupportKeyChangeKTableFromKStream() {
final Consumed<String, String> consumed = Consumed.with(Serdes.String(), Serdes.String());
final StreamsBuilder builder = new StreamsBuilder();
final String input = "input";
final String output = "output";
builder.stream(input, consumed)
    .map((key, value) -> new KeyValue<>(key.charAt(0) - 'A', value))
    .toTable(Materialized.with(Serdes.Integer(), null))
    .toStream()
    .to(output);
final Topology topology = builder.build();
final String topologyDescription = topology.describe().toString();
assertThat(topologyDescription, equalTo("Topologies:\n" + " Sub-topology: 0\n" + " Source: KSTREAM-SOURCE-0000000000 (topics: [input])\n" + " --> KSTREAM-MAP-0000000001\n" + " Processor: KSTREAM-MAP-0000000001 (stores: [])\n" + " --> KSTREAM-FILTER-0000000005\n" + " <-- KSTREAM-SOURCE-0000000000\n" + " Processor: KSTREAM-FILTER-0000000005 (stores: [])\n" + " --> KSTREAM-SINK-0000000004\n" + " <-- KSTREAM-MAP-0000000001\n" + " Sink: KSTREAM-SINK-0000000004 (topic: KSTREAM-TOTABLE-0000000002-repartition)\n" + " <-- KSTREAM-FILTER-0000000005\n" + "\n" + " Sub-topology: 1\n" + " Source: KSTREAM-SOURCE-0000000006 (topics: [KSTREAM-TOTABLE-0000000002-repartition])\n" + " --> KSTREAM-TOTABLE-0000000002\n" + " Processor: KSTREAM-TOTABLE-0000000002 (stores: [])\n" + " --> KTABLE-TOSTREAM-0000000007\n" + " <-- KSTREAM-SOURCE-0000000006\n" + " Processor: KTABLE-TOSTREAM-0000000007 (stores: [])\n" + " --> KSTREAM-SINK-0000000008\n" + " <-- KSTREAM-TOTABLE-0000000002\n" + " Sink: KSTREAM-SINK-0000000008 (topic: output)\n" + " <-- KTABLE-TOSTREAM-0000000007\n\n"));
try (final TopologyTestDriver driver = new TopologyTestDriver(topology, props)) {
final TestInputTopic<String, String> inputTopic = driver.createInputTopic(input, Serdes.String().serializer(), Serdes.String().serializer());
final TestOutputTopic<Integer, String> outputTopic = driver.createOutputTopic(output, Serdes.Integer().deserializer(), Serdes.String().deserializer());
inputTopic.pipeInput("A", "01", 5L);
inputTopic.pipeInput("B", "02", 100L);
inputTopic.pipeInput("C", "03", 0L);
inputTopic.pipeInput("D", "04", 0L);
inputTopic.pipeInput("A", "05", 10L);
inputTopic.pipeInput("A", "06", 8L);
final List<TestRecord<Integer, String>> outputExpectRecords = new ArrayList<>();
outputExpectRecords.add(new TestRecord<>(0, "01", Instant.ofEpochMilli(5L)));
outputExpectRecords.add(new TestRecord<>(1, "02", Instant.ofEpochMilli(100L)));
outputExpectRecords.add(new TestRecord<>(2, "03", Instant.ofEpochMilli(0L)));
outputExpectRecords.add(new TestRecord<>(3, "04", Instant.ofEpochMilli(0L)));
outputExpectRecords.add(new TestRecord<>(0, "05", Instant.ofEpochMilli(10L)));
outputExpectRecords.add(new TestRecord<>(0, "06", Instant.ofEpochMilli(8L)));
assertEquals(outputExpectRecords, outputTopic.readRecordsToList());
}
}
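The KSTREAM-TOTABLE-0000000002-repartition topic in the expected topology appears because map() changes the key, so toTable() must redistribute records before materializing them. For comparison, a minimal sketch (method name and topic names are placeholders) of a value-only transformation, where toTable() should not need a repartition sub-topology:

import org.apache.kafka.common.serialization.Serdes;
import org.apache.kafka.streams.StreamsBuilder;
import org.apache.kafka.streams.Topology;
import org.apache.kafka.streams.kstream.Consumed;
import org.apache.kafka.streams.kstream.Materialized;

// mapValues() leaves the key untouched, so the stream can be materialized as a table
// without first writing to a repartition topic.
private static Topology valueOnlyToTableTopology() {
    final StreamsBuilder builder = new StreamsBuilder();
    builder.stream("input", Consumed.with(Serdes.String(), Serdes.String()))
        .mapValues(value -> value.toUpperCase())
        .toTable(Materialized.with(Serdes.String(), Serdes.String()))
        .toStream()
        .to("output");
    return builder.build();
}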
Use of org.apache.kafka.streams.state.Stores in project kafka by apache.
The class RocksDBMetricsIntegrationTest, method builderForStateStores.
private StreamsBuilder builderForStateStores() {
final StreamsBuilder builder = new StreamsBuilder();
// create two state stores, one non-segmented and one segmented
builder.table(STREAM_INPUT_ONE,
        Materialized.as(Stores.persistentKeyValueStore(MY_STORE_PERSISTENT_KEY_VALUE)).withCachingEnabled())
    .toStream()
    .to(STREAM_OUTPUT_ONE);
builder.stream(STREAM_INPUT_TWO, Consumed.with(Serdes.Integer(), Serdes.String()))
    .groupByKey()
    .windowedBy(TimeWindows.of(WINDOW_SIZE).grace(Duration.ZERO))
    .aggregate(
        () -> 0L,
        (aggKey, newValue, aggValue) -> aggValue,
        Materialized.<Integer, Long, WindowStore<Bytes, byte[]>>as("time-windowed-aggregated-stream-store")
            .withValueSerde(Serdes.Long())
            .withRetention(WINDOW_SIZE))
    .toStream()
    .map((key, value) -> KeyValue.pair(value, value))
    .to(STREAM_OUTPUT_TWO, Produced.with(Serdes.Long(), Serdes.Long()));
return builder;
}
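builderForStateStores() sets up one plain key-value store and one windowed (segmented) store so both store types emit RocksDB metrics. The statistics-based RocksDB metrics are only recorded when the Streams metrics recording level is DEBUG; a minimal config sketch (application ID and bootstrap servers are placeholders):

import java.util.Properties;
import org.apache.kafka.streams.StreamsConfig;

final Properties props = new Properties();
props.put(StreamsConfig.APPLICATION_ID_CONFIG, "rocksdb-metrics-example");  // placeholder
props.put(StreamsConfig.BOOTSTRAP_SERVERS_CONFIG, "localhost:9092");        // placeholder
// RocksDB statistics-based metrics (e.g. bytes-written-rate) are exposed only at DEBUG level.
props.put(StreamsConfig.METRICS_RECORDING_LEVEL_CONFIG, "DEBUG");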
Use of org.apache.kafka.streams.state.Stores in project kafka by apache.
The class KStreamImplTest, method shouldSupportForeignKeyTableTableJoinWithKTableFromKStream.
@Test
public void shouldSupportForeignKeyTableTableJoinWithKTableFromKStream() {
final Consumed<String, String> consumed = Consumed.with(Serdes.String(), Serdes.String());
final StreamsBuilder builder = new StreamsBuilder();
final String input1 = "input1";
final String input2 = "input2";
final String output = "output";
final KTable<String, String> leftTable = builder.stream(input1, consumed).toTable();
final KTable<String, String> rightTable = builder.stream(input2, consumed).toTable();
final Function<String, String> extractor = value -> value.split("\\|")[1];
final ValueJoiner<String, String, String> joiner = (value1, value2) -> "(" + value1 + "," + value2 + ")";
leftTable.join(rightTable, extractor, joiner).toStream().to(output);
final Topology topology = builder.build(props);
final String topologyDescription = topology.describe().toString();
assertThat(topologyDescription, equalTo("Topologies:\n" + " Sub-topology: 0\n" + " Source: KTABLE-SOURCE-0000000016 (topics: [KTABLE-FK-JOIN-SUBSCRIPTION-RESPONSE-0000000014-topic])\n" + " --> KTABLE-FK-JOIN-SUBSCRIPTION-RESPONSE-RESOLVER-PROCESSOR-0000000017\n" + " Source: KSTREAM-SOURCE-0000000000 (topics: [input1])\n" + " --> KSTREAM-TOTABLE-0000000001\n" + " Processor: KTABLE-FK-JOIN-SUBSCRIPTION-RESPONSE-RESOLVER-PROCESSOR-0000000017 (stores: [KSTREAM-TOTABLE-STATE-STORE-0000000002])\n" + " --> KTABLE-FK-JOIN-OUTPUT-0000000018\n" + " <-- KTABLE-SOURCE-0000000016\n" + " Processor: KSTREAM-TOTABLE-0000000001 (stores: [KSTREAM-TOTABLE-STATE-STORE-0000000002])\n" + " --> KTABLE-FK-JOIN-SUBSCRIPTION-REGISTRATION-0000000007\n" + " <-- KSTREAM-SOURCE-0000000000\n" + " Processor: KTABLE-FK-JOIN-OUTPUT-0000000018 (stores: [])\n" + " --> KTABLE-TOSTREAM-0000000020\n" + " <-- KTABLE-FK-JOIN-SUBSCRIPTION-RESPONSE-RESOLVER-PROCESSOR-0000000017\n" + " Processor: KTABLE-FK-JOIN-SUBSCRIPTION-REGISTRATION-0000000007 (stores: [])\n" + " --> KTABLE-SINK-0000000008\n" + " <-- KSTREAM-TOTABLE-0000000001\n" + " Processor: KTABLE-TOSTREAM-0000000020 (stores: [])\n" + " --> KSTREAM-SINK-0000000021\n" + " <-- KTABLE-FK-JOIN-OUTPUT-0000000018\n" + " Sink: KSTREAM-SINK-0000000021 (topic: output)\n" + " <-- KTABLE-TOSTREAM-0000000020\n" + " Sink: KTABLE-SINK-0000000008 (topic: KTABLE-FK-JOIN-SUBSCRIPTION-REGISTRATION-0000000006-topic)\n" + " <-- KTABLE-FK-JOIN-SUBSCRIPTION-REGISTRATION-0000000007\n" + "\n" + " Sub-topology: 1\n" + " Source: KSTREAM-SOURCE-0000000003 (topics: [input2])\n" + " --> KSTREAM-TOTABLE-0000000004\n" + " Source: KTABLE-SOURCE-0000000009 (topics: [KTABLE-FK-JOIN-SUBSCRIPTION-REGISTRATION-0000000006-topic])\n" + " --> KTABLE-FK-JOIN-SUBSCRIPTION-PROCESSOR-0000000011\n" + " Processor: KSTREAM-TOTABLE-0000000004 (stores: [KSTREAM-TOTABLE-STATE-STORE-0000000005])\n" + " --> KTABLE-FK-JOIN-SUBSCRIPTION-PROCESSOR-0000000013\n" + " <-- KSTREAM-SOURCE-0000000003\n" + " Processor: KTABLE-FK-JOIN-SUBSCRIPTION-PROCESSOR-0000000011 (stores: [KTABLE-FK-JOIN-SUBSCRIPTION-STATE-STORE-0000000010])\n" + " --> KTABLE-FK-JOIN-SUBSCRIPTION-PROCESSOR-0000000012\n" + " <-- KTABLE-SOURCE-0000000009\n" + " Processor: KTABLE-FK-JOIN-SUBSCRIPTION-PROCESSOR-0000000012 (stores: [KSTREAM-TOTABLE-STATE-STORE-0000000005])\n" + " --> KTABLE-SINK-0000000015\n" + " <-- KTABLE-FK-JOIN-SUBSCRIPTION-PROCESSOR-0000000011\n" + " Processor: KTABLE-FK-JOIN-SUBSCRIPTION-PROCESSOR-0000000013 (stores: [KTABLE-FK-JOIN-SUBSCRIPTION-STATE-STORE-0000000010])\n" + " --> KTABLE-SINK-0000000015\n" + " <-- KSTREAM-TOTABLE-0000000004\n" + " Sink: KTABLE-SINK-0000000015 (topic: KTABLE-FK-JOIN-SUBSCRIPTION-RESPONSE-0000000014-topic)\n" + " <-- KTABLE-FK-JOIN-SUBSCRIPTION-PROCESSOR-0000000012, KTABLE-FK-JOIN-SUBSCRIPTION-PROCESSOR-0000000013\n\n"));
try (final TopologyTestDriver driver = new TopologyTestDriver(topology, props)) {
final TestInputTopic<String, String> left = driver.createInputTopic(input1, new StringSerializer(), new StringSerializer());
final TestInputTopic<String, String> right = driver.createInputTopic(input2, new StringSerializer(), new StringSerializer());
final TestOutputTopic<String, String> outputTopic = driver.createOutputTopic(output, new StringDeserializer(), new StringDeserializer());
// Pre-populate the RHS records. This test is all about what happens when we add/remove LHS records
right.pipeInput("rhs1", "rhsValue1");
right.pipeInput("rhs2", "rhsValue2");
// this unreferenced FK won't show up in any results
right.pipeInput("rhs3", "rhsValue3");
assertThat(outputTopic.readKeyValuesToMap(), is(emptyMap()));
left.pipeInput("lhs1", "lhsValue1|rhs1");
left.pipeInput("lhs2", "lhsValue2|rhs2");
final Map<String, String> expected = mkMap(mkEntry("lhs1", "(lhsValue1|rhs1,rhsValue1)"), mkEntry("lhs2", "(lhsValue2|rhs2,rhsValue2)"));
assertThat(outputTopic.readKeyValuesToMap(), is(expected));
// Add another reference to an existing FK
left.pipeInput("lhs3", "lhsValue3|rhs1");
assertThat(outputTopic.readKeyValuesToMap(), is(mkMap(mkEntry("lhs3", "(lhsValue3|rhs1,rhsValue1)"))));
left.pipeInput("lhs1", (String) null);
assertThat(outputTopic.readKeyValuesToMap(), is(mkMap(mkEntry("lhs1", null))));
}
}
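In this test the left-hand value encodes its foreign key after the '|' separator, and the extractor maps each left value to the right table's primary key. The same foreign-key join reads more naturally with a typed value; below is a minimal sketch using hypothetical Order/customer names and topics (serdes for the value class would still need to be configured in practice):

import org.apache.kafka.streams.StreamsBuilder;
import org.apache.kafka.streams.kstream.KTable;

// Hypothetical left-hand value type: an order that references a customer by ID (the foreign key).
class Order {
    final String customerId;
    final double amount;

    Order(final String customerId, final double amount) {
        this.customerId = customerId;
        this.amount = amount;
    }
}

// orders is keyed by order ID, customers by customer ID; the extractor pulls the foreign key
// (customerId) out of the left-hand value, just as value.split("\\|")[1] does in the test above.
final StreamsBuilder builder = new StreamsBuilder();
final KTable<String, Order> orders = builder.table("orders");         // hypothetical topic
final KTable<String, String> customers = builder.table("customers");  // hypothetical topic
orders.join(customers,
        order -> order.customerId,                         // foreign-key extractor
        (order, customer) -> customer + ":" + order.amount)  // value joiner
    .toStream()
    .to("enriched-orders");                                // hypothetical output topic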