use of org.apache.kafka.streams.StreamsBuilder in project kafka-streams-examples by confluentinc.
the class TopArticlesLambdaExample method buildTopArticlesStream.
static KafkaStreams buildTopArticlesStream(final String bootstrapServers, final String schemaRegistryUrl, final String stateDir) throws IOException {
final Properties streamsConfiguration = new Properties();
// Give the Streams application a unique name. The name must be unique in the Kafka cluster
// against which the application is run.
streamsConfiguration.put(StreamsConfig.APPLICATION_ID_CONFIG, "top-articles-lambda-example");
streamsConfiguration.put(StreamsConfig.CLIENT_ID_CONFIG, "top-articles-lambda-example-client");
// Where to find Kafka broker(s).
streamsConfiguration.put(StreamsConfig.BOOTSTRAP_SERVERS_CONFIG, bootstrapServers);
// Where to find the Confluent schema registry instance(s)
streamsConfiguration.put(AbstractKafkaAvroSerDeConfig.SCHEMA_REGISTRY_URL_CONFIG, schemaRegistryUrl);
// Specify default (de)serializers for record keys and for record values.
streamsConfiguration.put(StreamsConfig.DEFAULT_KEY_SERDE_CLASS_CONFIG, Serdes.String().getClass().getName());
streamsConfiguration.put(StreamsConfig.DEFAULT_VALUE_SERDE_CLASS_CONFIG, GenericAvroSerde.class);
streamsConfiguration.put(StreamsConfig.STATE_DIR_CONFIG, stateDir);
// Records should be flushed every 10 seconds. This is less than the default
// in order to keep this example interactive.
streamsConfiguration.put(StreamsConfig.COMMIT_INTERVAL_MS_CONFIG, 10 * 1000);
// Serdes used in this example
final Serde<String> stringSerde = Serdes.String();
final Map<String, String> serdeConfig = Collections.singletonMap(AbstractKafkaAvroSerDeConfig.SCHEMA_REGISTRY_URL_CONFIG, schemaRegistryUrl);
final Serde<GenericRecord> keyAvroSerde = new GenericAvroSerde();
keyAvroSerde.configure(serdeConfig, true);
final Serde<GenericRecord> valueAvroSerde = new GenericAvroSerde();
valueAvroSerde.configure(serdeConfig, false);
final Serde<Windowed<String>> windowedStringSerde = new WindowedSerde<>(stringSerde);
final StreamsBuilder builder = new StreamsBuilder();
final KStream<byte[], GenericRecord> views = builder.stream(PAGE_VIEWS);
final InputStream statsSchema = TopArticlesLambdaExample.class.getClassLoader().getResourceAsStream("avro/io/confluent/examples/streams/pageviewstats.avsc");
final Schema schema = new Schema.Parser().parse(statsSchema);
final KStream<GenericRecord, GenericRecord> articleViews = views.filter((dummy, record) -> isArticle(record)).map((dummy, article) -> {
final GenericRecord clone = new GenericData.Record(article.getSchema());
clone.put("user", "user");
clone.put("page", article.get("page"));
clone.put("industry", article.get("industry"));
return new KeyValue<>(clone, clone);
});
final KTable<Windowed<GenericRecord>, Long> viewCounts = articleViews.groupByKey(Serialized.with(keyAvroSerde, valueAvroSerde)).windowedBy(TimeWindows.of(TimeUnit.MINUTES.toMillis(60))).count();
final Comparator<GenericRecord> comparator = (o1, o2) -> (int) ((Long) o2.get("count") - (Long) o1.get("count"));
final KTable<Windowed<String>, PriorityQueue<GenericRecord>> allViewCounts = viewCounts.groupBy(// the selector
(windowedArticle, count) -> {
// project on the industry field for key
Windowed<String> windowedIndustry = new Windowed<>(windowedArticle.key().get("industry").toString(), windowedArticle.window());
// add the page into the value
GenericRecord viewStats = new GenericData.Record(schema);
viewStats.put("page", windowedArticle.key().get("page"));
viewStats.put("user", "user");
viewStats.put("industry", windowedArticle.key().get("industry"));
viewStats.put("count", count);
return new KeyValue<>(windowedIndustry, viewStats);
}, Serialized.with(windowedStringSerde, valueAvroSerde)).aggregate(// the initializer
() -> new PriorityQueue<>(comparator), // the "add" aggregator
(windowedIndustry, record, queue) -> {
queue.add(record);
return queue;
}, // the "remove" aggregator
(windowedIndustry, record, queue) -> {
queue.remove(record);
return queue;
}, Materialized.with(windowedStringSerde, new PriorityQueueSerde<>(comparator, valueAvroSerde)));
final int topN = 100;
final KTable<Windowed<String>, String> topViewCounts = allViewCounts.mapValues(queue -> {
final StringBuilder sb = new StringBuilder();
for (int i = 0; i < topN; i++) {
final GenericRecord record = queue.poll();
if (record == null) {
break;
}
sb.append(record.get("page").toString());
sb.append("\n");
}
return sb.toString();
});
topViewCounts.toStream().to(TOP_NEWS_PER_INDUSTRY_TOPIC, Produced.with(windowedStringSerde, stringSerde));
return new KafkaStreams(builder.build(), streamsConfiguration);
}
use of org.apache.kafka.streams.StreamsBuilder in project kafka-streams-examples by confluentinc.
the class WikipediaFeedAvroExample method buildWikipediaFeed.
static KafkaStreams buildWikipediaFeed(final String bootstrapServers, final String schemaRegistryUrl, final String stateDir) {
final Properties streamsConfiguration = new Properties();
// Give the Streams application a unique name. The name must be unique in the Kafka cluster
// against which the application is run.
streamsConfiguration.put(StreamsConfig.APPLICATION_ID_CONFIG, "wordcount-avro-example");
streamsConfiguration.put(StreamsConfig.CLIENT_ID_CONFIG, "wordcount-avro-example-client");
// Where to find Kafka broker(s).
streamsConfiguration.put(StreamsConfig.BOOTSTRAP_SERVERS_CONFIG, bootstrapServers);
// Where to find the Confluent schema registry instance(s)
streamsConfiguration.put(AbstractKafkaAvroSerDeConfig.SCHEMA_REGISTRY_URL_CONFIG, schemaRegistryUrl);
// Specify default (de)serializers for record keys and for record values.
streamsConfiguration.put(StreamsConfig.DEFAULT_KEY_SERDE_CLASS_CONFIG, Serdes.String().getClass().getName());
streamsConfiguration.put(StreamsConfig.DEFAULT_VALUE_SERDE_CLASS_CONFIG, SpecificAvroSerde.class);
streamsConfiguration.put(StreamsConfig.STATE_DIR_CONFIG, stateDir);
streamsConfiguration.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "earliest");
// Records should be flushed every 10 seconds. This is less than the default
// in order to keep this example interactive.
streamsConfiguration.put(StreamsConfig.COMMIT_INTERVAL_MS_CONFIG, 10 * 1000);
final Serde<String> stringSerde = Serdes.String();
final Serde<Long> longSerde = Serdes.Long();
final StreamsBuilder builder = new StreamsBuilder();
// read the source stream
final KStream<String, WikiFeed> feeds = builder.stream(WIKIPEDIA_FEED);
// aggregate the new feed counts of by user
final KTable<String, Long> aggregated = feeds.filter(new Predicate<String, WikiFeed>() {
@Override
public boolean test(final String dummy, final WikiFeed value) {
return value.getIsNew();
}
}).map(new KeyValueMapper<String, WikiFeed, KeyValue<String, WikiFeed>>() {
@Override
public KeyValue<String, WikiFeed> apply(final String key, final WikiFeed value) {
return new KeyValue<>(value.getUser(), value);
}
}).groupByKey().count();
// write to the result topic, need to override serdes
aggregated.toStream().to(WIKIPEDIA_STATS, Produced.with(stringSerde, longSerde));
return new KafkaStreams(builder.build(), streamsConfiguration);
}
use of org.apache.kafka.streams.StreamsBuilder in project kafka-streams-examples by confluentinc.
the class WordCountLambdaExample method main.
public static void main(final String[] args) throws Exception {
final String bootstrapServers = args.length > 0 ? args[0] : "localhost:9092";
final Properties streamsConfiguration = new Properties();
// Give the Streams application a unique name. The name must be unique in the Kafka cluster
// against which the application is run.
streamsConfiguration.put(StreamsConfig.APPLICATION_ID_CONFIG, "wordcount-lambda-example");
streamsConfiguration.put(StreamsConfig.CLIENT_ID_CONFIG, "wordcount-lambda-example-client");
// Where to find Kafka broker(s).
streamsConfiguration.put(StreamsConfig.BOOTSTRAP_SERVERS_CONFIG, bootstrapServers);
// Specify default (de)serializers for record keys and for record values.
streamsConfiguration.put(StreamsConfig.DEFAULT_KEY_SERDE_CLASS_CONFIG, Serdes.String().getClass().getName());
streamsConfiguration.put(StreamsConfig.DEFAULT_VALUE_SERDE_CLASS_CONFIG, Serdes.String().getClass().getName());
// Records should be flushed every 10 seconds. This is less than the default
// in order to keep this example interactive.
streamsConfiguration.put(StreamsConfig.COMMIT_INTERVAL_MS_CONFIG, 10 * 1000);
// For illustrative purposes we disable record caches
streamsConfiguration.put(StreamsConfig.CACHE_MAX_BYTES_BUFFERING_CONFIG, 0);
// Set up serializers and deserializers, which we will use for overriding the default serdes
// specified above.
final Serde<String> stringSerde = Serdes.String();
final Serde<Long> longSerde = Serdes.Long();
// In the subsequent lines we define the processing topology of the Streams application.
final StreamsBuilder builder = new StreamsBuilder();
// Construct a `KStream` from the input topic "streams-plaintext-input", where message values
// represent lines of text (for the sake of this example, we ignore whatever may be stored
// in the message keys).
//
// Note: We could also just call `builder.stream("streams-plaintext-input")` if we wanted to leverage
// the default serdes specified in the Streams configuration above, because these defaults
// match what's in the actual topic. However we explicitly set the deserializers in the
// call to `stream()` below in order to show how that's done, too.
final KStream<String, String> textLines = builder.stream("streams-plaintext-input");
final Pattern pattern = Pattern.compile("\\W+", Pattern.UNICODE_CHARACTER_CLASS);
final KTable<String, Long> wordCounts = textLines.flatMapValues(value -> Arrays.asList(pattern.split(value.toLowerCase()))).groupBy((key, word) -> word).count();
// Write the `KTable<String, Long>` to the output topic.
wordCounts.toStream().to("streams-wordcount-output", Produced.with(stringSerde, longSerde));
// Now that we have finished the definition of the processing topology we can actually run
// it via `start()`. The Streams application as a whole can be launched just like any
// normal Java application that has a `main()` method.
final KafkaStreams streams = new KafkaStreams(builder.build(), streamsConfiguration);
// Always (and unconditionally) clean local state prior to starting the processing topology.
// We opt for this unconditional call here because this will make it easier for you to play around with the example
// when resetting the application for doing a re-run (via the Application Reset Tool,
// http://docs.confluent.io/current/streams/developer-guide.html#application-reset-tool).
//
// The drawback of cleaning up local state prior is that your app must rebuilt its local state from scratch, which
// will take time and will require reading all the state-relevant data from the Kafka cluster over the network.
// Thus in a production scenario you typically do not want to clean up always as we do here but rather only when it
// is truly needed, i.e., only under certain conditions (e.g., the presence of a command line flag for your app).
// See `ApplicationResetExample.java` for a production-like example.
streams.cleanUp();
streams.start();
// Add shutdown hook to respond to SIGTERM and gracefully close Kafka Streams
Runtime.getRuntime().addShutdownHook(new Thread(streams::close));
}
use of org.apache.kafka.streams.StreamsBuilder in project kafka-streams-examples by confluentinc.
the class EventDeduplicationLambdaIntegrationTest method shouldRemoveDuplicatesFromTheInput.
@Test
public void shouldRemoveDuplicatesFromTheInput() throws Exception {
// e.g. "4ff3cb44-abcb-46e3-8f9a-afb7cc74fbb8"
String firstId = UUID.randomUUID().toString();
String secondId = UUID.randomUUID().toString();
String thirdId = UUID.randomUUID().toString();
List<String> inputValues = Arrays.asList(firstId, secondId, firstId, firstId, secondId, thirdId, thirdId, firstId, secondId);
List<String> expectedValues = Arrays.asList(firstId, secondId, thirdId);
//
// Step 1: Configure and start the processor topology.
//
StreamsBuilder builder = new StreamsBuilder();
Properties streamsConfiguration = new Properties();
streamsConfiguration.put(StreamsConfig.APPLICATION_ID_CONFIG, "deduplication-lambda-integration-test");
streamsConfiguration.put(StreamsConfig.BOOTSTRAP_SERVERS_CONFIG, CLUSTER.bootstrapServers());
streamsConfiguration.put(StreamsConfig.DEFAULT_KEY_SERDE_CLASS_CONFIG, Serdes.ByteArray().getClass().getName());
streamsConfiguration.put(StreamsConfig.DEFAULT_VALUE_SERDE_CLASS_CONFIG, Serdes.String().getClass().getName());
// The commit interval for flushing records to state stores and downstream must be lower than
// this integration test's timeout (30 secs) to ensure we observe the expected processing results.
streamsConfiguration.put(StreamsConfig.COMMIT_INTERVAL_MS_CONFIG, TimeUnit.SECONDS.toMillis(10));
streamsConfiguration.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "earliest");
// Use a temporary directory for storing state, which will be automatically removed after the test.
streamsConfiguration.put(StreamsConfig.STATE_DIR_CONFIG, TestUtils.tempDirectory().getAbsolutePath());
// How long we "remember" an event. During this time, any incoming duplicates of the event
// will be, well, dropped, thereby de-duplicating the input data.
//
// The actual value depends on your use case. To reduce memory and disk usage, you could
// decrease the size to purge old windows more frequently at the cost of potentially missing out
// on de-duplicating late-arriving records.
long maintainDurationPerEventInMs = TimeUnit.MINUTES.toMillis(10);
// The number of segments has no impact on "correctness".
// Using more segments implies larger overhead but allows for more fined grained record expiration
// Note: the specified retention time is a _minimum_ time span and no strict upper time bound
int numberOfSegments = 3;
// retention period must be at least window size -- for this use case, we don't need a longer retention period
// and thus just use the window size as retention time
long retentionPeriod = maintainDurationPerEventInMs;
StoreBuilder<WindowStore<String, Long>> dedupStoreBuilder = Stores.windowStoreBuilder(Stores.persistentWindowStore(storeName, retentionPeriod, numberOfSegments, maintainDurationPerEventInMs, false), Serdes.String(), Serdes.Long());
builder.addStateStore(dedupStoreBuilder);
KStream<byte[], String> input = builder.stream(inputTopic);
KStream<byte[], String> deduplicated = input.transform(// function as needed.
() -> new DeduplicationTransformer<>(maintainDurationPerEventInMs, (key, value) -> value), storeName);
deduplicated.to(outputTopic);
KafkaStreams streams = new KafkaStreams(builder.build(), streamsConfiguration);
streams.start();
//
// Step 2: Produce some input data to the input topic.
//
Properties producerConfig = new Properties();
producerConfig.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, CLUSTER.bootstrapServers());
producerConfig.put(ProducerConfig.ACKS_CONFIG, "all");
producerConfig.put(ProducerConfig.RETRIES_CONFIG, 0);
producerConfig.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, ByteArraySerializer.class);
producerConfig.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, StringSerializer.class);
IntegrationTestUtils.produceValuesSynchronously(inputTopic, inputValues, producerConfig);
//
// Step 3: Verify the application's output data.
//
Properties consumerConfig = new Properties();
consumerConfig.put(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, CLUSTER.bootstrapServers());
consumerConfig.put(ConsumerConfig.GROUP_ID_CONFIG, "deduplication-integration-test-standard-consumer");
consumerConfig.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "earliest");
consumerConfig.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, ByteArrayDeserializer.class);
consumerConfig.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, StringDeserializer.class);
List<String> actualValues = IntegrationTestUtils.waitUntilMinValuesRecordsReceived(consumerConfig, outputTopic, expectedValues.size());
streams.close();
assertThat(actualValues).containsExactlyElementsOf(expectedValues);
}
use of org.apache.kafka.streams.StreamsBuilder in project kafka-streams-examples by confluentinc.
the class HandlingCorruptedInputRecordsIntegrationTest method shouldIgnoreCorruptInputRecords.
@Test
public void shouldIgnoreCorruptInputRecords() throws Exception {
List<Long> inputValues = Arrays.asList(1L, 2L, 3L);
List<Long> expectedValues = inputValues.stream().map(x -> 2 * x).collect(Collectors.toList());
//
// Step 1: Configure and start the processor topology.
//
StreamsBuilder builder = new StreamsBuilder();
Properties streamsConfiguration = new Properties();
streamsConfiguration.put(StreamsConfig.APPLICATION_ID_CONFIG, "failure-handling-integration-test");
streamsConfiguration.put(StreamsConfig.BOOTSTRAP_SERVERS_CONFIG, CLUSTER.bootstrapServers());
streamsConfiguration.put(StreamsConfig.DEFAULT_KEY_SERDE_CLASS_CONFIG, Serdes.ByteArray().getClass().getName());
streamsConfiguration.put(StreamsConfig.DEFAULT_VALUE_SERDE_CLASS_CONFIG, Serdes.ByteArray().getClass().getName());
streamsConfiguration.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "earliest");
Serde<String> stringSerde = Serdes.String();
Serde<Long> longSerde = Serdes.Long();
KStream<byte[], byte[]> input = builder.stream(inputTopic);
// Note how the returned stream is of type `KStream<String, Long>`.
KStream<String, Long> doubled = input.flatMap((k, v) -> {
try {
// Attempt deserialization
String key = stringSerde.deserializer().deserialize("input-topic", k);
long value = longSerde.deserializer().deserialize("input-topic", v);
// checking.
return Collections.singletonList(KeyValue.pair(key, 2 * value));
} catch (SerializationException e) {
// Ignore/skip the corrupted record by catching the exception.
// Optionally, we can log the fact that we did so:
System.err.println("Could not deserialize record: " + e.getMessage());
}
return Collections.emptyList();
});
// Write the processing results (which was generated from valid records only) to Kafka.
doubled.to(outputTopic, Produced.with(stringSerde, longSerde));
KafkaStreams streams = new KafkaStreams(builder.build(), streamsConfiguration);
streams.start();
//
// Step 2: Produce some corrupt input data to the input topic.
//
Properties producerConfigForCorruptRecords = new Properties();
producerConfigForCorruptRecords.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, CLUSTER.bootstrapServers());
producerConfigForCorruptRecords.put(ProducerConfig.ACKS_CONFIG, "all");
producerConfigForCorruptRecords.put(ProducerConfig.RETRIES_CONFIG, 0);
producerConfigForCorruptRecords.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, ByteArraySerializer.class);
producerConfigForCorruptRecords.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, StringSerializer.class);
IntegrationTestUtils.produceValuesSynchronously(inputTopic, Collections.singletonList("corrupt"), producerConfigForCorruptRecords);
//
// Step 3: Produce some (valid) input data to the input topic.
//
Properties producerConfig = new Properties();
producerConfig.put(ProducerConfig.BOOTSTRAP_SERVERS_CONFIG, CLUSTER.bootstrapServers());
producerConfig.put(ProducerConfig.ACKS_CONFIG, "all");
producerConfig.put(ProducerConfig.RETRIES_CONFIG, 0);
producerConfig.put(ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG, ByteArraySerializer.class);
producerConfig.put(ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG, LongSerializer.class);
IntegrationTestUtils.produceValuesSynchronously(inputTopic, inputValues, producerConfig);
//
// Step 4: Verify the application's output data.
//
Properties consumerConfig = new Properties();
consumerConfig.put(ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG, CLUSTER.bootstrapServers());
consumerConfig.put(ConsumerConfig.GROUP_ID_CONFIG, "map-function-lambda-integration-test-standard-consumer");
consumerConfig.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "earliest");
consumerConfig.put(ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG, ByteArrayDeserializer.class);
consumerConfig.put(ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG, LongDeserializer.class);
List<Long> actualValues = IntegrationTestUtils.waitUntilMinValuesRecordsReceived(consumerConfig, outputTopic, expectedValues.size());
streams.close();
assertThat(actualValues).isEqualTo(expectedValues);
}
Aggregations