Search in sources :

Example 16 with MessageStream

use of org.apache.samza.operators.MessageStream in project samza by apache.

the class KeyValueStoreExample method describe.

/**
 * Wires the page-view pipeline: reads "pageViewEvent", repartitions by member id, runs every event
 * through {@code MyStatsCounter}, and writes the resulting stats keyed by member id to
 * "pageViewEventPerMember".
 *
 * @param appDescriptor descriptor on which the input stream, output stream and operators are registered
 */
@Override
public void describe(StreamApplicationDescriptor appDescriptor) {
    // Both the input and the output descriptor are obtained from the same "tracking" system descriptor.
    KafkaSystemDescriptor kafka = new KafkaSystemDescriptor("tracking");
    KafkaInputDescriptor<PageViewEvent> pageViewInput =
        kafka.getInputDescriptor("pageViewEvent", new JsonSerdeV2<>(PageViewEvent.class));
    KafkaOutputDescriptor<KV<String, StatsOutput>> perMemberOutput =
        kafka.getOutputDescriptor(
            "pageViewEventPerMember",
            KVSerde.of(new StringSerde(), new JsonSerdeV2<>(StatsOutput.class)));

    appDescriptor.withDefaultSystem(kafka);

    MessageStream<PageViewEvent> views = appDescriptor.getInputStream(pageViewInput);
    OutputStream<KV<String, StatsOutput>> perMemberStats = appDescriptor.getOutputStream(perMemberOutput);

    // Serde used for the repartitioned (memberId -> event) stream.
    KVSerde<String, PageViewEvent> repartitionSerde =
        KVSerde.of(new StringSerde(), new JsonSerdeV2<>(PageViewEvent.class));

    views
        // key each event by its member id; the event itself is the value
        .partitionBy(event -> event.getMemberId(), event -> event, repartitionSerde, "partitionBy")
        // drop the key again, the counter works on bare events
        .map(KV::getValue)
        .flatMap(new MyStatsCounter())
        // re-key the emitted stats by member id for the output stream
        .map(memberStats -> KV.of(memberStats.memberId, memberStats))
        .sendTo(perMemberStats);
}
Also used : ApplicationRunner(org.apache.samza.runtime.ApplicationRunner) KafkaInputDescriptor(org.apache.samza.system.kafka.descriptors.KafkaInputDescriptor) CommandLine(org.apache.samza.util.CommandLine) Collection(java.util.Collection) KafkaSystemDescriptor(org.apache.samza.system.kafka.descriptors.KafkaSystemDescriptor) PageViewEvent(org.apache.samza.example.models.PageViewEvent) FlatMapFunction(org.apache.samza.operators.functions.FlatMapFunction) ArrayList(java.util.ArrayList) StringSerde(org.apache.samza.serializers.StringSerde) TimeUnit(java.util.concurrent.TimeUnit) Context(org.apache.samza.context.Context) List(java.util.List) KafkaOutputDescriptor(org.apache.samza.system.kafka.descriptors.KafkaOutputDescriptor) StreamApplicationDescriptor(org.apache.samza.application.descriptors.StreamApplicationDescriptor) Config(org.apache.samza.config.Config) ApplicationRunners(org.apache.samza.runtime.ApplicationRunners) JsonSerdeV2(org.apache.samza.serializers.JsonSerdeV2) KVSerde(org.apache.samza.serializers.KVSerde) StreamApplication(org.apache.samza.application.StreamApplication) KeyValueStore(org.apache.samza.storage.kv.KeyValueStore) KV(org.apache.samza.operators.KV) OutputStream(org.apache.samza.operators.OutputStream) MessageStream(org.apache.samza.operators.MessageStream) StringSerde(org.apache.samza.serializers.StringSerde) PageViewEvent(org.apache.samza.example.models.PageViewEvent) KV(org.apache.samza.operators.KV) KafkaSystemDescriptor(org.apache.samza.system.kafka.descriptors.KafkaSystemDescriptor) JsonSerdeV2(org.apache.samza.serializers.JsonSerdeV2)

Example 17 with MessageStream

use of org.apache.samza.operators.MessageStream in project samza by apache.

the class TestExecutionPlanner method createStreamGraphWithJoin.

/**
 * Builds a stream graph with three inputs, two repartition stages and two joins that share the
 * repartitioned "input2" stream.
 *
 * @return the populated stream graph
 */
private StreamGraphImpl createStreamGraphWithJoin() {
    /*
     * the graph looks like the following. number of partitions in parentheses. quotes indicate expected value.
     *
     *                               input1 (64) -> map -> join -> output1 (8)
     *                                                       |
     *          input2 (16) -> partitionBy ("64") -> filter -|
     *                                                       |
     * input3 (32) -> filter -> partitionBy ("64") -> map -> join -> output2 (16)
     *
     */
    StreamGraphImpl graph = new StreamGraphImpl(runner, config);
    BiFunction messageBuilder = mock(BiFunction.class);

    // input1 -> map
    MessageStream source1 = graph.getInputStream("input1", messageBuilder);
    MessageStream mapped1 = source1.map(msg -> msg);

    // input2 -> partitionBy -> filter
    MessageStream source2 = graph.getInputStream("input2", messageBuilder);
    MessageStream repartitioned2 = source2.partitionBy(msg -> "haha");
    MessageStream filtered2 = repartitioned2.filter(msg -> true);

    // input3 -> filter -> partitionBy -> map
    MessageStream source3 = graph.getInputStream("input3", messageBuilder);
    MessageStream filtered3 = source3.filter(msg -> true);
    MessageStream repartitioned3 = filtered3.partitionBy(msg -> "hehe");
    MessageStream mapped3 = repartitioned3.map(msg -> msg);

    // The same mocked extractor is used for both the key and the message of each output.
    Function extractor = mock(Function.class);
    OutputStream<Object, Object, Object> firstOutput = graph.getOutputStream("output1", extractor, extractor);
    OutputStream<Object, Object, Object> secondOutput = graph.getOutputStream("output2", extractor, extractor);

    // Both joins take the repartitioned input2 stream as their second input.
    mapped1
        .join(filtered2, mock(JoinFunction.class), Duration.ofHours(2))
        .sendTo(firstOutput);
    mapped3
        .join(filtered2, mock(JoinFunction.class), Duration.ofHours(1))
        .sendTo(secondOutput);

    return graph;
}
Also used : BiFunction(java.util.function.BiFunction) JobConfig(org.apache.samza.config.JobConfig) HashMap(java.util.HashMap) SystemStreamPartition(org.apache.samza.system.SystemStreamPartition) SystemStreamMetadata(org.apache.samza.system.SystemStreamMetadata) Function(java.util.function.Function) ArrayList(java.util.ArrayList) Duration(java.time.Duration) Map(java.util.Map) MapConfig(org.apache.samza.config.MapConfig) MessageStream(org.apache.samza.operators.MessageStream) Before(org.junit.Before) ApplicationRunner(org.apache.samza.runtime.ApplicationRunner) Windows(org.apache.samza.operators.windows.Windows) TaskConfig(org.apache.samza.config.TaskConfig) Collection(java.util.Collection) Partition(org.apache.samza.Partition) Set(java.util.Set) Assert.assertTrue(org.junit.Assert.assertTrue) StreamSpec(org.apache.samza.system.StreamSpec) Test(org.junit.Test) Mockito.when(org.mockito.Mockito.when) JoinFunction(org.apache.samza.operators.functions.JoinFunction) StreamGraphImpl(org.apache.samza.operators.StreamGraphImpl) List(java.util.List) Assert.assertFalse(org.junit.Assert.assertFalse) SystemAdmin(org.apache.samza.system.SystemAdmin) Config(org.apache.samza.config.Config) OutputStream(org.apache.samza.operators.OutputStream) Collections(java.util.Collections) Assert.assertEquals(org.junit.Assert.assertEquals) Mockito.mock(org.mockito.Mockito.mock) BiFunction(java.util.function.BiFunction) Function(java.util.function.Function) JoinFunction(org.apache.samza.operators.functions.JoinFunction) BiFunction(java.util.function.BiFunction) MessageStream(org.apache.samza.operators.MessageStream) StreamGraphImpl(org.apache.samza.operators.StreamGraphImpl)

Example 18 with MessageStream

use of org.apache.samza.operators.MessageStream in project samza by apache.

the class TestExecutionPlanner method createStreamGraphWithJoinAndWindow.

/**
 * Builds a stream graph with three inputs, two keyed tumbling windows (8 ms and 16 ms) and three
 * joins (1600 ms, 100 ms and 252 ms).
 *
 * @return the populated stream graph
 */
private StreamGraphImpl createStreamGraphWithJoinAndWindow() {
    StreamGraphImpl graph = new StreamGraphImpl(runner, config);
    BiFunction messageBuilder = mock(BiFunction.class);

    // input1 -> map
    MessageStream source1 = graph.getInputStream("input1", messageBuilder);
    MessageStream mapped1 = source1.map(msg -> msg);

    // input2 -> partitionBy -> filter
    MessageStream source2 = graph.getInputStream("input2", messageBuilder);
    MessageStream repartitioned2 = source2.partitionBy(msg -> "haha");
    MessageStream filtered2 = repartitioned2.filter(msg -> true);

    // input3 -> filter -> partitionBy -> map
    MessageStream source3 = graph.getInputStream("input3", messageBuilder);
    MessageStream filtered3 = source3.filter(msg -> true);
    MessageStream repartitioned3 = filtered3.partitionBy(msg -> "hehe");
    MessageStream mapped3 = repartitioned3.map(msg -> msg);

    // The same mocked extractor is used for both the key and the message of each output.
    Function extractor = mock(Function.class);
    OutputStream<Object, Object, Object> firstOutput = graph.getOutputStream("output1", extractor, extractor);
    OutputStream<Object, Object, Object> secondOutput = graph.getOutputStream("output2", extractor, extractor);

    // Windowed branches: their results are not sent anywhere, the operators only need to exist in the graph.
    mapped1
        .map(msg -> msg)
        .filter(msg -> true)
        .window(Windows.<Object, Object>keyedTumblingWindow(msg -> msg, Duration.ofMillis(8)));
    filtered2
        .map(msg -> msg)
        .filter(msg -> true)
        .window(Windows.<Object, Object>keyedTumblingWindow(msg -> msg, Duration.ofMillis(16)));

    // Join branches; input3 is joined with input2 twice, using different durations.
    mapped1
        .join(filtered2, mock(JoinFunction.class), Duration.ofMillis(1600))
        .sendTo(firstOutput);
    mapped3
        .join(filtered2, mock(JoinFunction.class), Duration.ofMillis(100))
        .sendTo(secondOutput);
    mapped3
        .join(filtered2, mock(JoinFunction.class), Duration.ofMillis(252))
        .sendTo(secondOutput);

    return graph;
}
Also used : BiFunction(java.util.function.BiFunction) JobConfig(org.apache.samza.config.JobConfig) HashMap(java.util.HashMap) SystemStreamPartition(org.apache.samza.system.SystemStreamPartition) SystemStreamMetadata(org.apache.samza.system.SystemStreamMetadata) Function(java.util.function.Function) ArrayList(java.util.ArrayList) Duration(java.time.Duration) Map(java.util.Map) MapConfig(org.apache.samza.config.MapConfig) MessageStream(org.apache.samza.operators.MessageStream) Before(org.junit.Before) ApplicationRunner(org.apache.samza.runtime.ApplicationRunner) Windows(org.apache.samza.operators.windows.Windows) TaskConfig(org.apache.samza.config.TaskConfig) Collection(java.util.Collection) Partition(org.apache.samza.Partition) Set(java.util.Set) Assert.assertTrue(org.junit.Assert.assertTrue) StreamSpec(org.apache.samza.system.StreamSpec) Test(org.junit.Test) Mockito.when(org.mockito.Mockito.when) JoinFunction(org.apache.samza.operators.functions.JoinFunction) StreamGraphImpl(org.apache.samza.operators.StreamGraphImpl) List(java.util.List) Assert.assertFalse(org.junit.Assert.assertFalse) SystemAdmin(org.apache.samza.system.SystemAdmin) Config(org.apache.samza.config.Config) OutputStream(org.apache.samza.operators.OutputStream) Collections(java.util.Collections) Assert.assertEquals(org.junit.Assert.assertEquals) Mockito.mock(org.mockito.Mockito.mock) BiFunction(java.util.function.BiFunction) Function(java.util.function.Function) JoinFunction(org.apache.samza.operators.functions.JoinFunction) BiFunction(java.util.function.BiFunction) MessageStream(org.apache.samza.operators.MessageStream) StreamGraphImpl(org.apache.samza.operators.StreamGraphImpl)

Example 19 with MessageStream

use of org.apache.samza.operators.MessageStream in project samza by apache.

the class ExecutionPlanner method calculateJoinInputPartitions.

/**
 * Resolves the partition counts of the stream edges that feed join operators.
 *
 * <p>Joins are taken from a work queue. For each join, the known partition count of its input edges
 * is determined, assigned to every input edge whose count is still unset, and any not-yet-queued
 * joins fed by a newly assigned edge are queued in turn.
 *
 * @param streamGraph graph whose input streams are the starting points of the join search
 * @param jobGraph job graph that supplies the {@code StreamEdge} for each input stream
 * @throws SamzaException if two input edges of the same join have different known partition counts
 */
/* package private */
static void calculateJoinInputPartitions(StreamGraphImpl streamGraph, JobGraph jobGraph) {
    // join spec -> the stream edges associated with it (its inputs)
    Multimap<OperatorSpec, StreamEdge> edgesByJoin = HashMultimap.create();
    // stream edge -> the join specs associated with it (inverse of the mapping above)
    Multimap<StreamEdge, OperatorSpec> joinsByEdge = HashMultimap.create();
    // Output stream -> join spec. StreamGraph creates two partial join operators per join and both share
    // one output stream, so this map picks a single representative spec per join (first one registered wins).
    Map<MessageStream, OperatorSpec> joinByOutputStream = new HashMap<>();
    // Work queue of joins whose input partitions are (at least partly) known
    Queue<OperatorSpec> pending = new LinkedList<>();
    // Join specs that have been put on the work queue at some point
    Set<OperatorSpec> enqueued = new HashSet<>();

    // Seed the maps and the queue by searching from every input stream of the graph.
    streamGraph.getInputStreams().entrySet().forEach(input -> {
        StreamEdge inputEdge = jobGraph.getOrCreateStreamEdge(input.getKey());
        findReachableJoins(input.getValue(), inputEdge, edgesByJoin, joinsByEdge, joinByOutputStream, pending, enqueued);
    });

    // At this point the queue holds join specs with at least one input edge of known partition count.
    while (!pending.isEmpty()) {
        OperatorSpec joinSpec = pending.poll();

        // Pass 1: derive the partition count from the edges that already have one; all of them must agree.
        int resolved = StreamEdge.PARTITIONS_UNKNOWN;
        for (StreamEdge candidate : edgesByJoin.get(joinSpec)) {
            int candidateCount = candidate.getPartitionCount();
            if (candidateCount == StreamEdge.PARTITIONS_UNKNOWN) {
                continue;
            }
            if (resolved == StreamEdge.PARTITIONS_UNKNOWN) {
                // first known count seen for this join
                resolved = candidateCount;
            } else if (resolved != candidateCount) {
                throw new SamzaException(String.format("Unable to resolve input partitions of stream %s for join. Expected: %d, Actual: %d", candidate.getFormattedSystemStream(), resolved, candidateCount));
            }
        }

        // Pass 2: hand the resolved count to every edge of this join that has no positive count yet.
        for (StreamEdge candidate : edgesByJoin.get(joinSpec)) {
            if (candidate.getPartitionCount() > 0) {
                continue;
            }
            candidate.setPartitionCount(resolved);
            // Other joins fed by this edge may now be resolvable; queue each of them at most once.
            for (OperatorSpec affected : joinsByEdge.get(candidate)) {
                if (enqueued.add(affected)) {
                    pending.add(affected);
                }
            }
        }
    }
}
Also used : HashMap(java.util.HashMap) SamzaException(org.apache.samza.SamzaException) LinkedList(java.util.LinkedList) OperatorSpec(org.apache.samza.operators.spec.OperatorSpec) PartialJoinOperatorSpec(org.apache.samza.operators.spec.PartialJoinOperatorSpec) MessageStream(org.apache.samza.operators.MessageStream) HashSet(java.util.HashSet)

Example 20 with MessageStream

use of org.apache.samza.operators.MessageStream in project samza by apache.

the class BroadcastExample method describe.

/**
 * Reads "pageViewEvent" and fans the same input stream out to three outputs: messages whose key
 * equals "key1", "key2" or "key3" go to "outStream1", "outStream2" or "outStream3" respectively.
 *
 * @param appDescriptor descriptor on which the input stream, output streams and filters are registered
 */
@Override
public void describe(StreamApplicationDescriptor appDescriptor) {
    // One key/value serde is shared by the input and all three outputs.
    KVSerde<String, PageViewEvent> kvSerde =
        KVSerde.of(new StringSerde("UTF-8"), new JsonSerdeV2<>(PageViewEvent.class));

    KafkaSystemDescriptor kafka = new KafkaSystemDescriptor("tracking");
    KafkaInputDescriptor<KV<String, PageViewEvent>> inputDescriptor =
        kafka.getInputDescriptor("pageViewEvent", kvSerde);
    KafkaOutputDescriptor<KV<String, PageViewEvent>> firstOutput =
        kafka.getOutputDescriptor("outStream1", kvSerde);
    KafkaOutputDescriptor<KV<String, PageViewEvent>> secondOutput =
        kafka.getOutputDescriptor("outStream2", kvSerde);
    KafkaOutputDescriptor<KV<String, PageViewEvent>> thirdOutput =
        kafka.getOutputDescriptor("outStream3", kvSerde);

    MessageStream<KV<String, PageViewEvent>> events = appDescriptor.getInputStream(inputDescriptor);

    // Each branch filters the shared input stream on one key and sends the matches to its own output.
    MessageStream<KV<String, PageViewEvent>> key1Events = events.filter(kv -> kv.key.equals("key1"));
    key1Events.sendTo(appDescriptor.getOutputStream(firstOutput));

    MessageStream<KV<String, PageViewEvent>> key2Events = events.filter(kv -> kv.key.equals("key2"));
    key2Events.sendTo(appDescriptor.getOutputStream(secondOutput));

    MessageStream<KV<String, PageViewEvent>> key3Events = events.filter(kv -> kv.key.equals("key3"));
    key3Events.sendTo(appDescriptor.getOutputStream(thirdOutput));
}
Also used : ApplicationRunner(org.apache.samza.runtime.ApplicationRunner) KafkaInputDescriptor(org.apache.samza.system.kafka.descriptors.KafkaInputDescriptor) CommandLine(org.apache.samza.util.CommandLine) KafkaSystemDescriptor(org.apache.samza.system.kafka.descriptors.KafkaSystemDescriptor) PageViewEvent(org.apache.samza.example.models.PageViewEvent) StringSerde(org.apache.samza.serializers.StringSerde) KafkaOutputDescriptor(org.apache.samza.system.kafka.descriptors.KafkaOutputDescriptor) StreamApplicationDescriptor(org.apache.samza.application.descriptors.StreamApplicationDescriptor) Config(org.apache.samza.config.Config) ApplicationRunners(org.apache.samza.runtime.ApplicationRunners) JsonSerdeV2(org.apache.samza.serializers.JsonSerdeV2) KVSerde(org.apache.samza.serializers.KVSerde) StreamApplication(org.apache.samza.application.StreamApplication) KV(org.apache.samza.operators.KV) MessageStream(org.apache.samza.operators.MessageStream) StringSerde(org.apache.samza.serializers.StringSerde) PageViewEvent(org.apache.samza.example.models.PageViewEvent) KV(org.apache.samza.operators.KV) KafkaSystemDescriptor(org.apache.samza.system.kafka.descriptors.KafkaSystemDescriptor)

Aggregations

MessageStream (org.apache.samza.operators.MessageStream)34 Config (org.apache.samza.config.Config)22 KVSerde (org.apache.samza.serializers.KVSerde)21 Duration (java.time.Duration)19 HashMap (java.util.HashMap)19 OutputStream (org.apache.samza.operators.OutputStream)19 KV (org.apache.samza.operators.KV)18 Map (java.util.Map)17 ArrayList (java.util.ArrayList)16 List (java.util.List)16 StringSerde (org.apache.samza.serializers.StringSerde)16 Test (org.junit.Test)16 Collection (java.util.Collection)14 StreamApplicationDescriptorImpl (org.apache.samza.application.descriptors.StreamApplicationDescriptorImpl)14 JobConfig (org.apache.samza.config.JobConfig)14 MapConfig (org.apache.samza.config.MapConfig)14 Windows (org.apache.samza.operators.windows.Windows)13 Collections (java.util.Collections)12 JoinFunction (org.apache.samza.operators.functions.JoinFunction)12 ApplicationRunner (org.apache.samza.runtime.ApplicationRunner)12