Search in sources :

Example 26 with MessageStream

use of org.apache.samza.operators.MessageStream in project samza by apache.

In class TestPartitionByOperatorSpec, method testPartitionBy:

/**
 * Checks the operator graph produced by a single {@code partitionBy} call on an input stream:
 * an additional input operator is registered for the repartitioned stream, and a
 * {@link PartitionByOperatorSpec} carrying the supplied key/value functions is attached to the
 * original input operator.
 */
@Test
public void testPartitionBy() {
    MapFunction<Object, String> keyFn = m -> m.toString();
    MapFunction<Object, Object> valueFn = m -> m;
    KVSerde<Object, Object> partitionBySerde = KVSerde.of(new NoOpSerde<>(), new NoOpSerde<>());
    StreamApplicationDescriptorImpl streamAppDesc = new StreamApplicationDescriptorImpl(appDesc -> {
        MessageStream inputStream = appDesc.getInputStream(testInputDescriptor);
        inputStream.partitionBy(keyFn, valueFn, partitionBySerde, testRepartitionedStreamName);
    }, getConfig());

    // The same id is asserted for both the repartitioned stream and the partition-by operator.
    String expectedId = String.format("%s-%s-partition_by-%s", testJobName, testJobId, testRepartitionedStreamName);

    // Two input operators are expected; the original input and the repartitioned stream are looked up below.
    Map<String, InputOperatorSpec> inputOpSpecs = streamAppDesc.getInputOperators();
    assertEquals(2, inputOpSpecs.size());
    assertTrue(inputOpSpecs.containsKey(expectedId));

    InputOperatorSpec inputOpSpec = inputOpSpecs.get(expectedId);
    assertEquals(expectedId, inputOpSpec.getStreamId());
    assertTrue(inputOpSpec.getKeySerde() instanceof NoOpSerde);
    assertTrue(inputOpSpec.getValueSerde() instanceof NoOpSerde);
    assertTrue(inputOpSpec.isKeyed());
    assertNull(inputOpSpec.getScheduledFn());
    assertNull(inputOpSpec.getWatermarkFn());

    InputOperatorSpec originInputSpec = inputOpSpecs.get(testInputDescriptor.getStreamId());
    Object firstRegisteredSpec = originInputSpec.getRegisteredOperatorSpecs().toArray()[0];
    assertTrue(firstRegisteredSpec instanceof PartitionByOperatorSpec);

    // JUnit's assertEquals takes (expected, actual); keeping that order makes failure messages read correctly.
    PartitionByOperatorSpec reparOpSpec = (PartitionByOperatorSpec) firstRegisteredSpec;
    assertEquals(expectedId, reparOpSpec.getOpId());
    assertEquals(keyFn, reparOpSpec.getKeyFunction());
    assertEquals(valueFn, reparOpSpec.getValueFunction());
    assertEquals(reparOpSpec.getOpId(), reparOpSpec.getOutputStream().getStreamId());
    assertNull(reparOpSpec.getScheduledFn());
    assertNull(reparOpSpec.getWatermarkFn());
}
Also used : StreamApplicationDescriptorImpl(org.apache.samza.application.descriptors.StreamApplicationDescriptorImpl) ScheduledFunction(org.apache.samza.operators.functions.ScheduledFunction) Assert.assertNotNull(org.junit.Assert.assertNotNull) Collection(java.util.Collection) GenericSystemDescriptor(org.apache.samza.system.descriptors.GenericSystemDescriptor) JobConfig(org.apache.samza.config.JobConfig) Assert.assertTrue(org.junit.Assert.assertTrue) HashMap(java.util.HashMap) Scheduler(org.apache.samza.operators.Scheduler) Serde(org.apache.samza.serializers.Serde) Test(org.junit.Test) GenericInputDescriptor(org.apache.samza.system.descriptors.GenericInputDescriptor) OperatorSpecGraph(org.apache.samza.operators.OperatorSpecGraph) MapFunction(org.apache.samza.operators.functions.MapFunction) WatermarkFunction(org.apache.samza.operators.functions.WatermarkFunction) Assert.assertNull(org.junit.Assert.assertNull) Map(java.util.Map) Config(org.apache.samza.config.Config) KVSerde(org.apache.samza.serializers.KVSerde) MapConfig(org.apache.samza.config.MapConfig) NoOpSerde(org.apache.samza.serializers.NoOpSerde) Assert.assertEquals(org.junit.Assert.assertEquals) MessageStream(org.apache.samza.operators.MessageStream) Mockito.mock(org.mockito.Mockito.mock) StreamApplicationDescriptorImpl(org.apache.samza.application.descriptors.StreamApplicationDescriptorImpl) MessageStream(org.apache.samza.operators.MessageStream) NoOpSerde(org.apache.samza.serializers.NoOpSerde) Test(org.junit.Test)

Example 27 with MessageStream

use of org.apache.samza.operators.MessageStream in project samza by apache.

In class JoinTranslator, method getTable:

/**
 * Resolves the table used on the table side of a join.
 *
 * <p>A remote table is returned as soon as it has been looked up. For a local table, the stream
 * registered for the table node is repartitioned by the join key fields and sent to the table
 * before the table is returned.
 *
 * @param tableNode join input node describing the table side of the join
 * @param context translator context supplying the application descriptor and message streams
 * @return the resolved table
 * @throws SamzaException if no table descriptor can be resolved for the node
 */
private Table getTable(JoinInputNode tableNode, TranslatorContext context) {
    SqlIOConfig tableIoConfig = resolveSQlIOForTable(
            tableNode.getRelNode(),
            context.getExecutionContext().getSamzaSqlApplicationConfig().getInputSystemStreamConfigBySource());

    boolean hasTableDescriptor = tableIoConfig != null && tableIoConfig.getTableDescriptor().isPresent();
    if (!hasTableDescriptor) {
        String message = "Failed to resolve table source in join operation: node=" + tableNode.getRelNode();
        log.error(message);
        throw new SamzaException(message);
    }

    Table<KV<SamzaSqlRelRecord, SamzaSqlRelMessage>> joinTable =
            context.getStreamAppDescriptor().getTable(tableIoConfig.getTableDescriptor().get());

    if (!tableNode.isRemoteTable()) {
        // Local table: it has to be loaded. The fields of the join condition form the composite key and the
        // relational message is the value; messages of the input stream denoted as 'table' go to the table store.
        MessageStream<SamzaSqlRelMessage> tableInputStream = context.getMessageStream(tableNode.getRelNode().getId());
        SamzaSqlRelRecordSerdeFactory.SamzaSqlRelRecordSerde compositeKeySerde =
                (SamzaSqlRelRecordSerdeFactory.SamzaSqlRelRecordSerde) new SamzaSqlRelRecordSerdeFactory().getSerde(null, null);
        SamzaSqlRelMessageSerdeFactory.SamzaSqlRelMessageSerde relMessageSerde =
                (SamzaSqlRelMessageSerdeFactory.SamzaSqlRelMessageSerde) new SamzaSqlRelMessageSerdeFactory().getSerde(null, null);
        List<Integer> joinKeyIds = tableNode.getKeyIds();
        String intermediateStreamId = intermediateStreamPrefix + "table_" + logicalOpId;

        // Always repartition by the join fields before writing to the table, so that the stream and the table
        // being joined share the same partitioning scheme (same partition key and partition count). Bootstrap
        // semantics are not propagated to intermediate streams (see SAMZA-1613), so results are consistent only
        // once the local table has caught up.
        tableInputStream
                .partitionBy(
                        msg -> createSamzaSqlCompositeKey(msg, joinKeyIds),
                        msg -> msg,
                        KVSerde.of(compositeKeySerde, relMessageSerde),
                        intermediateStreamId)
                .sendTo(joinTable);
    }
    return joinTable;
}
Also used : SqlIOConfig(org.apache.samza.sql.interfaces.SqlIOConfig) TableScan(org.apache.calcite.rel.core.TableScan) LogicalFilter(org.apache.calcite.rel.logical.LogicalFilter) LoggerFactory(org.slf4j.LoggerFactory) RelOptUtil(org.apache.calcite.plan.RelOptUtil) ArrayList(java.util.ArrayList) LogicalJoin(org.apache.calcite.rel.logical.LogicalJoin) SamzaSqlRelRecordSerdeFactory(org.apache.samza.sql.serializers.SamzaSqlRelRecordSerdeFactory) RexNode(org.apache.calcite.rex.RexNode) Map(java.util.Map) SamzaSqlRelMessage(org.apache.samza.sql.data.SamzaSqlRelMessage) LinkedList(java.util.LinkedList) KV(org.apache.samza.operators.KV) MessageStream(org.apache.samza.operators.MessageStream) Table(org.apache.samza.table.Table) SqlKind(org.apache.calcite.sql.SqlKind) Logger(org.slf4j.Logger) LogicalProject(org.apache.calcite.rel.logical.LogicalProject) SqlTypeName(org.apache.calcite.sql.type.SqlTypeName) StreamTableJoinFunction(org.apache.samza.operators.functions.StreamTableJoinFunction) RexLiteral(org.apache.calcite.rex.RexLiteral) SqlExplainLevel(org.apache.calcite.sql.SqlExplainLevel) SamzaSqlRelMessage.getSamzaSqlCompositeKeyFieldNames(org.apache.samza.sql.data.SamzaSqlRelMessage.getSamzaSqlCompositeKeyFieldNames) RelNode(org.apache.calcite.rel.RelNode) Collectors(java.util.stream.Collectors) SamzaSqlRelMessage.createSamzaSqlCompositeKey(org.apache.samza.sql.data.SamzaSqlRelMessage.createSamzaSqlCompositeKey) SamzaException(org.apache.samza.SamzaException) SqlIOConfig(org.apache.samza.sql.interfaces.SqlIOConfig) RexInputRef(org.apache.calcite.rex.RexInputRef) List(java.util.List) Validate(org.apache.commons.lang3.Validate) SamzaSqlRelRecord(org.apache.samza.sql.SamzaSqlRelRecord) HepRelVertex(org.apache.calcite.plan.hep.HepRelVertex) JoinRelType(org.apache.calcite.rel.core.JoinRelType) Preconditions(com.google.common.base.Preconditions) VisibleForTesting(com.google.common.annotations.VisibleForTesting) KVSerde(org.apache.samza.serializers.KVSerde) 
RexShuttle(org.apache.calcite.rex.RexShuttle) SamzaSqlRelMessageSerdeFactory(org.apache.samza.sql.serializers.SamzaSqlRelMessageSerdeFactory) Collections(java.util.Collections) SqlExplainFormat(org.apache.calcite.sql.SqlExplainFormat) RexCall(org.apache.calcite.rex.RexCall) SamzaSqlRelRecordSerdeFactory(org.apache.samza.sql.serializers.SamzaSqlRelRecordSerdeFactory) KV(org.apache.samza.operators.KV) SamzaException(org.apache.samza.SamzaException) SamzaSqlRelMessageSerdeFactory(org.apache.samza.sql.serializers.SamzaSqlRelMessageSerdeFactory) SamzaSqlRelMessage(org.apache.samza.sql.data.SamzaSqlRelMessage)

Example 28 with MessageStream

use of org.apache.samza.operators.MessageStream in project samza by apache.

In class SystemConsumerWithSamzaBench, method start:

/**
 * Runs the benchmark: starts a stream application that maps every message of the configured
 * input stream through a {@code MessageConsumer}, waits until {@code totalEvents} messages have
 * been consumed, kills the runner, prints the throughput and terminates the JVM via
 * {@code System.exit(0)}.
 *
 * <p>The event rate is computed once, against the end time captured before the runner is killed,
 * so both printed rate lines agree with the reported "Ending at" timestamp.
 *
 * @throws IOException declared on this method's signature; which call raises it is not visible here
 * @throws InterruptedException if the thread is interrupted while waiting for events
 */
public void start() throws IOException, InterruptedException {
    super.start();
    MessageConsumer consumeFn = new MessageConsumer();
    StreamApplication app = appDesc -> {
        String systemFactoryName = new SystemConfig(config).getSystemFactory(systemName).get();
        GenericSystemDescriptor sd = new GenericSystemDescriptor(systemName, systemFactoryName);
        GenericInputDescriptor<Object> isd = sd.getInputDescriptor(streamId, new NoOpSerde<>());
        MessageStream<Object> stream = appDesc.getInputStream(isd);
        stream.map(consumeFn);
    };
    ApplicationRunner runner = ApplicationRunners.getApplicationRunner(app, new MapConfig());
    runner.run();
    // Poll the consumer's counter until the requested number of events has been seen.
    while (consumeFn.getEventsConsumed() < totalEvents) {
        Thread.sleep(10);
    }
    Instant endTime = Instant.now();
    runner.kill();

    // Widen to long before multiplying so large event counts cannot overflow int arithmetic.
    long eventsConsumed = consumeFn.getEventsConsumed();
    // Clamp to 1 ms so a run shorter than the clock resolution cannot divide by zero.
    long elapsedMillis = Math.max(1L, Duration.between(consumeFn.startTime, endTime).toMillis());
    long eventsPerSecond = eventsConsumed * 1000 / elapsedMillis;

    System.out.println("\n*******************");
    System.out.println(String.format("Started at %s Ending at %s ", consumeFn.startTime, endTime));
    System.out.println(String.format("Event Rate is %s Messages/Sec ", eventsPerSecond));
    System.out.println("Event Rate is " + eventsPerSecond);
    System.out.println("*******************\n");
    System.exit(0);
}
Also used : IntStream(java.util.stream.IntStream) GenericSystemDescriptor(org.apache.samza.system.descriptors.GenericSystemDescriptor) JobConfig(org.apache.samza.config.JobConfig) GenericInputDescriptor(org.apache.samza.system.descriptors.GenericInputDescriptor) MapFunction(org.apache.samza.operators.functions.MapFunction) AtomicInteger(java.util.concurrent.atomic.AtomicInteger) Duration(java.time.Duration) ApplicationConfig(org.apache.samza.config.ApplicationConfig) SystemConfig(org.apache.samza.config.SystemConfig) ApplicationRunners(org.apache.samza.runtime.ApplicationRunners) MapConfig(org.apache.samza.config.MapConfig) NoOpSerde(org.apache.samza.serializers.NoOpSerde) MessageStream(org.apache.samza.operators.MessageStream) ApplicationRunner(org.apache.samza.runtime.ApplicationRunner) LocalApplicationRunner(org.apache.samza.runtime.LocalApplicationRunner) Properties(java.util.Properties) TaskConfig(org.apache.samza.config.TaskConfig) IOException(java.io.IOException) Instant(java.time.Instant) PassthroughJobCoordinatorFactory(org.apache.samza.standalone.PassthroughJobCoordinatorFactory) Collectors(java.util.stream.Collectors) List(java.util.List) ParseException(org.apache.commons.cli.ParseException) JobCoordinatorConfig(org.apache.samza.config.JobCoordinatorConfig) StreamApplication(org.apache.samza.application.StreamApplication) Joiner(com.google.common.base.Joiner) SystemConfig(org.apache.samza.config.SystemConfig) GenericInputDescriptor(org.apache.samza.system.descriptors.GenericInputDescriptor) ApplicationRunner(org.apache.samza.runtime.ApplicationRunner) LocalApplicationRunner(org.apache.samza.runtime.LocalApplicationRunner) StreamApplication(org.apache.samza.application.StreamApplication) NoOpSerde(org.apache.samza.serializers.NoOpSerde) MessageStream(org.apache.samza.operators.MessageStream) Instant(java.time.Instant) MapConfig(org.apache.samza.config.MapConfig) GenericSystemDescriptor(org.apache.samza.system.descriptors.GenericSystemDescriptor)

Example 29 with MessageStream

use of org.apache.samza.operators.MessageStream in project beam by apache.

In class ParDoBoundMultiTranslator, method doTranslate:

/**
 * Translates a multi-output {@code ParDo} into Samza {@code MessageStream} operators.
 *
 * <p>The main input stream, merged with the side-input view streams when there are any, is run
 * through a single {@code DoFnOp} producing {@code RawUnionValue} messages. For every output
 * index a filtered copy of that stream is converted back to plain values and registered with the
 * context under the matching output {@code PCollection}.
 *
 * <p>Static for serializing anonymous functions.
 *
 * @param transform the multi-output ParDo being translated
 * @param node hierarchy node whose outputs are indexed
 * @param ctx translation context used to look up and register message streams
 * @throws UnsupportedOperationException if the DoFn is splittable or requires time-sorted input
 * @throws IllegalArgumentException if one of the node's outputs is not a {@code PCollection}
 */
private static <InT, OutT> void doTranslate(ParDo.MultiOutput<InT, OutT> transform, TransformHierarchy.Node node, TranslationContext ctx) {
    final PCollection<? extends InT> mainInput = ctx.getInput(transform);
    final Map<TupleTag<?>, Coder<?>> codersByTag = ctx.getCurrentTransform().getOutputs().entrySet().stream()
            .filter(entry -> entry.getValue() instanceof PCollection)
            .collect(Collectors.toMap(entry -> entry.getKey(), entry -> ((PCollection<?>) entry.getValue()).getCoder()));

    // A key coder is only extracted for stateful DoFns; otherwise it stays null.
    final Coder<?> stateKeyCoder;
    if (StateUtils.isStateful(transform.getFn())) {
        stateKeyCoder = ((KvCoder<?, ?>) mainInput.getCoder()).getKeyCoder();
    } else {
        stateKeyCoder = null;
    }

    if (DoFnSignatures.isSplittable(transform.getFn())) {
        throw new UnsupportedOperationException("Splittable DoFn is not currently supported");
    }
    if (DoFnSignatures.requiresTimeSortedInput(transform.getFn())) {
        throw new UnsupportedOperationException("@RequiresTimeSortedInput annotation is not currently supported");
    }

    final MessageStream<OpMessage<InT>> mainStream = ctx.getMessageStream(mainInput);
    final List<MessageStream<OpMessage<InT>>> viewStreams = transform.getSideInputs().values().stream()
            .map(ctx::<InT>getViewStream)
            .collect(Collectors.toList());

    // Give every tagged output a position; the same index is later used as the union tag filter.
    final List<Map.Entry<TupleTag<?>, PCollection<?>>> taggedOutputs = new ArrayList<>(node.getOutputs().entrySet());
    final Map<TupleTag<?>, Integer> indexByTag = new HashMap<>();
    final Map<Integer, PCollection<?>> collectionByIndex = new HashMap<>();
    int position = 0;
    for (Map.Entry<TupleTag<?>, PCollection<?>> taggedOutput : taggedOutputs) {
        indexByTag.put(taggedOutput.getKey(), position);
        if (!(taggedOutput.getValue() instanceof PCollection)) {
            throw new IllegalArgumentException("Expected side output to be PCollection, but was: " + taggedOutput.getValue());
        }
        collectionByIndex.put(position, taggedOutput.getValue());
        position++;
    }

    final HashMap<String, PCollectionView<?>> viewsById = new HashMap<>();
    transform.getSideInputs().values().forEach(view -> viewsById.put(ctx.getViewId(view), view));

    final DoFnSchemaInformation schemaInformation = ParDoTranslation.getSchemaInformation(ctx.getCurrentTransform());
    final Map<String, PCollectionView<?>> sideInputMapping = ParDoTranslation.getSideInputMapping(ctx.getCurrentTransform());

    // Constructor arguments are positional; keep this order exactly.
    final DoFnOp<InT, OutT, RawUnionValue> doFnOp = new DoFnOp<>(
            transform.getMainOutputTag(),
            transform.getFn(),
            stateKeyCoder,
            (Coder<InT>) mainInput.getCoder(),
            null,
            codersByTag,
            transform.getSideInputs().values(),
            transform.getAdditionalOutputTags().getAll(),
            mainInput.getWindowingStrategy(),
            viewsById,
            new DoFnOp.MultiOutputManagerFactory(indexByTag),
            ctx.getTransformFullName(),
            ctx.getTransformId(),
            mainInput.isBounded(),
            false,
            null,
            null,
            Collections.emptyMap(),
            schemaInformation,
            sideInputMapping);

    final MessageStream<OpMessage<InT>> opInput;
    if (!viewStreams.isEmpty()) {
        MessageStream<OpMessage<InT>> mergedViewStreams = MessageStream.mergeAll(viewStreams).flatMap(new SideInputWatermarkFn());
        opInput = mainStream.merge(Collections.singletonList(mergedViewStreams));
    } else {
        opInput = mainStream;
    }

    final MessageStream<OpMessage<RawUnionValue>> unionStream = opInput.flatMapAsync(OpAdapter.adapt(doFnOp));
    for (int outputIndex : indexByTag.values()) {
        // Non-element messages pass through to every output; elements only to the output whose union tag matches.
        final MessageStream<OpMessage<RawUnionValue>> selected = unionStream.filter(
                msg -> msg.getType() != OpMessage.Type.ELEMENT || msg.getElement().getValue().getUnionTag() == outputIndex);
        @SuppressWarnings("unchecked") final MessageStream<OpMessage<OutT>> outputStream =
                selected.flatMapAsync(OpAdapter.adapt(new RawUnionValueToValue()));
        ctx.registerMessageStream(collectionByIndex.get(outputIndex), outputStream);
    }
}
Also used : WindowedValue(org.apache.beam.sdk.util.WindowedValue) PCollectionViews(org.apache.beam.sdk.values.PCollectionViews) OpMessage(org.apache.beam.runners.samza.runtime.OpMessage) DoFnSchemaInformation(org.apache.beam.sdk.transforms.DoFnSchemaInformation) RunnerPCollectionView(org.apache.beam.runners.core.construction.RunnerPCollectionView) WatermarkFunction(org.apache.samza.operators.functions.WatermarkFunction) DoFnSignatures(org.apache.beam.sdk.transforms.reflect.DoFnSignatures) AtomicInteger(java.util.concurrent.atomic.AtomicInteger) Map(java.util.Map) Iterators(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.Iterators) StateUtils(org.apache.beam.runners.samza.util.StateUtils) KvCoder(org.apache.beam.sdk.coders.KvCoder) Collection(java.util.Collection) ServiceLoader(java.util.ServiceLoader) Collectors(java.util.stream.Collectors) TransformHierarchy(org.apache.beam.sdk.runners.TransformHierarchy) FlatMapFunction(org.apache.samza.operators.functions.FlatMapFunction) List(java.util.List) ParDo(org.apache.beam.sdk.transforms.ParDo) TypeDescriptors(org.apache.beam.sdk.values.TypeDescriptors) OpEmitter(org.apache.beam.runners.samza.runtime.OpEmitter) PipelineTranslatorUtils.instantiateCoder(org.apache.beam.runners.fnexecution.translation.PipelineTranslatorUtils.instantiateCoder) WindowingStrategy(org.apache.beam.sdk.values.WindowingStrategy) SamzaPipelineTranslatorUtils(org.apache.beam.runners.samza.util.SamzaPipelineTranslatorUtils) KV(org.apache.beam.sdk.values.KV) TypeDescriptor(org.apache.beam.sdk.values.TypeDescriptor) OpAdapter(org.apache.beam.runners.samza.runtime.OpAdapter) DoFnOp(org.apache.beam.runners.samza.runtime.DoFnOp) Coder(org.apache.beam.sdk.coders.Coder) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) RawUnionValue(org.apache.beam.sdk.transforms.join.RawUnionValue) ViewFn(org.apache.beam.sdk.transforms.ViewFn) TupleTag(org.apache.beam.sdk.values.TupleTag) 
SamzaDoFnInvokerRegistrar(org.apache.beam.runners.samza.runtime.SamzaDoFnInvokerRegistrar) WindowUtils(org.apache.beam.runners.samza.util.WindowUtils) SideInputId(org.apache.beam.model.pipeline.v1.RunnerApi.ExecutableStagePayload.SideInputId) ParDoTranslation(org.apache.beam.runners.core.construction.ParDoTranslation) MessageStream(org.apache.samza.operators.MessageStream) RunnerApi(org.apache.beam.model.pipeline.v1.RunnerApi) DoFn(org.apache.beam.sdk.transforms.DoFn) QueryablePipeline(org.apache.beam.runners.core.construction.graph.QueryablePipeline) Op(org.apache.beam.runners.samza.runtime.Op) DoFnSignature(org.apache.beam.sdk.transforms.reflect.DoFnSignature) Iterator(java.util.Iterator) IterableCoder(org.apache.beam.sdk.coders.IterableCoder) IOException(java.io.IOException) PCollection(org.apache.beam.sdk.values.PCollection) SamzaPipelineOptions(org.apache.beam.runners.samza.SamzaPipelineOptions) PCollectionView(org.apache.beam.sdk.values.PCollectionView) BoundedWindow(org.apache.beam.sdk.transforms.windowing.BoundedWindow) Instant(org.joda.time.Instant) PipelineNode(org.apache.beam.runners.core.construction.graph.PipelineNode) VoidCoder(org.apache.beam.sdk.coders.VoidCoder) Collections(java.util.Collections) OpMessage(org.apache.beam.runners.samza.runtime.OpMessage) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) TupleTag(org.apache.beam.sdk.values.TupleTag) DoFnOp(org.apache.beam.runners.samza.runtime.DoFnOp) MessageStream(org.apache.samza.operators.MessageStream) KvCoder(org.apache.beam.sdk.coders.KvCoder) PipelineTranslatorUtils.instantiateCoder(org.apache.beam.runners.fnexecution.translation.PipelineTranslatorUtils.instantiateCoder) Coder(org.apache.beam.sdk.coders.Coder) IterableCoder(org.apache.beam.sdk.coders.IterableCoder) VoidCoder(org.apache.beam.sdk.coders.VoidCoder) RawUnionValue(org.apache.beam.sdk.transforms.join.RawUnionValue) AtomicInteger(java.util.concurrent.atomic.AtomicInteger) PCollection(org.apache.beam.sdk.values.PCollection) 
RunnerPCollectionView(org.apache.beam.runners.core.construction.RunnerPCollectionView) PCollectionView(org.apache.beam.sdk.values.PCollectionView) DoFnSchemaInformation(org.apache.beam.sdk.transforms.DoFnSchemaInformation) Map(java.util.Map) HashMap(java.util.HashMap)

Example 30 with MessageStream

use of org.apache.samza.operators.MessageStream in project beam by apache.

In class ParDoBoundMultiTranslator, method doTranslatePortable:

/**
 * Translates a portable executable stage into Samza {@code MessageStream} operators.
 *
 * <p>The stage payload is parsed from the transform spec. Each declared side input is grouped and
 * broadcast into its own stream, and each output name is given an index. The input stream, merged
 * with the side-input streams when there are any, is run through a single {@code DoFnOp}. One
 * filtered stream per output index is then registered with the context under the output's
 * collection id.
 *
 * <p>Static for serializing anonymous functions.
 *
 * @param transform the portable transform node holding the executable stage payload
 * @param pipeline pipeline whose components describe the stage input
 * @param ctx portable translation context used to look up and register message streams
 * @throws RuntimeException wrapping the {@code IOException} raised when the payload cannot be parsed
 */
private static <InT, OutT> void doTranslatePortable(PipelineNode.PTransformNode transform, QueryablePipeline pipeline, PortableTranslationContext ctx) {
    final Map<String, String> outputIdsByName = transform.getTransform().getOutputsMap();
    final RunnerApi.ExecutableStagePayload stagePayload;
    try {
        stagePayload = RunnerApi.ExecutableStagePayload.parseFrom(transform.getTransform().getSpec().getPayload());
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
    final String inputId = stagePayload.getInput();
    final MessageStream<OpMessage<InT>> mainStream = ctx.getMessageStreamById(inputId);

    // Analyze side inputs
    final List<MessageStream<OpMessage<Iterable<?>>>> sideInputStreams = new ArrayList<>();
    final Map<SideInputId, PCollectionView<?>> sideInputMapping = new HashMap<>();
    final Map<String, PCollectionView<?>> idToViewMapping = new HashMap<>();
    final RunnerApi.Components components = stagePayload.getComponents();
    for (SideInputId sideInputId : stagePayload.getSideInputsList()) {
        final String sideInputCollectionId =
                components.getTransformsOrThrow(sideInputId.getTransformId()).getInputsOrThrow(sideInputId.getLocalName());
        final WindowingStrategy<?, BoundedWindow> windowingStrategy =
                WindowUtils.getWindowStrategy(sideInputCollectionId, components);
        final WindowedValue.WindowedValueCoder<?> coder =
                (WindowedValue.WindowedValueCoder) instantiateCoder(sideInputCollectionId, components);

        // Create a runner-side view
        final PCollectionView<?> view = createPCollectionView(sideInputId, coder, windowingStrategy);

        // Use GBK to aggregate the side inputs and then broadcast it out
        final MessageStream<OpMessage<Iterable<?>>> broadcastSideInput = groupAndBroadcastSideInput(
                sideInputId,
                sideInputCollectionId,
                components.getPcollectionsOrThrow(sideInputCollectionId),
                (WindowingStrategy) windowingStrategy,
                coder,
                ctx);

        sideInputStreams.add(broadcastSideInput);
        sideInputMapping.put(sideInputId, view);
        idToViewMapping.put(getSideInputUniqueId(sideInputId), view);
    }

    final Map<TupleTag<?>, Integer> tagToIndexMap = new HashMap<>();
    final Map<Integer, String> indexToIdMap = new HashMap<>();
    final Map<String, TupleTag<?>> idToTupleTagMap = new HashMap<>();

    // first output as the main output
    final TupleTag<OutT> mainOutputTag =
            outputIdsByName.isEmpty() ? null : new TupleTag(outputIdsByName.keySet().iterator().next());

    // Number the outputs in key-set iteration order.
    int position = 0;
    for (String outputName : outputIdsByName.keySet()) {
        final TupleTag<?> tupleTag = new TupleTag<>(outputName);
        tagToIndexMap.put(tupleTag, position);
        final String collectionId = outputIdsByName.get(outputName);
        indexToIdMap.put(position, collectionId);
        idToTupleTagMap.put(collectionId, tupleTag);
        position++;
    }

    final WindowedValue.WindowedValueCoder<InT> windowedInputCoder =
            WindowUtils.instantiateWindowedCoder(inputId, pipeline.getComponents());

    // TODO: support schema and side inputs for portable runner
    // Note: transform.getTransform() is an ExecutableStage, not ParDo, so we need to extract
    // these info from its components.
    final DoFnSchemaInformation doFnSchemaInformation = null;

    final RunnerApi.PCollection input = pipeline.getComponents().getPcollectionsOrThrow(inputId);
    final PCollection.IsBounded isBounded = SamzaPipelineTranslatorUtils.isBounded(input);

    // A key coder is only extracted for stateful stages; otherwise it stays null.
    final Coder<?> keyCoder;
    if (StateUtils.isStateful(stagePayload)) {
        keyCoder = ((KvCoder) ((WindowedValue.FullWindowedValueCoder) windowedInputCoder).getValueCoder()).getKeyCoder();
    } else {
        keyCoder = null;
    }

    // Constructor arguments are positional; keep this order exactly.
    final DoFnOp<InT, OutT, RawUnionValue> doFnOp = new DoFnOp<>(
            mainOutputTag,
            new NoOpDoFn<>(),
            keyCoder,
            // input coder not in use
            windowedInputCoder.getValueCoder(),
            windowedInputCoder,
            // output coders not in use
            Collections.emptyMap(),
            new ArrayList<>(sideInputMapping.values()),
            // used by java runner only
            new ArrayList<>(idToTupleTagMap.values()),
            WindowUtils.getWindowStrategy(inputId, stagePayload.getComponents()),
            idToViewMapping,
            new DoFnOp.MultiOutputManagerFactory(tagToIndexMap),
            ctx.getTransformFullName(),
            ctx.getTransformId(),
            isBounded,
            true,
            stagePayload,
            ctx.getJobInfo(),
            idToTupleTagMap,
            doFnSchemaInformation,
            sideInputMapping);

    final MessageStream<OpMessage<InT>> opInput;
    if (!sideInputStreams.isEmpty()) {
        MessageStream<OpMessage<InT>> mergedSideInputStreams = MessageStream.mergeAll(sideInputStreams).flatMap(new SideInputWatermarkFn());
        opInput = mainStream.merge(Collections.singletonList(mergedSideInputStreams));
    } else {
        opInput = mainStream;
    }

    final MessageStream<OpMessage<RawUnionValue>> unionStream = opInput.flatMapAsync(OpAdapter.adapt(doFnOp));
    for (int outputIndex : tagToIndexMap.values()) {
        // Non-element messages pass through to every output; elements only to the output whose union tag matches.
        final MessageStream<OpMessage<RawUnionValue>> selected = unionStream.filter(
                msg -> msg.getType() != OpMessage.Type.ELEMENT || msg.getElement().getValue().getUnionTag() == outputIndex);
        @SuppressWarnings("unchecked") final MessageStream<OpMessage<OutT>> outputStream =
                selected.flatMapAsync(OpAdapter.adapt(new RawUnionValueToValue()));
        ctx.registerMessageStream(indexToIdMap.get(outputIndex), outputStream);
    }
}
Also used : WindowedValue(org.apache.beam.sdk.util.WindowedValue) PCollectionViews(org.apache.beam.sdk.values.PCollectionViews) OpMessage(org.apache.beam.runners.samza.runtime.OpMessage) DoFnSchemaInformation(org.apache.beam.sdk.transforms.DoFnSchemaInformation) RunnerPCollectionView(org.apache.beam.runners.core.construction.RunnerPCollectionView) WatermarkFunction(org.apache.samza.operators.functions.WatermarkFunction) DoFnSignatures(org.apache.beam.sdk.transforms.reflect.DoFnSignatures) AtomicInteger(java.util.concurrent.atomic.AtomicInteger) Map(java.util.Map) Iterators(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.Iterators) StateUtils(org.apache.beam.runners.samza.util.StateUtils) KvCoder(org.apache.beam.sdk.coders.KvCoder) Collection(java.util.Collection) ServiceLoader(java.util.ServiceLoader) Collectors(java.util.stream.Collectors) TransformHierarchy(org.apache.beam.sdk.runners.TransformHierarchy) FlatMapFunction(org.apache.samza.operators.functions.FlatMapFunction) List(java.util.List) ParDo(org.apache.beam.sdk.transforms.ParDo) TypeDescriptors(org.apache.beam.sdk.values.TypeDescriptors) OpEmitter(org.apache.beam.runners.samza.runtime.OpEmitter) PipelineTranslatorUtils.instantiateCoder(org.apache.beam.runners.fnexecution.translation.PipelineTranslatorUtils.instantiateCoder) WindowingStrategy(org.apache.beam.sdk.values.WindowingStrategy) SamzaPipelineTranslatorUtils(org.apache.beam.runners.samza.util.SamzaPipelineTranslatorUtils) KV(org.apache.beam.sdk.values.KV) TypeDescriptor(org.apache.beam.sdk.values.TypeDescriptor) OpAdapter(org.apache.beam.runners.samza.runtime.OpAdapter) DoFnOp(org.apache.beam.runners.samza.runtime.DoFnOp) Coder(org.apache.beam.sdk.coders.Coder) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) RawUnionValue(org.apache.beam.sdk.transforms.join.RawUnionValue) ViewFn(org.apache.beam.sdk.transforms.ViewFn) TupleTag(org.apache.beam.sdk.values.TupleTag) 
SamzaDoFnInvokerRegistrar(org.apache.beam.runners.samza.runtime.SamzaDoFnInvokerRegistrar) WindowUtils(org.apache.beam.runners.samza.util.WindowUtils) SideInputId(org.apache.beam.model.pipeline.v1.RunnerApi.ExecutableStagePayload.SideInputId) ParDoTranslation(org.apache.beam.runners.core.construction.ParDoTranslation) MessageStream(org.apache.samza.operators.MessageStream) RunnerApi(org.apache.beam.model.pipeline.v1.RunnerApi) DoFn(org.apache.beam.sdk.transforms.DoFn) QueryablePipeline(org.apache.beam.runners.core.construction.graph.QueryablePipeline) Op(org.apache.beam.runners.samza.runtime.Op) DoFnSignature(org.apache.beam.sdk.transforms.reflect.DoFnSignature) Iterator(java.util.Iterator) IterableCoder(org.apache.beam.sdk.coders.IterableCoder) IOException(java.io.IOException) PCollection(org.apache.beam.sdk.values.PCollection) SamzaPipelineOptions(org.apache.beam.runners.samza.SamzaPipelineOptions) PCollectionView(org.apache.beam.sdk.values.PCollectionView) BoundedWindow(org.apache.beam.sdk.transforms.windowing.BoundedWindow) Instant(org.joda.time.Instant) PipelineNode(org.apache.beam.runners.core.construction.graph.PipelineNode) VoidCoder(org.apache.beam.sdk.coders.VoidCoder) Collections(java.util.Collections) OpMessage(org.apache.beam.runners.samza.runtime.OpMessage) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) TupleTag(org.apache.beam.sdk.values.TupleTag) RunnerApi(org.apache.beam.model.pipeline.v1.RunnerApi) WindowedValue(org.apache.beam.sdk.util.WindowedValue) MessageStream(org.apache.samza.operators.MessageStream) BoundedWindow(org.apache.beam.sdk.transforms.windowing.BoundedWindow) SideInputId(org.apache.beam.model.pipeline.v1.RunnerApi.ExecutableStagePayload.SideInputId) RawUnionValue(org.apache.beam.sdk.transforms.join.RawUnionValue) RunnerPCollectionView(org.apache.beam.runners.core.construction.RunnerPCollectionView) PCollectionView(org.apache.beam.sdk.values.PCollectionView) 
DoFnSchemaInformation(org.apache.beam.sdk.transforms.DoFnSchemaInformation) AtomicInteger(java.util.concurrent.atomic.AtomicInteger) DoFnOp(org.apache.beam.runners.samza.runtime.DoFnOp) IOException(java.io.IOException) AtomicInteger(java.util.concurrent.atomic.AtomicInteger) PCollection(org.apache.beam.sdk.values.PCollection)

Aggregations

MessageStream (org.apache.samza.operators.MessageStream): 34; Config (org.apache.samza.config.Config): 22; KVSerde (org.apache.samza.serializers.KVSerde): 21; Duration (java.time.Duration): 19; HashMap (java.util.HashMap): 19; OutputStream (org.apache.samza.operators.OutputStream): 19; KV (org.apache.samza.operators.KV): 18; Map (java.util.Map): 17; ArrayList (java.util.ArrayList): 16; List (java.util.List): 16; StringSerde (org.apache.samza.serializers.StringSerde): 16; Test (org.junit.Test): 16; Collection (java.util.Collection): 14; StreamApplicationDescriptorImpl (org.apache.samza.application.descriptors.StreamApplicationDescriptorImpl): 14; JobConfig (org.apache.samza.config.JobConfig): 14; MapConfig (org.apache.samza.config.MapConfig): 14; Windows (org.apache.samza.operators.windows.Windows): 13; Collections (java.util.Collections): 12; JoinFunction (org.apache.samza.operators.functions.JoinFunction): 12; ApplicationRunner (org.apache.samza.runtime.ApplicationRunner): 12