Search in sources:

Example 6 with DoFnSchemaInformation

use of org.apache.beam.sdk.transforms.DoFnSchemaInformation in project beam by apache.

The method testSingleFieldAccess of the class BeamCalcRelTest.

/**
 * Compiles a query that selects only {@code order_id} and asserts that the schema information
 * derived for the producing ParDo reports access to exactly that one field. The pipeline is then
 * run to completion.
 */
@Test
public void testSingleFieldAccess() throws IllegalAccessException {
    PCollection<Row> result = compilePipeline("SELECT order_id FROM ORDER_DETAILS_BOUNDED", pipeline);

    // Traverse the pipeline so the NodeGetter can populate its `producer` node
    // (presumably the transform that outputs `result` -- confirm against NodeGetter).
    final NodeGetter producerFinder = new NodeGetter(result);
    pipeline.traverseTopologically(producerFinder);

    ParDo.MultiOutput<Row, Row> producingParDo = (ParDo.MultiOutput<Row, Row>) producerFinder.producer.getTransform();
    PCollection<Row> parDoInput = (PCollection<Row>) Iterables.getOnlyElement(producerFinder.producer.getInputs().values());

    // Derive the field access descriptor from the DoFn and its single input.
    FieldAccessDescriptor accessedFields = ParDo.getDoFnSchemaInformation(producingParDo.getFn(), parDoInput).getFieldAccessDescriptor();

    Assert.assertTrue(accessedFields.referencesSingleField());
    Assert.assertEquals("order_id", Iterables.getOnlyElement(accessedFields.fieldNamesAccessed()));
    pipeline.run().waitUntilFinish();
}
Also used : PCollection(org.apache.beam.sdk.values.PCollection) FieldAccessDescriptor(org.apache.beam.sdk.schemas.FieldAccessDescriptor) DoFnSchemaInformation(org.apache.beam.sdk.transforms.DoFnSchemaInformation) ParDo(org.apache.beam.sdk.transforms.ParDo) Row(org.apache.beam.sdk.values.Row) Test(org.junit.Test)

Example 7 with DoFnSchemaInformation

use of org.apache.beam.sdk.transforms.DoFnSchemaInformation in project beam by apache.

The method open of the class DoFnOperator.

/**
 * Initializes this operator before processing starts: obtains the DoFn, calls
 * {@code DoFnInvokers.tryInvokeSetupFor} on it, builds the chain of {@code DoFnRunner} wrappers
 * (optional stable-input buffering, the subclass-provided wrapper, optional metrics), schedules
 * the periodic finish-bundle check and creates the pushback runner and bundle finalizer state.
 *
 * <p>The statements are order-sensitive: each wrapper is applied on top of the runner produced by
 * the previous step.
 *
 * @throws Exception if any of the initialization steps fails
 */
@Override
public void open() throws Exception {
    // WindowDoFnOperator needs state and timers in order to obtain its DoFn, so the DoFn can only
    // be fetched once StateInternals and TimerInternals are ready.
    // This method is called after initializeState().
    this.doFn = getDoFn();
    FlinkPipelineOptions options = serializedOptions.get().as(FlinkPipelineOptions.class);
    doFnInvoker = DoFnInvokers.tryInvokeSetupFor(doFn, options);
    StepContext stepContext = new FlinkStepContext();
    // Root runner; every wrapper below is layered on top of it.
    doFnRunner = DoFnRunners.simpleRunner(options, doFn, sideInputReader, outputManager, mainOutputTag, additionalOutputTags, stepContext, getInputCoder(), outputCoders, windowingStrategy, doFnSchemaInformation, sideInputMapping);
    if (requiresStableInput) {
        // Put the buffering runner directly in front of the root runner, before any additional
        // wrappers. The same instance is kept in bufferingDoFnRunner as well as in doFnRunner.
        doFnRunner = bufferingDoFnRunner = BufferingDoFnRunner.create(doFnRunner, "stable-input-buffer", windowedInputCoder, windowingStrategy.getWindowFn().windowCoder(), getOperatorStateBackend(), getKeyedStateBackend(), options.getNumConcurrentCheckpoints(), serializedOptions);
    }
    doFnRunner = createWrappingDoFnRunner(doFnRunner, stepContext);
    earlyBindStateIfNeeded();
    if (!options.getDisableMetrics()) {
        // Wrap the runner in DoFnRunnerWithMetricsUpdate, backed by a FlinkMetricContainer created
        // from the runtime context.
        flinkMetricContainer = new FlinkMetricContainer(getRuntimeContext());
        doFnRunner = new DoFnRunnerWithMetricsUpdate<>(stepName, doFnRunner, flinkMetricContainer);
        // Checkpoint statistics are only set up when a metric namespace has been configured.
        String checkpointMetricNamespace = options.getReportCheckpointDuration();
        if (checkpointMetricNamespace != null) {
            MetricName checkpointMetric = MetricName.named(checkpointMetricNamespace, "checkpoint_duration");
            // CheckpointStats is handed a supplier of the distribution, not the distribution itself.
            checkpointStats = new CheckpointStats(() -> flinkMetricContainer.getMetricsContainer(stepName).getDistribution(checkpointMetric));
        }
    }
    // Reset bundle bookkeeping: no elements seen yet, last finish time is "now".
    elementCount = 0L;
    lastFinishBundleTime = getProcessingTimeService().getCurrentProcessingTime();
    // Schedule a timer that checks whether the finish-bundle timeout has elapsed.
    // The period is half of maxBundleTimeMills, but never less than 1.
    long bundleCheckPeriod = Math.max(maxBundleTimeMills / 2, 1);
    // NOTE(review): the two trailing arguments are presumably initial delay and period -- confirm
    // against the ProcessingTimeService in use.
    checkFinishBundleTimer = getProcessingTimeService().scheduleAtFixedRate(timestamp -> checkInvokeFinishBundleByTime(), bundleCheckPeriod, bundleCheckPeriod);
    // The ProcessFn of SplittableParDoViaKeyedWorkItems gets a ProcessFnRunner; every other DoFn
    // uses a SimplePushbackSideInputDoFnRunner.
    if (doFn instanceof SplittableParDoViaKeyedWorkItems.ProcessFn) {
        pushbackDoFnRunner = new ProcessFnRunner<>((DoFnRunner) doFnRunner, sideInputs, sideInputHandler);
    } else {
        pushbackDoFnRunner = SimplePushbackSideInputDoFnRunner.create(doFnRunner, sideInputs, sideInputHandler);
    }
    bundleFinalizer = new InMemoryBundleFinalizer();
    pendingFinalizations = new LinkedHashMap<>();
}
Also used : MetricName(org.apache.beam.sdk.metrics.MetricName) InternalTimeServiceManager(org.apache.flink.streaming.api.operators.InternalTimeServiceManager) FlinkMetricContainer(org.apache.beam.runners.flink.metrics.FlinkMetricContainer) Joiner(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.base.Joiner) ImmutableMap(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableMap) TimerInternals(org.apache.beam.runners.core.TimerInternals) DoFnSignatures(org.apache.beam.sdk.transforms.reflect.DoFnSignatures) Map(java.util.Map) InternalTimerService(org.apache.flink.streaming.api.operators.InternalTimerService) GlobalWindow(org.apache.beam.sdk.transforms.windowing.GlobalWindow) OperatorStateBackend(org.apache.flink.runtime.state.OperatorStateBackend) FlinkBroadcastStateInternals(org.apache.beam.runners.flink.translation.wrappers.streaming.state.FlinkBroadcastStateInternals) StateSnapshotContext(org.apache.flink.runtime.state.StateSnapshotContext) InternalTimer(org.apache.flink.streaming.api.operators.InternalTimer) OutputTag(org.apache.flink.util.OutputTag) Serializable(java.io.Serializable) Workarounds(org.apache.beam.runners.flink.translation.utils.Workarounds) Stream(java.util.stream.Stream) StructuredCoder(org.apache.beam.sdk.coders.StructuredCoder) DoFnInvokers(org.apache.beam.sdk.transforms.reflect.DoFnInvokers) OneInputStreamOperator(org.apache.flink.streaming.api.operators.OneInputStreamOperator) StatefulDoFnRunner(org.apache.beam.runners.core.StatefulDoFnRunner) VoidNamespace(org.apache.flink.runtime.state.VoidNamespace) KV(org.apache.beam.sdk.values.KV) PushbackSideInputDoFnRunner(org.apache.beam.runners.core.PushbackSideInputDoFnRunner) BundleFinalizer(org.apache.beam.sdk.transforms.DoFn.BundleFinalizer) MapStateDescriptor(org.apache.flink.api.common.state.MapStateDescriptor) ArrayList(java.util.ArrayList) LinkedHashMap(java.util.LinkedHashMap) InternalPriorityQueue(org.apache.flink.runtime.state.InternalPriorityQueue) 
CoderTypeSerializer(org.apache.beam.runners.flink.translation.types.CoderTypeSerializer) TupleTag(org.apache.beam.sdk.values.TupleTag) Output(org.apache.flink.streaming.api.operators.Output) StateInternals(org.apache.beam.runners.core.StateInternals) SideInputReader(org.apache.beam.runners.core.SideInputReader) DoFn(org.apache.beam.sdk.transforms.DoFn) TwoInputStreamOperator(org.apache.flink.streaming.api.operators.TwoInputStreamOperator) WindowNamespace(org.apache.beam.runners.core.StateNamespaces.WindowNamespace) NullSideInputReader(org.apache.beam.runners.core.NullSideInputReader) IOException(java.io.IOException) VisibleForTesting(org.apache.flink.annotation.VisibleForTesting) NoopLock(org.apache.beam.sdk.util.NoopLock) Lock(java.util.concurrent.locks.Lock) MapState(org.apache.flink.api.common.state.MapState) PCollectionView(org.apache.beam.sdk.values.PCollectionView) BoundedWindow(org.apache.beam.sdk.transforms.windowing.BoundedWindow) VarIntCoder(org.apache.beam.sdk.coders.VarIntCoder) FileSystems(org.apache.beam.sdk.io.FileSystems) TimeDomain(org.apache.beam.sdk.state.TimeDomain) SplittableParDoViaKeyedWorkItems(org.apache.beam.runners.core.SplittableParDoViaKeyedWorkItems) StateSpec(org.apache.beam.sdk.state.StateSpec) ScheduledFuture(java.util.concurrent.ScheduledFuture) StateNamespace(org.apache.beam.runners.core.StateNamespace) SerializablePipelineOptions(org.apache.beam.runners.core.construction.SerializablePipelineOptions) WindowedValue(org.apache.beam.sdk.util.WindowedValue) FlinkPipelineOptions(org.apache.beam.runners.flink.FlinkPipelineOptions) DoFnRunner(org.apache.beam.runners.core.DoFnRunner) CheckpointingMode(org.apache.flink.streaming.api.CheckpointingMode) LoggerFactory(org.slf4j.LoggerFactory) StepContext(org.apache.beam.runners.core.StepContext) StringSerializer(org.apache.flink.api.common.typeutils.base.StringSerializer) DoFnRunners(org.apache.beam.runners.core.DoFnRunners) ByteBuffer(java.nio.ByteBuffer) 
DoFnSchemaInformation(org.apache.beam.sdk.transforms.DoFnSchemaInformation) ListState(org.apache.flink.api.common.state.ListState) ChainingStrategy(org.apache.flink.streaming.api.operators.ChainingStrategy) CheckpointStats(org.apache.beam.runners.flink.translation.utils.CheckpointStats) Iterables(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.Iterables) ListStateDescriptor(org.apache.flink.api.common.state.ListStateDescriptor) DoFnInvoker(org.apache.beam.sdk.transforms.reflect.DoFnInvoker) KeySelector(org.apache.flink.api.java.functions.KeySelector) StreamTask(org.apache.flink.streaming.runtime.tasks.StreamTask) Collection(java.util.Collection) Collectors(java.util.stream.Collectors) List(java.util.List) Preconditions.checkArgument(org.apache.flink.util.Preconditions.checkArgument) Optional(java.util.Optional) SuppressFBWarnings(edu.umd.cs.findbugs.annotations.SuppressFBWarnings) WindowingStrategy(org.apache.beam.sdk.values.WindowingStrategy) StreamConfig(org.apache.flink.streaming.api.graph.StreamConfig) StateAndTimerBundleCheckpointHandler(org.apache.beam.runners.fnexecution.control.BundleCheckpointHandlers.StateAndTimerBundleCheckpointHandler) Coder(org.apache.beam.sdk.coders.Coder) Watermark(org.apache.flink.streaming.api.watermark.Watermark) HashMap(java.util.HashMap) ProcessFnRunner(org.apache.beam.runners.core.ProcessFnRunner) RawUnionValue(org.apache.beam.sdk.transforms.join.RawUnionValue) StreamRecord(org.apache.flink.streaming.runtime.streamrecord.StreamRecord) SideInputHandler(org.apache.beam.runners.core.SideInputHandler) FlinkStateInternals(org.apache.beam.runners.flink.translation.wrappers.streaming.state.FlinkStateInternals) TimerData(org.apache.beam.runners.core.TimerInternals.TimerData) PipelineOptions(org.apache.beam.sdk.options.PipelineOptions) Nullable(org.checkerframework.checker.nullness.qual.Nullable) DoFnRunnerWithMetricsUpdate(org.apache.beam.runners.flink.metrics.DoFnRunnerWithMetricsUpdate) 
OutputStream(java.io.OutputStream) DoFnSignature(org.apache.beam.sdk.transforms.reflect.DoFnSignature) Triggerable(org.apache.flink.streaming.api.operators.Triggerable) Logger(org.slf4j.Logger) Iterator(java.util.Iterator) KeyedStateBackend(org.apache.flink.runtime.state.KeyedStateBackend) SimplePushbackSideInputDoFnRunner(org.apache.beam.runners.core.SimplePushbackSideInputDoFnRunner) InMemoryBundleFinalizer(org.apache.beam.runners.core.InMemoryBundleFinalizer) Preconditions(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.base.Preconditions) Instant(org.joda.time.Instant) BufferingDoFnRunner(org.apache.beam.runners.flink.translation.wrappers.streaming.stableinput.BufferingDoFnRunner) InputStream(java.io.InputStream) StateInitializationContext(org.apache.flink.runtime.state.StateInitializationContext) StepContext(org.apache.beam.runners.core.StepContext) CheckpointStats(org.apache.beam.runners.flink.translation.utils.CheckpointStats) FlinkPipelineOptions(org.apache.beam.runners.flink.FlinkPipelineOptions) MetricName(org.apache.beam.sdk.metrics.MetricName) InMemoryBundleFinalizer(org.apache.beam.runners.core.InMemoryBundleFinalizer) StatefulDoFnRunner(org.apache.beam.runners.core.StatefulDoFnRunner) PushbackSideInputDoFnRunner(org.apache.beam.runners.core.PushbackSideInputDoFnRunner) DoFnRunner(org.apache.beam.runners.core.DoFnRunner) SimplePushbackSideInputDoFnRunner(org.apache.beam.runners.core.SimplePushbackSideInputDoFnRunner) BufferingDoFnRunner(org.apache.beam.runners.flink.translation.wrappers.streaming.stableinput.BufferingDoFnRunner) FlinkMetricContainer(org.apache.beam.runners.flink.metrics.FlinkMetricContainer)

Example 8 with DoFnSchemaInformation

use of org.apache.beam.sdk.transforms.DoFnSchemaInformation in project beam by apache.

The method create of the class ParDoEvaluator.

/**
 * Creates a {@code ParDoEvaluator} for {@code fn}.
 *
 * <p>An output manager and a side-input reader are obtained from the evaluation context, the
 * coder of every output collection is gathered into a map keyed by tag, and {@code runnerFactory}
 * is asked for the runner. The result is produced by the four-argument {@code create} overload.
 *
 * @return the evaluator built around the runner created by {@code runnerFactory}
 */
public static <InputT, OutputT> ParDoEvaluator<InputT> create(
        EvaluationContext evaluationContext,
        PipelineOptions options,
        DirectStepContext stepContext,
        AppliedPTransform<?, ?, ?> application,
        Coder<InputT> inputCoder,
        WindowingStrategy<?, ? extends BoundedWindow> windowingStrategy,
        DoFn<InputT, OutputT> fn,
        StructuralKey<?> key,
        List<PCollectionView<?>> sideInputs,
        TupleTag<OutputT> mainOutputTag,
        List<TupleTag<?>> additionalOutputTags,
        Map<TupleTag<?>, PCollection<?>> outputs,
        DoFnSchemaInformation doFnSchemaInformation,
        Map<String, PCollectionView<?>> sideInputMapping,
        DoFnRunnerFactory<InputT, OutputT> runnerFactory) {
    BundleOutputManager bundleOutputs = createOutputManager(evaluationContext, key, outputs);
    ReadyCheckingSideInputReader readySideInputs = evaluationContext.createSideInputReader(sideInputs);

    // Coder of each output collection, keyed by that output's tag.
    Map<TupleTag<?>, Coder<?>> codersByTag =
            outputs.entrySet().stream()
                    .collect(Collectors.toMap(out -> out.getKey(), out -> out.getValue().getCoder()));

    PushbackSideInputDoFnRunner<InputT, OutputT> pushbackRunner =
            runnerFactory.createRunner(
                    options,
                    fn,
                    sideInputs,
                    readySideInputs,
                    bundleOutputs,
                    mainOutputTag,
                    additionalOutputTags,
                    stepContext,
                    inputCoder,
                    codersByTag,
                    windowingStrategy,
                    doFnSchemaInformation,
                    sideInputMapping);
    return create(pushbackRunner, stepContext, application, bundleOutputs);
}
Also used : StatefulDoFnRunner(org.apache.beam.runners.core.StatefulDoFnRunner) UserCodeException(org.apache.beam.sdk.util.UserCodeException) PushbackSideInputDoFnRunner(org.apache.beam.runners.core.PushbackSideInputDoFnRunner) WindowedValue(org.apache.beam.sdk.util.WindowedValue) KeyedWorkItemCoder(org.apache.beam.runners.core.KeyedWorkItemCoder) DoFnRunner(org.apache.beam.runners.core.DoFnRunner) Coder(org.apache.beam.sdk.coders.Coder) HashMap(java.util.HashMap) DoFnRunners(org.apache.beam.runners.core.DoFnRunners) DoFnSchemaInformation(org.apache.beam.sdk.transforms.DoFnSchemaInformation) DoFnSignatures(org.apache.beam.sdk.transforms.reflect.DoFnSignatures) TupleTag(org.apache.beam.sdk.values.TupleTag) Map(java.util.Map) Preconditions.checkArgument(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.base.Preconditions.checkArgument) TimerData(org.apache.beam.runners.core.TimerInternals.TimerData) PipelineOptions(org.apache.beam.sdk.options.PipelineOptions) AppliedPTransform(org.apache.beam.sdk.runners.AppliedPTransform) StructuralKey(org.apache.beam.runners.local.StructuralKey) DoFn(org.apache.beam.sdk.transforms.DoFn) ReadyCheckingSideInputReader(org.apache.beam.runners.core.ReadyCheckingSideInputReader) PCollection(org.apache.beam.sdk.values.PCollection) Collectors(java.util.stream.Collectors) List(java.util.List) DirectStepContext(org.apache.beam.runners.direct.DirectExecutionContext.DirectStepContext) SimplePushbackSideInputDoFnRunner(org.apache.beam.runners.core.SimplePushbackSideInputDoFnRunner) PCollectionView(org.apache.beam.sdk.values.PCollectionView) BoundedWindow(org.apache.beam.sdk.transforms.windowing.BoundedWindow) OutputManager(org.apache.beam.runners.core.DoFnRunners.OutputManager) ImmutableList(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableList) WindowingStrategy(org.apache.beam.sdk.values.WindowingStrategy) KeyedWorkItemCoder(org.apache.beam.runners.core.KeyedWorkItemCoder) 
Coder(org.apache.beam.sdk.coders.Coder) TupleTag(org.apache.beam.sdk.values.TupleTag) ReadyCheckingSideInputReader(org.apache.beam.runners.core.ReadyCheckingSideInputReader)

Example 9 with DoFnSchemaInformation

use of org.apache.beam.sdk.transforms.DoFnSchemaInformation in project beam by apache.

The method doTranslate of the class ParDoBoundMultiTranslator.

/**
 * Translates a {@code ParDo.MultiOutput} transform: builds a {@code DoFnOp} for it, feeds that op
 * the main input stream (merged with the side-input streams when there are any) and registers one
 * message stream per output collection.
 *
 * <p>Kept static so that the anonymous functions created here can be serialized.
 *
 * @throws UnsupportedOperationException if the DoFn is splittable or requires time-sorted input
 * @throws IllegalArgumentException if an output of {@code node} is not a {@code PCollection}
 */
private static <InT, OutT> void doTranslate(ParDo.MultiOutput<InT, OutT> transform, TransformHierarchy.Node node, TranslationContext ctx) {
    final PCollection<? extends InT> mainInput = ctx.getInput(transform);

    // Coders of every output of the current transform that is a PCollection, keyed by tag.
    final Map<TupleTag<?>, Coder<?>> codersByTag =
            ctx.getCurrentTransform().getOutputs().entrySet().stream()
                    .filter(out -> out.getValue() instanceof PCollection)
                    .collect(Collectors.toMap(out -> out.getKey(), out -> ((PCollection<?>) out.getValue()).getCoder()));

    // A key coder is only extracted for stateful DoFns; their input coder is cast to a KvCoder.
    final Coder<?> stateKeyCoder;
    if (StateUtils.isStateful(transform.getFn())) {
        stateKeyCoder = ((KvCoder<?, ?>) mainInput.getCoder()).getKeyCoder();
    } else {
        stateKeyCoder = null;
    }

    if (DoFnSignatures.isSplittable(transform.getFn())) {
        throw new UnsupportedOperationException("Splittable DoFn is not currently supported");
    }
    if (DoFnSignatures.requiresTimeSortedInput(transform.getFn())) {
        throw new UnsupportedOperationException("@RequiresTimeSortedInput annotation is not currently supported");
    }

    final MessageStream<OpMessage<InT>> mainStream = ctx.getMessageStream(mainInput);
    final List<MessageStream<OpMessage<InT>>> viewStreams =
            transform.getSideInputs().values().stream()
                    .map(ctx::<InT>getViewStream)
                    .collect(Collectors.toList());

    // Give every output a dense index, in the iteration order of the node's outputs.
    final List<Map.Entry<TupleTag<?>, PCollection<?>>> orderedOutputs = new ArrayList<>(node.getOutputs().entrySet());
    final Map<TupleTag<?>, Integer> indexByTag = new HashMap<>();
    final Map<Integer, PCollection<?>> collectionByIndex = new HashMap<>();
    int nextIndex = 0;
    for (Map.Entry<TupleTag<?>, PCollection<?>> taggedOutput : orderedOutputs) {
        indexByTag.put(taggedOutput.getKey(), nextIndex);
        if (!(taggedOutput.getValue() instanceof PCollection)) {
            throw new IllegalArgumentException("Expected side output to be PCollection, but was: " + taggedOutput.getValue());
        }
        collectionByIndex.put(nextIndex, taggedOutput.getValue());
        nextIndex++;
    }

    // Side-input views keyed by the id the translation context assigns to them.
    final HashMap<String, PCollectionView<?>> viewsById = new HashMap<>();
    for (PCollectionView<?> sideInputView : transform.getSideInputs().values()) {
        viewsById.put(ctx.getViewId(sideInputView), sideInputView);
    }

    final DoFnSchemaInformation schemaInformation = ParDoTranslation.getSchemaInformation(ctx.getCurrentTransform());
    final Map<String, PCollectionView<?>> sideInputViews = ParDoTranslation.getSideInputMapping(ctx.getCurrentTransform());

    final DoFnOp<InT, OutT, RawUnionValue> doFnOp =
            new DoFnOp<>(
                    transform.getMainOutputTag(),
                    transform.getFn(),
                    stateKeyCoder,
                    (Coder<InT>) mainInput.getCoder(),
                    null,
                    codersByTag,
                    transform.getSideInputs().values(),
                    transform.getAdditionalOutputTags().getAll(),
                    mainInput.getWindowingStrategy(),
                    viewsById,
                    new DoFnOp.MultiOutputManagerFactory(indexByTag),
                    ctx.getTransformFullName(),
                    ctx.getTransformId(),
                    mainInput.isBounded(),
                    // NOTE(review): the next four arguments look like portable-execution settings
                    // that are left unset on this path -- confirm against the DoFnOp constructor.
                    false,
                    null,
                    null,
                    Collections.emptyMap(),
                    schemaInformation,
                    sideInputViews);

    // With side inputs present, their merged stream is passed through SideInputWatermarkFn and
    // then merged into the main input; otherwise the main input is used as is.
    final MessageStream<OpMessage<InT>> opInput;
    if (!viewStreams.isEmpty()) {
        MessageStream<OpMessage<InT>> sideInputWatermarks = MessageStream.mergeAll(viewStreams).flatMap(new SideInputWatermarkFn());
        opInput = mainStream.merge(Collections.singletonList(sideInputWatermarks));
    } else {
        opInput = mainStream;
    }

    final MessageStream<OpMessage<RawUnionValue>> unionStream = opInput.flatMapAsync(OpAdapter.adapt(doFnOp));
    for (int outputIndex : indexByTag.values()) {
        // Non-element messages go to every output; elements only to the output whose union tag
        // equals this index.
        @SuppressWarnings("unchecked")
        final MessageStream<OpMessage<OutT>> perOutputStream =
                unionStream
                        .filter(msg -> msg.getType() != OpMessage.Type.ELEMENT || msg.getElement().getValue().getUnionTag() == outputIndex)
                        .flatMapAsync(OpAdapter.adapt(new RawUnionValueToValue()));
        ctx.registerMessageStream(collectionByIndex.get(outputIndex), perOutputStream);
    }
}
Also used : WindowedValue(org.apache.beam.sdk.util.WindowedValue) PCollectionViews(org.apache.beam.sdk.values.PCollectionViews) OpMessage(org.apache.beam.runners.samza.runtime.OpMessage) DoFnSchemaInformation(org.apache.beam.sdk.transforms.DoFnSchemaInformation) RunnerPCollectionView(org.apache.beam.runners.core.construction.RunnerPCollectionView) WatermarkFunction(org.apache.samza.operators.functions.WatermarkFunction) DoFnSignatures(org.apache.beam.sdk.transforms.reflect.DoFnSignatures) AtomicInteger(java.util.concurrent.atomic.AtomicInteger) Map(java.util.Map) Iterators(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.Iterators) StateUtils(org.apache.beam.runners.samza.util.StateUtils) KvCoder(org.apache.beam.sdk.coders.KvCoder) Collection(java.util.Collection) ServiceLoader(java.util.ServiceLoader) Collectors(java.util.stream.Collectors) TransformHierarchy(org.apache.beam.sdk.runners.TransformHierarchy) FlatMapFunction(org.apache.samza.operators.functions.FlatMapFunction) List(java.util.List) ParDo(org.apache.beam.sdk.transforms.ParDo) TypeDescriptors(org.apache.beam.sdk.values.TypeDescriptors) OpEmitter(org.apache.beam.runners.samza.runtime.OpEmitter) PipelineTranslatorUtils.instantiateCoder(org.apache.beam.runners.fnexecution.translation.PipelineTranslatorUtils.instantiateCoder) WindowingStrategy(org.apache.beam.sdk.values.WindowingStrategy) SamzaPipelineTranslatorUtils(org.apache.beam.runners.samza.util.SamzaPipelineTranslatorUtils) KV(org.apache.beam.sdk.values.KV) TypeDescriptor(org.apache.beam.sdk.values.TypeDescriptor) OpAdapter(org.apache.beam.runners.samza.runtime.OpAdapter) DoFnOp(org.apache.beam.runners.samza.runtime.DoFnOp) Coder(org.apache.beam.sdk.coders.Coder) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) RawUnionValue(org.apache.beam.sdk.transforms.join.RawUnionValue) ViewFn(org.apache.beam.sdk.transforms.ViewFn) TupleTag(org.apache.beam.sdk.values.TupleTag) 
SamzaDoFnInvokerRegistrar(org.apache.beam.runners.samza.runtime.SamzaDoFnInvokerRegistrar) WindowUtils(org.apache.beam.runners.samza.util.WindowUtils) SideInputId(org.apache.beam.model.pipeline.v1.RunnerApi.ExecutableStagePayload.SideInputId) ParDoTranslation(org.apache.beam.runners.core.construction.ParDoTranslation) MessageStream(org.apache.samza.operators.MessageStream) RunnerApi(org.apache.beam.model.pipeline.v1.RunnerApi) DoFn(org.apache.beam.sdk.transforms.DoFn) QueryablePipeline(org.apache.beam.runners.core.construction.graph.QueryablePipeline) Op(org.apache.beam.runners.samza.runtime.Op) DoFnSignature(org.apache.beam.sdk.transforms.reflect.DoFnSignature) Iterator(java.util.Iterator) IterableCoder(org.apache.beam.sdk.coders.IterableCoder) IOException(java.io.IOException) PCollection(org.apache.beam.sdk.values.PCollection) SamzaPipelineOptions(org.apache.beam.runners.samza.SamzaPipelineOptions) PCollectionView(org.apache.beam.sdk.values.PCollectionView) BoundedWindow(org.apache.beam.sdk.transforms.windowing.BoundedWindow) Instant(org.joda.time.Instant) PipelineNode(org.apache.beam.runners.core.construction.graph.PipelineNode) VoidCoder(org.apache.beam.sdk.coders.VoidCoder) Collections(java.util.Collections) OpMessage(org.apache.beam.runners.samza.runtime.OpMessage) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) TupleTag(org.apache.beam.sdk.values.TupleTag) DoFnOp(org.apache.beam.runners.samza.runtime.DoFnOp) MessageStream(org.apache.samza.operators.MessageStream) KvCoder(org.apache.beam.sdk.coders.KvCoder) PipelineTranslatorUtils.instantiateCoder(org.apache.beam.runners.fnexecution.translation.PipelineTranslatorUtils.instantiateCoder) Coder(org.apache.beam.sdk.coders.Coder) IterableCoder(org.apache.beam.sdk.coders.IterableCoder) VoidCoder(org.apache.beam.sdk.coders.VoidCoder) RawUnionValue(org.apache.beam.sdk.transforms.join.RawUnionValue) AtomicInteger(java.util.concurrent.atomic.AtomicInteger) PCollection(org.apache.beam.sdk.values.PCollection) 
RunnerPCollectionView(org.apache.beam.runners.core.construction.RunnerPCollectionView) PCollectionView(org.apache.beam.sdk.values.PCollectionView) DoFnSchemaInformation(org.apache.beam.sdk.transforms.DoFnSchemaInformation) Map(java.util.Map) HashMap(java.util.HashMap)

Example 10 with DoFnSchemaInformation

use of org.apache.beam.sdk.transforms.DoFnSchemaInformation in project beam by apache.

The method doTranslatePortable of the class ParDoBoundMultiTranslator.

/**
 * Translates an executable-stage transform of a portable pipeline: parses the stage payload,
 * prepares a broadcast stream and a runner-side view for every side input, assigns an index to
 * every output, builds a {@code DoFnOp} and registers one message stream per output collection id.
 *
 * <p>Kept static so that the anonymous functions created here can be serialized.
 *
 * @throws RuntimeException wrapping the {@code IOException} raised when the payload cannot be
 *     parsed
 */
private static <InT, OutT> void doTranslatePortable(PipelineNode.PTransformNode transform, QueryablePipeline pipeline, PortableTranslationContext ctx) {
    Map<String, String> outputs = transform.getTransform().getOutputsMap();

    final RunnerApi.ExecutableStagePayload stagePayload;
    try {
        stagePayload = RunnerApi.ExecutableStagePayload.parseFrom(transform.getTransform().getSpec().getPayload());
    } catch (IOException e) {
        throw new RuntimeException(e);
    }

    String inputId = stagePayload.getInput();
    final MessageStream<OpMessage<InT>> mainStream = ctx.getMessageStreamById(inputId);

    // Analyze side inputs
    final List<MessageStream<OpMessage<Iterable<?>>>> broadcastStreams = new ArrayList<>();
    final Map<SideInputId, PCollectionView<?>> viewsBySideInput = new HashMap<>();
    final Map<String, PCollectionView<?>> viewsByUniqueId = new HashMap<>();
    final RunnerApi.Components stageComponents = stagePayload.getComponents();
    for (SideInputId sideInput : stagePayload.getSideInputsList()) {
        final String collectionId =
                stageComponents
                        .getTransformsOrThrow(sideInput.getTransformId())
                        .getInputsOrThrow(sideInput.getLocalName());
        final WindowingStrategy<?, BoundedWindow> sideInputWindowing = WindowUtils.getWindowStrategy(collectionId, stageComponents);
        final WindowedValue.WindowedValueCoder<?> sideInputCoder = (WindowedValue.WindowedValueCoder) instantiateCoder(collectionId, stageComponents);

        // Create a runner-side view
        final PCollectionView<?> runnerView = createPCollectionView(sideInput, sideInputCoder, sideInputWindowing);

        // Use GBK to aggregate the side inputs and then broadcast it out
        final MessageStream<OpMessage<Iterable<?>>> broadcast = groupAndBroadcastSideInput(sideInput, collectionId, stageComponents.getPcollectionsOrThrow(collectionId), (WindowingStrategy) sideInputWindowing, sideInputCoder, ctx);

        broadcastStreams.add(broadcast);
        viewsBySideInput.put(sideInput, runnerView);
        viewsByUniqueId.put(getSideInputUniqueId(sideInput), runnerView);
    }

    final Map<TupleTag<?>, Integer> indexByTag = new HashMap<>();
    final Map<Integer, String> collectionIdByIndex = new HashMap<>();
    final Map<String, TupleTag<?>> tagByCollectionId = new HashMap<>();

    // first output as the main output
    final TupleTag<OutT> mainOutputTag = outputs.isEmpty() ? null : new TupleTag(outputs.keySet().iterator().next());

    // Number the outputs in the map's iteration order.
    int nextIndex = 0;
    for (Map.Entry<String, String> namedOutput : outputs.entrySet()) {
        TupleTag<?> outputTag = new TupleTag<>(namedOutput.getKey());
        String outputCollectionId = namedOutput.getValue();
        indexByTag.put(outputTag, nextIndex);
        collectionIdByIndex.put(nextIndex, outputCollectionId);
        tagByCollectionId.put(outputCollectionId, outputTag);
        nextIndex++;
    }

    WindowedValue.WindowedValueCoder<InT> windowedInputCoder = WindowUtils.instantiateWindowedCoder(inputId, pipeline.getComponents());

    // TODO: support schema and side inputs for portable runner
    // Note: transform.getTransform() is an ExecutableStage, not ParDo, so we need to extract
    // these info from its components.
    final DoFnSchemaInformation schemaInformation = null;

    final RunnerApi.PCollection inputCollection = pipeline.getComponents().getPcollectionsOrThrow(inputId);
    final PCollection.IsBounded boundedness = SamzaPipelineTranslatorUtils.isBounded(inputCollection);

    // A key coder is only extracted for stateful stages; the value coder is cast to a KvCoder.
    final Coder<?> stateKeyCoder;
    if (StateUtils.isStateful(stagePayload)) {
        stateKeyCoder = ((KvCoder) ((WindowedValue.FullWindowedValueCoder) windowedInputCoder).getValueCoder()).getKeyCoder();
    } else {
        stateKeyCoder = null;
    }

    final DoFnOp<InT, OutT, RawUnionValue> doFnOp =
            new DoFnOp<>(
                    mainOutputTag,
                    new NoOpDoFn<>(),
                    stateKeyCoder,
                    windowedInputCoder.getValueCoder(), // input coder not in use
                    windowedInputCoder,
                    Collections.emptyMap(), // output coders not in use
                    new ArrayList<>(viewsBySideInput.values()),
                    new ArrayList<>(tagByCollectionId.values()), // used by java runner only
                    WindowUtils.getWindowStrategy(inputId, stagePayload.getComponents()),
                    viewsByUniqueId,
                    new DoFnOp.MultiOutputManagerFactory(indexByTag),
                    ctx.getTransformFullName(),
                    ctx.getTransformId(),
                    boundedness,
                    true,
                    stagePayload,
                    ctx.getJobInfo(),
                    tagByCollectionId,
                    schemaInformation,
                    viewsBySideInput);

    // With side inputs present, their merged stream is passed through SideInputWatermarkFn and
    // then merged into the main input; otherwise the main input is used as is.
    final MessageStream<OpMessage<InT>> opInput;
    if (!broadcastStreams.isEmpty()) {
        MessageStream<OpMessage<InT>> sideInputWatermarks = MessageStream.mergeAll(broadcastStreams).flatMap(new SideInputWatermarkFn());
        opInput = mainStream.merge(Collections.singletonList(sideInputWatermarks));
    } else {
        opInput = mainStream;
    }

    final MessageStream<OpMessage<RawUnionValue>> unionStream = opInput.flatMapAsync(OpAdapter.adapt(doFnOp));
    for (int outputIndex : indexByTag.values()) {
        // Non-element messages go to every output; elements only to the output whose union tag
        // equals this index.
        @SuppressWarnings("unchecked")
        final MessageStream<OpMessage<OutT>> perOutputStream =
                unionStream
                        .filter(msg -> msg.getType() != OpMessage.Type.ELEMENT || msg.getElement().getValue().getUnionTag() == outputIndex)
                        .flatMapAsync(OpAdapter.adapt(new RawUnionValueToValue()));
        ctx.registerMessageStream(collectionIdByIndex.get(outputIndex), perOutputStream);
    }
}
Also used : WindowedValue(org.apache.beam.sdk.util.WindowedValue) PCollectionViews(org.apache.beam.sdk.values.PCollectionViews) OpMessage(org.apache.beam.runners.samza.runtime.OpMessage) DoFnSchemaInformation(org.apache.beam.sdk.transforms.DoFnSchemaInformation) RunnerPCollectionView(org.apache.beam.runners.core.construction.RunnerPCollectionView) WatermarkFunction(org.apache.samza.operators.functions.WatermarkFunction) DoFnSignatures(org.apache.beam.sdk.transforms.reflect.DoFnSignatures) AtomicInteger(java.util.concurrent.atomic.AtomicInteger) Map(java.util.Map) Iterators(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.Iterators) StateUtils(org.apache.beam.runners.samza.util.StateUtils) KvCoder(org.apache.beam.sdk.coders.KvCoder) Collection(java.util.Collection) ServiceLoader(java.util.ServiceLoader) Collectors(java.util.stream.Collectors) TransformHierarchy(org.apache.beam.sdk.runners.TransformHierarchy) FlatMapFunction(org.apache.samza.operators.functions.FlatMapFunction) List(java.util.List) ParDo(org.apache.beam.sdk.transforms.ParDo) TypeDescriptors(org.apache.beam.sdk.values.TypeDescriptors) OpEmitter(org.apache.beam.runners.samza.runtime.OpEmitter) PipelineTranslatorUtils.instantiateCoder(org.apache.beam.runners.fnexecution.translation.PipelineTranslatorUtils.instantiateCoder) WindowingStrategy(org.apache.beam.sdk.values.WindowingStrategy) SamzaPipelineTranslatorUtils(org.apache.beam.runners.samza.util.SamzaPipelineTranslatorUtils) KV(org.apache.beam.sdk.values.KV) TypeDescriptor(org.apache.beam.sdk.values.TypeDescriptor) OpAdapter(org.apache.beam.runners.samza.runtime.OpAdapter) DoFnOp(org.apache.beam.runners.samza.runtime.DoFnOp) Coder(org.apache.beam.sdk.coders.Coder) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) RawUnionValue(org.apache.beam.sdk.transforms.join.RawUnionValue) ViewFn(org.apache.beam.sdk.transforms.ViewFn) TupleTag(org.apache.beam.sdk.values.TupleTag) 
SamzaDoFnInvokerRegistrar(org.apache.beam.runners.samza.runtime.SamzaDoFnInvokerRegistrar) WindowUtils(org.apache.beam.runners.samza.util.WindowUtils) SideInputId(org.apache.beam.model.pipeline.v1.RunnerApi.ExecutableStagePayload.SideInputId) ParDoTranslation(org.apache.beam.runners.core.construction.ParDoTranslation) MessageStream(org.apache.samza.operators.MessageStream) RunnerApi(org.apache.beam.model.pipeline.v1.RunnerApi) DoFn(org.apache.beam.sdk.transforms.DoFn) QueryablePipeline(org.apache.beam.runners.core.construction.graph.QueryablePipeline) Op(org.apache.beam.runners.samza.runtime.Op) DoFnSignature(org.apache.beam.sdk.transforms.reflect.DoFnSignature) Iterator(java.util.Iterator) IterableCoder(org.apache.beam.sdk.coders.IterableCoder) IOException(java.io.IOException) PCollection(org.apache.beam.sdk.values.PCollection) SamzaPipelineOptions(org.apache.beam.runners.samza.SamzaPipelineOptions) PCollectionView(org.apache.beam.sdk.values.PCollectionView) BoundedWindow(org.apache.beam.sdk.transforms.windowing.BoundedWindow) Instant(org.joda.time.Instant) PipelineNode(org.apache.beam.runners.core.construction.graph.PipelineNode) VoidCoder(org.apache.beam.sdk.coders.VoidCoder) Collections(java.util.Collections) OpMessage(org.apache.beam.runners.samza.runtime.OpMessage) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) TupleTag(org.apache.beam.sdk.values.TupleTag) RunnerApi(org.apache.beam.model.pipeline.v1.RunnerApi) WindowedValue(org.apache.beam.sdk.util.WindowedValue) MessageStream(org.apache.samza.operators.MessageStream) BoundedWindow(org.apache.beam.sdk.transforms.windowing.BoundedWindow) SideInputId(org.apache.beam.model.pipeline.v1.RunnerApi.ExecutableStagePayload.SideInputId) RawUnionValue(org.apache.beam.sdk.transforms.join.RawUnionValue) RunnerPCollectionView(org.apache.beam.runners.core.construction.RunnerPCollectionView) PCollectionView(org.apache.beam.sdk.values.PCollectionView) 
DoFnSchemaInformation(org.apache.beam.sdk.transforms.DoFnSchemaInformation) AtomicInteger(java.util.concurrent.atomic.AtomicInteger) DoFnOp(org.apache.beam.runners.samza.runtime.DoFnOp) IOException(java.io.IOException) AtomicInteger(java.util.concurrent.atomic.AtomicInteger) PCollection(org.apache.beam.sdk.values.PCollection)

Aggregations

DoFnSchemaInformation (org.apache.beam.sdk.transforms.DoFnSchemaInformation)14 PCollection (org.apache.beam.sdk.values.PCollection)12 HashMap (java.util.HashMap)10 Map (java.util.Map)10 Coder (org.apache.beam.sdk.coders.Coder)10 PCollectionView (org.apache.beam.sdk.values.PCollectionView)10 TupleTag (org.apache.beam.sdk.values.TupleTag)10 ParDo (org.apache.beam.sdk.transforms.ParDo)9 WindowedValue (org.apache.beam.sdk.util.WindowedValue)8 IOException (java.io.IOException)7 ArrayList (java.util.ArrayList)7 WindowingStrategy (org.apache.beam.sdk.values.WindowingStrategy)7 List (java.util.List)6 Collectors (java.util.stream.Collectors)5 KvCoder (org.apache.beam.sdk.coders.KvCoder)5 DoFn (org.apache.beam.sdk.transforms.DoFn)5 RawUnionValue (org.apache.beam.sdk.transforms.join.RawUnionValue)5 DoFnSignature (org.apache.beam.sdk.transforms.reflect.DoFnSignature)5 DoFnSignatures (org.apache.beam.sdk.transforms.reflect.DoFnSignatures)5 BoundedWindow (org.apache.beam.sdk.transforms.windowing.BoundedWindow)5