
Example 1 with SerializablePipelineOptions

Use of org.apache.beam.runners.core.construction.SerializablePipelineOptions in the Apache Beam project.

From class StreamingTransformTranslator, method parDo.

private static <InputT, OutputT> TransformEvaluator<ParDo.MultiOutput<InputT, OutputT>> parDo() {
    return new TransformEvaluator<ParDo.MultiOutput<InputT, OutputT>>() {

        @Override
        public void evaluate(final ParDo.MultiOutput<InputT, OutputT> transform, final EvaluationContext context) {
            final DoFn<InputT, OutputT> doFn = transform.getFn();
            checkArgument(!DoFnSignatures.signatureForDoFn(doFn).processElement().isSplittable(), "Splittable DoFn not yet supported in streaming mode: %s", doFn);
            rejectStateAndTimers(doFn);
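            // Options arrive pre-wrapped in SerializablePipelineOptions so the Spark closures below can capture them.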
            final SerializablePipelineOptions options = context.getSerializableOptions();
            final SparkPCollectionView pviews = context.getPViews();
            final WindowingStrategy<?, ?> windowingStrategy = context.getInput(transform).getWindowingStrategy();
            Coder<InputT> inputCoder = (Coder<InputT>) context.getInput(transform).getCoder();
            Map<TupleTag<?>, Coder<?>> outputCoders = context.getOutputCoders();
            @SuppressWarnings("unchecked") UnboundedDataset<InputT> unboundedDataset = (UnboundedDataset<InputT>) context.borrowDataset(transform);
            JavaDStream<WindowedValue<InputT>> dStream = unboundedDataset.getDStream();
            final DoFnSchemaInformation doFnSchemaInformation = ParDoTranslation.getSchemaInformation(context.getCurrentTransform());
            final Map<String, PCollectionView<?>> sideInputMapping = ParDoTranslation.getSideInputMapping(context.getCurrentTransform());
            final String stepName = context.getCurrentTransform().getFullName();
            JavaPairDStream<TupleTag<?>, WindowedValue<?>> all = dStream.transformToPair(rdd -> {
                final MetricsContainerStepMapAccumulator metricsAccum = MetricsAccumulator.getInstance();
                final Map<TupleTag<?>, KV<WindowingStrategy<?, ?>, SideInputBroadcast<?>>> sideInputs =
                    TranslationUtils.getSideInputs(
                        transform.getSideInputs().values(),
                        JavaSparkContext.fromSparkContext(rdd.context()),
                        pviews);
                return rdd.mapPartitionsToPair(
                    new MultiDoFnFunction<>(
                        metricsAccum,
                        stepName,
                        doFn,
                        options,
                        transform.getMainOutputTag(),
                        transform.getAdditionalOutputTags().getAll(),
                        inputCoder,
                        outputCoders,
                        sideInputs,
                        windowingStrategy,
                        false,
                        doFnSchemaInformation,
                        sideInputMapping));
            });
            Map<TupleTag<?>, PCollection<?>> outputs = context.getOutputs(transform);
            if (outputs.size() > 1) {
                // Caching can trigger serialization, so we encode to bytes first;
                // more details in https://issues.apache.org/jira/browse/BEAM-2669
                Map<TupleTag<?>, Coder<WindowedValue<?>>> coderMap = TranslationUtils.getTupleTagCoders(outputs);
                all = all.mapToPair(TranslationUtils.getTupleTagEncodeFunction(coderMap)).cache().mapToPair(TranslationUtils.getTupleTagDecodeFunction(coderMap));
            }
            for (Map.Entry<TupleTag<?>, PCollection<?>> output : outputs.entrySet()) {
                @SuppressWarnings("unchecked") JavaPairDStream<TupleTag<?>, WindowedValue<?>> filtered = all.filter(new TranslationUtils.TupleTagFilter(output.getKey()));
                @SuppressWarnings("unchecked") JavaDStream<WindowedValue<Object>> // Object is the best we can do since different outputs can have different tags
                values = (JavaDStream<WindowedValue<Object>>) (JavaDStream<?>) TranslationUtils.dStreamValues(filtered);
                context.putDataset(output.getValue(), new UnboundedDataset<>(values, unboundedDataset.getStreamSources()));
            }
        }

        @Override
        public String toNativeString() {
            return "mapPartitions(new <fn>())";
        }
    };
}
Also used: TupleTag(org.apache.beam.sdk.values.TupleTag) JavaDStream(org.apache.spark.streaming.api.java.JavaDStream) WindowedValue(org.apache.beam.sdk.util.WindowedValue) SerializablePipelineOptions(org.apache.beam.runners.core.construction.SerializablePipelineOptions) KvCoder(org.apache.beam.sdk.coders.KvCoder) Coder(org.apache.beam.sdk.coders.Coder) KV(org.apache.beam.sdk.values.KV) MetricsContainerStepMapAccumulator(org.apache.beam.runners.spark.metrics.MetricsContainerStepMapAccumulator) TransformEvaluator(org.apache.beam.runners.spark.translation.TransformEvaluator) TranslationUtils(org.apache.beam.runners.spark.translation.TranslationUtils) PCollection(org.apache.beam.sdk.values.PCollection) SparkPCollectionView(org.apache.beam.runners.spark.translation.SparkPCollectionView) PCollectionView(org.apache.beam.sdk.values.PCollectionView) DoFnSchemaInformation(org.apache.beam.sdk.transforms.DoFnSchemaInformation) ParDo(org.apache.beam.sdk.transforms.ParDo) SplittableParDo(org.apache.beam.runners.core.construction.SplittableParDo) EvaluationContext(org.apache.beam.runners.spark.translation.EvaluationContext) ImmutableMap(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableMap) Map(java.util.Map) HashMap(java.util.HashMap)
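
The pattern worth noting here: context.getSerializableOptions() returns the pipeline options already wrapped in SerializablePipelineOptions, so the Spark closure passed to transformToPair can capture them without tripping over Java serialization. A minimal standalone sketch of that round trip, not taken from the Beam codebase (the class name OptionsRoundTrip is illustrative):

import org.apache.beam.runners.core.construction.SerializablePipelineOptions;
import org.apache.beam.sdk.options.PipelineOptions;
import org.apache.beam.sdk.options.PipelineOptionsFactory;

public class OptionsRoundTrip {

    public static void main(String[] args) {
        PipelineOptions original = PipelineOptionsFactory.create();
        // The wrapper serializes the options as JSON instead of relying on
        // Java serialization of the options proxy itself.
        SerializablePipelineOptions wrapped = new SerializablePipelineOptions(original);
        // After deserialization on a worker, get() rehydrates the PipelineOptions.
        PipelineOptions rehydrated = wrapped.get();
        System.out.println(rehydrated.getJobName());
    }
}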

Example 2 with SerializablePipelineOptions

Use of org.apache.beam.runners.core.construction.SerializablePipelineOptions in the Apache Beam project.

From class DoFnFunction, method initTransient.

/**
 * Initializes the transient variables that were sent over as byte arrays or protocol
 * buffers.
 */
private void initTransient() {
    if (isInitialized) {
        return;
    }
    try {
        SdkComponents components = SdkComponents.create();
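        // Rehydrate the PipelineOptions from the JSON string written by prepareSerialization().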
        pipelineOptions = new SerializablePipelineOptions(serializedOptions).get();
        DoFnWithExecutionInformation doFnWithExecutionInformation = (DoFnWithExecutionInformation) SerializableUtils.deserializeFromByteArray(doFnwithExBytes, "Custom Coder Bytes");
        this.doFn = (DoFn<InputT, OutputT>) doFnWithExecutionInformation.getDoFn();
        this.mainOutput = (TupleTag<OutputT>) doFnWithExecutionInformation.getMainOutputTag();
        this.sideInputMapping = doFnWithExecutionInformation.getSideInputMapping();
        this.doFnSchemaInformation = doFnWithExecutionInformation.getSchemaInformation();
        inputCoder = (Coder<InputT>) SerializableUtils.deserializeFromByteArray(coderBytes, "Custom Coder Bytes");
        windowStrategyProto = RunnerApi.MessageWithComponents.parseFrom(windowBytes);
        windowingStrategy = (WindowingStrategy<?, ?>) WindowingStrategyTranslation.fromProto(windowStrategyProto.getWindowingStrategy(), RehydratedComponents.forComponents(components.toComponents()));
        sideInputs = new HashMap<>();
        for (Map.Entry<String, byte[]> entry : sideInputBytes.entrySet()) {
            windowStrategyProto = RunnerApi.MessageWithComponents.parseFrom(entry.getValue());
            sideInputs.put(new TupleTag<>(entry.getKey()), WindowingStrategyTranslation.fromProto(windowStrategyProto.getWindowingStrategy(), RehydratedComponents.forComponents(components.toComponents())));
        }
    } catch (InvalidProtocolBufferException e) {
        LOG.info(e.getMessage());
    }
    outputCoders = new HashMap<>();
    for (Map.Entry<String, byte[]> entry : outputCodersBytes.entrySet()) {
        outputCoders.put(new TupleTag<>(entry.getKey()), (Coder<?>) SerializableUtils.deserializeFromByteArray(entry.getValue(), "Custom Coder Bytes"));
    }
    sideOutputs = new ArrayList<>();
    for (String sideOutput : serializedSideOutputs) {
        sideOutputs.add(new TupleTag<>(sideOutput));
    }
    outputMap = new HashMap<>();
    for (Map.Entry<String, Integer> entry : serializedOutputMap.entrySet()) {
        outputMap.put(new TupleTag<>(entry.getKey()), entry.getValue());
    }
    outputManager = new DoFnOutputManager(this.outputMap);
    this.isInitialized = true;
}
Also used: InvalidProtocolBufferException(org.apache.beam.vendor.grpc.v1p43p2.com.google.protobuf.InvalidProtocolBufferException) DoFnWithExecutionInformation(org.apache.beam.sdk.util.DoFnWithExecutionInformation) SdkComponents(org.apache.beam.runners.core.construction.SdkComponents) SerializablePipelineOptions(org.apache.beam.runners.core.construction.SerializablePipelineOptions) HashMap(java.util.HashMap) Map(java.util.Map)
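
As initTransient() shows, the options travel as the JSON string produced by SerializablePipelineOptions.toString() (written in prepareSerialization() below) and are rehydrated through the String constructor. A minimal sketch of that string round trip, not taken from the Beam codebase (the class name StringRoundTrip is illustrative):

import org.apache.beam.runners.core.construction.SerializablePipelineOptions;
import org.apache.beam.sdk.options.PipelineOptions;
import org.apache.beam.sdk.options.PipelineOptionsFactory;

public class StringRoundTrip {

    public static void main(String[] args) {
        // toString() yields the serialized JSON form of the wrapped options...
        String serializedOptions =
            new SerializablePipelineOptions(PipelineOptionsFactory.create()).toString();
        // ...and the String constructor accepts that form back, as initTransient() does.
        PipelineOptions restored = new SerializablePipelineOptions(serializedOptions).get();
        System.out.println(restored.getRunner());
    }
}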

Example 3 with SerializablePipelineOptions

Use of org.apache.beam.runners.core.construction.SerializablePipelineOptions in the Apache Beam project.

From class DoFnFunction, method prepareSerialization.

/**
 * Prepares the DoFnFunction so it can be serialized properly. This involves converting its
 * state into protobufs and byte arrays, which are converted back into the proper classes
 * during deserialization.
 */
private void prepareSerialization() {
    SdkComponents components = SdkComponents.create();
    components.registerEnvironment(Environments.createOrGetDefaultEnvironment(pipelineOptions.as(PortablePipelineOptions.class)));
    this.serializedOptions = new SerializablePipelineOptions(pipelineOptions).toString();
    doFnwithEx = ParDoTranslation.translateDoFn(this.doFn, mainOutput, sideInputMapping, doFnSchemaInformation, components);
    doFnwithExBytes = doFnwithEx.getPayload().toByteArray();
    outputCodersBytes = new HashMap<>();
    try {
        coderBytes = SerializableUtils.serializeToByteArray(inputCoder);
        windowStrategyProto = WindowingStrategyTranslation.toMessageProto(windowingStrategy, components);
        windowBytes = windowStrategyProto.toByteArray();
        for (Map.Entry<TupleTag<?>, Coder<?>> entry : outputCoders.entrySet()) {
            outputCodersBytes.put(entry.getKey().getId(), SerializableUtils.serializeToByteArray(entry.getValue()));
        }
        sideInputBytes = new HashMap<>();
        for (Map.Entry<TupleTag<?>, WindowingStrategy<?, ?>> entry : sideInputs.entrySet()) {
            windowStrategyProto = WindowingStrategyTranslation.toMessageProto(entry.getValue(), components);
            sideInputBytes.put(entry.getKey().getId(), windowStrategyProto.toByteArray());
        }
        serializedSideOutputs = new ArrayList<>();
        for (TupleTag<?> sideOutput : sideOutputs) {
            serializedSideOutputs.add(sideOutput.getId());
        }
        serializedOutputMap = new HashMap<>();
        for (Map.Entry<TupleTag<?>, Integer> entry : outputMap.entrySet()) {
            serializedOutputMap.put(entry.getKey().getId(), entry.getValue());
        }
    } catch (IOException e) {
        LOG.info(e.getMessage());
    }
}
Also used: Coder(org.apache.beam.sdk.coders.Coder) TupleTag(org.apache.beam.sdk.values.TupleTag) IOException(java.io.IOException) SdkComponents(org.apache.beam.runners.core.construction.SdkComponents) WindowingStrategy(org.apache.beam.sdk.values.WindowingStrategy) SerializablePipelineOptions(org.apache.beam.runners.core.construction.SerializablePipelineOptions) HashMap(java.util.HashMap) Map(java.util.Map)
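
Both directions of the coder handling above lean on SerializableUtils. A minimal sketch of that byte-array round trip for a single coder, not taken from the Beam codebase (the class name CoderRoundTrip is illustrative):

import org.apache.beam.sdk.coders.Coder;
import org.apache.beam.sdk.coders.StringUtf8Coder;
import org.apache.beam.sdk.util.SerializableUtils;

public class CoderRoundTrip {

    public static void main(String[] args) {
        Coder<String> coder = StringUtf8Coder.of();
        // Java-serialize the coder to bytes, as prepareSerialization() does per output coder.
        byte[] coderBytes = SerializableUtils.serializeToByteArray(coder);
        // The description string is only used in error messages if deserialization fails.
        Coder<?> restored =
            (Coder<?>) SerializableUtils.deserializeFromByteArray(coderBytes, "Custom Coder Bytes");
        System.out.println(restored);
    }
}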

Example 4 with SerializablePipelineOptions

Use of org.apache.beam.runners.core.construction.SerializablePipelineOptions in the Apache Beam project.

From class CoderTypeSerializerTest, method testWriteAndReadConfigSnapshot.

private void testWriteAndReadConfigSnapshot(Coder<String> coder) throws IOException {
    CoderTypeSerializer<String> serializer = new CoderTypeSerializer<>(coder, new SerializablePipelineOptions(PipelineOptionsFactory.create()));
    TypeSerializerSnapshot writtenSnapshot = serializer.snapshotConfiguration();
    ComparatorTestBase.TestOutputView outView = new ComparatorTestBase.TestOutputView();
    writtenSnapshot.writeSnapshot(outView);
    TypeSerializerSnapshot readSnapshot = new CoderTypeSerializer.LegacySnapshot();
    readSnapshot.readSnapshot(writtenSnapshot.getCurrentVersion(), outView.getInputView(), getClass().getClassLoader());
    assertThat(readSnapshot.restoreSerializer(), is(serializer));
}
Also used: ComparatorTestBase(org.apache.flink.api.common.typeutils.ComparatorTestBase) TypeSerializerSnapshot(org.apache.flink.api.common.typeutils.TypeSerializerSnapshot) SerializablePipelineOptions(org.apache.beam.runners.core.construction.SerializablePipelineOptions)
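
A companion sketch, not part of the original test, that round-trips a value through the serializer itself rather than through its config snapshot (it assumes the same test class and its static imports):

@Test
public void testSerializeRoundTrip() throws IOException {
    CoderTypeSerializer<String> serializer =
        new CoderTypeSerializer<>(
            StringUtf8Coder.of(), new SerializablePipelineOptions(PipelineOptionsFactory.create()));
    ComparatorTestBase.TestOutputView outView = new ComparatorTestBase.TestOutputView();
    // serialize/deserialize delegate to the wrapped Beam coder.
    serializer.serialize("hello", outView);
    assertThat(serializer.deserialize(outView.getInputView()), is("hello"));
}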

Example 5 with SerializablePipelineOptions

Use of org.apache.beam.runners.core.construction.SerializablePipelineOptions in the Apache Beam project.

From class ExecutableStageDoFnOperatorTest, method sdkErrorsSurfaceOnClose.

@Test
public void sdkErrorsSurfaceOnClose() throws Exception {
    TupleTag<Integer> mainOutput = new TupleTag<>("main-output");
    DoFnOperator.MultiOutputOutputManagerFactory<Integer> outputManagerFactory =
        new DoFnOperator.MultiOutputOutputManagerFactory<>(
            mainOutput, VoidCoder.of(), new SerializablePipelineOptions(FlinkPipelineOptions.defaults()));
    ExecutableStageDoFnOperator<Integer, Integer> operator = getOperator(mainOutput, Collections.emptyList(), outputManagerFactory);
    OneInputStreamOperatorTestHarness<WindowedValue<Integer>, WindowedValue<Integer>> testHarness = new OneInputStreamOperatorTestHarness<>(operator);
    testHarness.open();
    @SuppressWarnings("unchecked") RemoteBundle bundle = Mockito.mock(RemoteBundle.class);
    when(stageBundleFactory.getBundle(any(), any(), any(), any(), any(), any())).thenReturn(bundle);
    @SuppressWarnings("unchecked") FnDataReceiver<WindowedValue<?>> receiver = Mockito.mock(FnDataReceiver.class);
    when(bundle.getInputReceivers()).thenReturn(ImmutableMap.of("input", receiver));
    Exception expected = new RuntimeException(new Exception());
    doThrow(expected).when(bundle).close();
    thrown.expectCause(is(expected));
    operator.processElement(new StreamRecord<>(WindowedValue.valueInGlobalWindow(0)));
    testHarness.close();
}
Also used: TupleTag(org.apache.beam.sdk.values.TupleTag) KeyedOneInputStreamOperatorTestHarness(org.apache.flink.streaming.util.KeyedOneInputStreamOperatorTestHarness) OneInputStreamOperatorTestHarness(org.apache.flink.streaming.util.OneInputStreamOperatorTestHarness) ExpectedException(org.junit.rules.ExpectedException) WindowedValue(org.apache.beam.sdk.util.WindowedValue) StreamRecordStripper.stripStreamRecordFromWindowedValue(org.apache.beam.runners.flink.translation.wrappers.streaming.StreamRecordStripper.stripStreamRecordFromWindowedValue) SerializablePipelineOptions(org.apache.beam.runners.core.construction.SerializablePipelineOptions) RemoteBundle(org.apache.beam.runners.fnexecution.control.RemoteBundle) Test(org.junit.Test) FlinkStateInternalsTest(org.apache.beam.runners.flink.streaming.FlinkStateInternalsTest)
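
The factory above depends on the Flink-specific options surviving the wrapper. A minimal sketch of that wiring, not from the Beam codebase (standalone statements for illustration):

SerializablePipelineOptions wrapped =
    new SerializablePipelineOptions(FlinkPipelineOptions.defaults());
// get() rehydrates generic PipelineOptions; as() re-views them as FlinkPipelineOptions.
FlinkPipelineOptions restored = wrapped.get().as(FlinkPipelineOptions.class);
System.out.println(restored.getParallelism());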

Aggregations

SerializablePipelineOptions (org.apache.beam.runners.core.construction.SerializablePipelineOptions) 37
TupleTag (org.apache.beam.sdk.values.TupleTag) 29
WindowedValue (org.apache.beam.sdk.util.WindowedValue) 28
StreamRecordStripper.stripStreamRecordFromWindowedValue (org.apache.beam.runners.flink.translation.wrappers.streaming.StreamRecordStripper.stripStreamRecordFromWindowedValue) 24
Test (org.junit.Test) 23
KeyedOneInputStreamOperatorTestHarness (org.apache.flink.streaming.util.KeyedOneInputStreamOperatorTestHarness) 20
FlinkPipelineOptions (org.apache.beam.runners.flink.FlinkPipelineOptions) 18
StringUtf8Coder (org.apache.beam.sdk.coders.StringUtf8Coder) 16
KV (org.apache.beam.sdk.values.KV) 16
OneInputStreamOperatorTestHarness (org.apache.flink.streaming.util.OneInputStreamOperatorTestHarness) 15
ByteBuffer (java.nio.ByteBuffer) 13
Coder (org.apache.beam.sdk.coders.Coder) 12
KvCoder (org.apache.beam.sdk.coders.KvCoder) 12
HashMap (java.util.HashMap) 11
Instant (org.joda.time.Instant) 11
CoderTypeInformation (org.apache.beam.runners.flink.translation.types.CoderTypeInformation) 10
VarIntCoder (org.apache.beam.sdk.coders.VarIntCoder) 10
DoFn (org.apache.beam.sdk.transforms.DoFn) 10
IntervalWindow (org.apache.beam.sdk.transforms.windowing.IntervalWindow) 10
WindowingStrategy (org.apache.beam.sdk.values.WindowingStrategy) 10