
Example 1 with SideInputInfo

Use of com.google.api.services.dataflow.model.SideInputInfo in the apache/beam project.

From the class LengthPrefixUnknownCodersTest, method createSideInputInfosWithCoders:

private static SideInputInfo createSideInputInfosWithCoders(Coder<?>... coders) {
    SideInputInfo sideInputInfo = new SideInputInfo().setSources(new ArrayList<>());
    sideInputInfo.setFactory(new JacksonFactory());
    for (Coder<?> coder : coders) {
        Source source = new Source().setCodec(CloudObjects.asCloudObject(coder, /* sdkComponents= */ null));
        source.setFactory(new JacksonFactory());
        sideInputInfo.getSources().add(source);
    }
    return sideInputInfo;
}
Also used : SideInputInfo(com.google.api.services.dataflow.model.SideInputInfo) JacksonFactory(com.google.api.client.json.jackson2.JacksonFactory) Source(com.google.api.services.dataflow.model.Source)
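
For context, a minimal usage sketch (not from the Beam sources) of the helper above, assuming it is invoked from within the same test class with StringUtf8Coder and VarIntCoder from org.apache.beam.sdk.coders in scope:

// Hypothetical caller: each coder passed in becomes one Source whose codec is the coder's
// cloud representation, so two coders yield two sources.
SideInputInfo sideInputInfo =
    createSideInputInfosWithCoders(StringUtf8Coder.of(), VarIntCoder.of());
assertEquals(2, sideInputInfo.getSources().size());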

Example 2 with SideInputInfo

Use of com.google.api.services.dataflow.model.SideInputInfo in the apache/beam project.

From the class LazilyInitializedSideInputReaderTest, method testLazyInitialization:

@Test
public void testLazyInitialization() {
    final AtomicInteger wasCalled = new AtomicInteger();
    SideInputReader lazilyInitializedSideInputReader = new LazilyInitializedSideInputReader(ImmutableList.of(new SideInputInfo().setTag(TEST_TAG)), () -> {
        wasCalled.incrementAndGet();
        return mockSideInputReader;
    });
    // Ensure that after construction we have not been initialized yet.
    assertEquals(0, wasCalled.get());
    // Ensure that after checking some basic tag information we have not been initialized yet.
    assertFalse(lazilyInitializedSideInputReader.isEmpty());
    assertEquals(0, wasCalled.get());
    when(mockPCollectionView.getTagInternal()).thenReturn(new TupleTag(TEST_TAG));
    assertTrue(lazilyInitializedSideInputReader.contains(mockPCollectionView));
    assertEquals(0, wasCalled.get());
    // Ensure that the underlying reader is constructed only once and that subsequent get calls
    // delegate to it, returning the stubbed results.
    when(mockSideInputReader.get(any(PCollectionView.class), any(BoundedWindow.class))).thenReturn(42).thenReturn(43);
    assertEquals(42, lazilyInitializedSideInputReader.get(mockPCollectionView, GlobalWindow.INSTANCE));
    assertEquals(1, wasCalled.get());
    assertEquals(43, lazilyInitializedSideInputReader.get(mockPCollectionView, GlobalWindow.INSTANCE));
    assertEquals(1, wasCalled.get());
}
Also used : SideInputInfo(com.google.api.services.dataflow.model.SideInputInfo) AtomicInteger(java.util.concurrent.atomic.AtomicInteger) TupleTag(org.apache.beam.sdk.values.TupleTag) SideInputReader(org.apache.beam.runners.core.SideInputReader) Test(org.junit.Test)
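
The pattern being verified can be sketched outside of the test as follows; createExpensiveSideInputReader() is a hypothetical factory standing in for whatever builds the real reader, and ImmutableList is the vendored Guava class used elsewhere in these examples:

// Lazy-initialization sketch (not Beam source): the supplier is only invoked on the first
// get(...) call, so constructing the wrapper itself stays cheap.
SideInputReader reader =
    new LazilyInitializedSideInputReader(
        ImmutableList.of(new SideInputInfo().setTag("my-side-input")),
        () -> createExpensiveSideInputReader()); // hypothetical, potentially expensive factory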

Example 3 with SideInputInfo

Use of com.google.api.services.dataflow.model.SideInputInfo in the apache/beam project.

From the class IsmSideInputReaderTest, method toSideInputInfo:

private SideInputInfo toSideInputInfo(String tagId, Source... sources) {
    SideInputInfo sideInputInfo = new SideInputInfo();
    sideInputInfo.setTag(tagId);
    sideInputInfo.setKind(new HashMap<String, Object>());
    if (sources.length == 1) {
        sideInputInfo.getKind().put(PropertyNames.OBJECT_TYPE_NAME, "singleton");
    } else {
        sideInputInfo.getKind().put(PropertyNames.OBJECT_TYPE_NAME, "collection");
    }
    sideInputInfo.setSources(new ArrayList<>(Arrays.asList(sources)));
    return sideInputInfo;
}
Also used : SideInputInfo(com.google.api.services.dataflow.model.SideInputInfo) Structs.getString(org.apache.beam.runners.dataflow.util.Structs.getString)
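
A hedged sketch (not from the Beam sources) of how the helper's kind selection behaves, assumed to run inside the same test class with PropertyNames from org.apache.beam.runners.dataflow.util in scope:

// A single source produces a "singleton" kind; two or more produce a "collection" kind.
SideInputInfo singleton = toSideInputInfo("tag-a", new Source());
SideInputInfo collection = toSideInputInfo("tag-b", new Source(), new Source());
assertEquals("singleton", singleton.getKind().get(PropertyNames.OBJECT_TYPE_NAME));
assertEquals("collection", collection.getKind().get(PropertyNames.OBJECT_TYPE_NAME));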

Example 4 with SideInputInfo

Use of com.google.api.services.dataflow.model.SideInputInfo in the apache/beam project.

From the class LengthPrefixUnknownCoders, method forSideInputInfos:

/**
 * Wraps unknown coders on every {@link SideInputInfo} with length prefixes and also replaces the
 * wrapped coder with a byte array coder if requested.
 */
public static List<SideInputInfo> forSideInputInfos(List<SideInputInfo> sideInputInfos, boolean replaceWithByteArrayCoder) {
    ImmutableList.Builder<SideInputInfo> updatedSideInputInfos = ImmutableList.builder();
    for (SideInputInfo sideInputInfo : sideInputInfos) {
        try {
            SideInputInfo updatedSideInputInfo = clone(sideInputInfo, SideInputInfo.class);
            for (Source source : updatedSideInputInfo.getSources()) {
                source.setCodec(forCodec(source.getCodec(), replaceWithByteArrayCoder));
            }
            updatedSideInputInfos.add(updatedSideInputInfo);
        } catch (IOException e) {
            throw new RuntimeException(String.format("Failed to replace unknown coder with LengthPrefixCoder for : {%s}", sideInputInfo), e);
        }
    }
    return updatedSideInputInfos.build();
}
Also used : SideInputInfo(com.google.api.services.dataflow.model.SideInputInfo) ImmutableList(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableList) IOException(java.io.IOException) Source(com.google.api.services.dataflow.model.Source)
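
A minimal call sketch for the method above; sideInputInfos stands for an assumed, already-populated List<SideInputInfo>:

// Wrap unknown codecs in every side input with length prefixes; passing true additionally
// replaces the wrapped coder with a byte array coder, as the javadoc above describes.
List<SideInputInfo> prefixed =
    LengthPrefixUnknownCoders.forSideInputInfos(sideInputInfos, /* replaceWithByteArrayCoder= */ true);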

Example 5 with SideInputInfo

Use of com.google.api.services.dataflow.model.SideInputInfo in the apache/beam project.

From the class GroupAlsoByWindowParDoFnFactory, method create:

@Override
public ParDoFn create(PipelineOptions options, CloudObject cloudUserFn, @Nullable List<SideInputInfo> sideInputInfos, TupleTag<?> mainOutputTag, Map<TupleTag<?>, Integer> outputTupleTagsToReceiverIndices, final DataflowExecutionContext<?> executionContext, DataflowOperationContext operationContext) throws Exception {
    Map.Entry<TupleTag<?>, Integer> entry = Iterables.getOnlyElement(outputTupleTagsToReceiverIndices.entrySet());
    checkArgument(entry.getKey().equals(mainOutputTag), "Output tags should reference only the main output tag: %s vs %s", entry.getKey(), mainOutputTag);
    checkArgument(entry.getValue() == 0, "There should be a single receiver, but using receiver index %s", entry.getValue());
    byte[] encodedWindowingStrategy = getBytes(cloudUserFn, PropertyNames.SERIALIZED_FN);
    WindowingStrategy windowingStrategy;
    try {
        windowingStrategy = deserializeWindowingStrategy(encodedWindowingStrategy);
    } catch (Exception e) {
        // TODO: Catch block disappears, becoming an error once Python SDK is compliant.
        if (DataflowRunner.hasExperiment(options.as(DataflowPipelineDebugOptions.class), "beam_fn_api")) {
            LOG.info("FnAPI: Unable to deserialize windowing strategy, assuming default", e);
            windowingStrategy = WindowingStrategy.globalDefault();
        } else {
            throw e;
        }
    }
    byte[] serializedCombineFn = getBytes(cloudUserFn, WorkerPropertyNames.COMBINE_FN, null);
    AppliedCombineFn<?, ?, ?, ?> combineFn = null;
    if (serializedCombineFn != null) {
        Object combineFnObj = SerializableUtils.deserializeFromByteArray(serializedCombineFn, "serialized combine fn");
        checkArgument(combineFnObj instanceof AppliedCombineFn, "unexpected kind of AppliedCombineFn: " + combineFnObj.getClass().getName());
        combineFn = (AppliedCombineFn<?, ?, ?, ?>) combineFnObj;
    }
    Map<String, Object> inputCoderObject = getObject(cloudUserFn, WorkerPropertyNames.INPUT_CODER);
    Coder<?> inputCoder = CloudObjects.coderFromCloudObject(CloudObject.fromSpec(inputCoderObject));
    checkArgument(inputCoder instanceof WindowedValueCoder, "Expected WindowedValueCoder for inputCoder, got: " + inputCoder.getClass().getName());
    @SuppressWarnings("unchecked") WindowedValueCoder<?> windowedValueCoder = (WindowedValueCoder<?>) inputCoder;
    Coder<?> elemCoder = windowedValueCoder.getValueCoder();
    checkArgument(elemCoder instanceof KvCoder, "Expected KvCoder for inputCoder, got: " + elemCoder.getClass().getName());
    @SuppressWarnings("unchecked") KvCoder<?, ?> kvCoder = (KvCoder<?, ?>) elemCoder;
    boolean isStreamingPipeline = options.as(StreamingOptions.class).isStreaming();
    SideInputReader sideInputReader = NullSideInputReader.empty();
    @Nullable AppliedCombineFn<?, ?, ?, ?> maybeMergingCombineFn = null;
    if (combineFn != null) {
        sideInputReader = executionContext.getSideInputReader(sideInputInfos, combineFn.getSideInputViews(), operationContext);
        String phase = getString(cloudUserFn, WorkerPropertyNames.PHASE, CombinePhase.ALL);
        checkArgument(phase.equals(CombinePhase.ALL) || phase.equals(CombinePhase.MERGE), "Unexpected phase: %s", phase);
        if (phase.equals(CombinePhase.MERGE)) {
            maybeMergingCombineFn = makeAppliedMergingFunction(combineFn);
        } else {
            maybeMergingCombineFn = combineFn;
        }
    }
    StateInternalsFactory<?> stateInternalsFactory = key -> executionContext.getStepContext(operationContext).stateInternals();
    // This will be a GABW Fn for either batch or streaming, with combiner in it or not
    GroupAlsoByWindowFn<?, ?> fn;
    // This will be a FakeKeyedWorkItemCoder for streaming or null for batch
    Coder<?> gabwInputCoder;
    // TODO: do not do this with mess of "if"
    if (isStreamingPipeline) {
        if (maybeMergingCombineFn == null) {
            fn = StreamingGroupAlsoByWindowsDoFns.createForIterable(windowingStrategy, stateInternalsFactory, ((KvCoder) kvCoder).getValueCoder());
            gabwInputCoder = WindmillKeyedWorkItem.FakeKeyedWorkItemCoder.of(kvCoder);
        } else {
            fn = StreamingGroupAlsoByWindowsDoFns.create(windowingStrategy, stateInternalsFactory, (AppliedCombineFn) maybeMergingCombineFn, ((KvCoder) kvCoder).getKeyCoder());
            gabwInputCoder = WindmillKeyedWorkItem.FakeKeyedWorkItemCoder.of(((AppliedCombineFn) maybeMergingCombineFn).getKvCoder());
        }
    } else {
        if (maybeMergingCombineFn == null) {
            fn = BatchGroupAlsoByWindowsDoFns.createForIterable(windowingStrategy, stateInternalsFactory, ((KvCoder) kvCoder).getValueCoder());
            gabwInputCoder = null;
        } else {
            fn = BatchGroupAlsoByWindowsDoFns.create(windowingStrategy, (AppliedCombineFn) maybeMergingCombineFn);
            gabwInputCoder = null;
        }
    }
    // TODO: or anyhow related to it, do not do this with mess of "if"
    if (maybeMergingCombineFn != null) {
        return new GroupAlsoByWindowsParDoFn(options, fn, windowingStrategy, ((AppliedCombineFn) maybeMergingCombineFn).getSideInputViews(), gabwInputCoder, sideInputReader, mainOutputTag, executionContext.getStepContext(operationContext));
    } else {
        return new GroupAlsoByWindowsParDoFn(options, fn, windowingStrategy, null, gabwInputCoder, sideInputReader, mainOutputTag, executionContext.getStepContext(operationContext));
    }
}
Also used : CombineFn(org.apache.beam.sdk.transforms.Combine.CombineFn) StateInternalsFactory(org.apache.beam.runners.core.StateInternalsFactory) CoderRegistry(org.apache.beam.sdk.coders.CoderRegistry) CombineFnWithContext(org.apache.beam.sdk.transforms.CombineWithContext.CombineFnWithContext) WindowedValueCoder(org.apache.beam.sdk.util.WindowedValue.WindowedValueCoder) LoggerFactory(org.slf4j.LoggerFactory) CloudObjects(org.apache.beam.runners.dataflow.util.CloudObjects) BatchGroupAlsoByWindowsDoFns(org.apache.beam.runners.dataflow.worker.util.BatchGroupAlsoByWindowsDoFns) Coder(org.apache.beam.sdk.coders.Coder) ListCoder(org.apache.beam.sdk.coders.ListCoder) RehydratedComponents(org.apache.beam.runners.core.construction.RehydratedComponents) ArrayList(java.util.ArrayList) GlobalCombineFn(org.apache.beam.sdk.transforms.CombineFnBase.GlobalCombineFn) DataflowPipelineDebugOptions(org.apache.beam.runners.dataflow.options.DataflowPipelineDebugOptions) TupleTag(org.apache.beam.sdk.values.TupleTag) Map(java.util.Map) Iterables(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.Iterables) Preconditions.checkArgument(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.base.Preconditions.checkArgument) CloudObject(org.apache.beam.runners.dataflow.util.CloudObject) PipelineOptions(org.apache.beam.sdk.options.PipelineOptions) Nullable(org.checkerframework.checker.nullness.qual.Nullable) Structs.getBytes(org.apache.beam.runners.dataflow.util.Structs.getBytes) SideInputInfo(com.google.api.services.dataflow.model.SideInputInfo) SideInputReader(org.apache.beam.runners.core.SideInputReader) InvalidProtocolBufferException(org.apache.beam.vendor.grpc.v1p43p2.com.google.protobuf.InvalidProtocolBufferException) RunnerApi(org.apache.beam.model.pipeline.v1.RunnerApi) CannotProvideCoderException(org.apache.beam.sdk.coders.CannotProvideCoderException) KvCoder(org.apache.beam.sdk.coders.KvCoder) AppliedCombineFn(org.apache.beam.sdk.util.AppliedCombineFn) Logger(org.slf4j.Logger) StreamingOptions(org.apache.beam.sdk.options.StreamingOptions) NullSideInputReader(org.apache.beam.runners.core.NullSideInputReader) DataflowRunner(org.apache.beam.runners.dataflow.DataflowRunner) WorkerPropertyNames(org.apache.beam.runners.dataflow.worker.util.WorkerPropertyNames) RootCase(org.apache.beam.model.pipeline.v1.RunnerApi.MessageWithComponents.RootCase) Structs.getString(org.apache.beam.runners.dataflow.util.Structs.getString) ParDoFn(org.apache.beam.runners.dataflow.worker.util.common.worker.ParDoFn) List(java.util.List) WindowingStrategyTranslation(org.apache.beam.runners.core.construction.WindowingStrategyTranslation) Structs.getObject(org.apache.beam.runners.dataflow.util.Structs.getObject) SerializableUtils(org.apache.beam.sdk.util.SerializableUtils) Context(org.apache.beam.sdk.transforms.CombineWithContext.Context) WindowingStrategy(org.apache.beam.sdk.values.WindowingStrategy) PropertyNames(org.apache.beam.runners.dataflow.util.PropertyNames)
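
As a reading aid (a simplified restatement, not code from the factory), the GroupAlsoByWindow function selection in create above reduces to four cases:

// streaming, no combine -> StreamingGroupAlsoByWindowsDoFns.createForIterable(...), input coder FakeKeyedWorkItemCoder.of(kvCoder)
// streaming, combine    -> StreamingGroupAlsoByWindowsDoFns.create(...),            input coder FakeKeyedWorkItemCoder.of(combineFn KvCoder)
// batch, no combine     -> BatchGroupAlsoByWindowsDoFns.createForIterable(...),     input coder null
// batch, combine        -> BatchGroupAlsoByWindowsDoFns.create(...),                input coder null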

Aggregations

SideInputInfo (com.google.api.services.dataflow.model.SideInputInfo) 5
Source (com.google.api.services.dataflow.model.Source) 2
SideInputReader (org.apache.beam.runners.core.SideInputReader) 2
Structs.getString (org.apache.beam.runners.dataflow.util.Structs.getString) 2
TupleTag (org.apache.beam.sdk.values.TupleTag) 2
JacksonFactory (com.google.api.client.json.jackson2.JacksonFactory) 1
IOException (java.io.IOException) 1
ArrayList (java.util.ArrayList) 1
List (java.util.List) 1
Map (java.util.Map) 1
AtomicInteger (java.util.concurrent.atomic.AtomicInteger) 1
RunnerApi (org.apache.beam.model.pipeline.v1.RunnerApi) 1
RootCase (org.apache.beam.model.pipeline.v1.RunnerApi.MessageWithComponents.RootCase) 1
NullSideInputReader (org.apache.beam.runners.core.NullSideInputReader) 1
StateInternalsFactory (org.apache.beam.runners.core.StateInternalsFactory) 1
RehydratedComponents (org.apache.beam.runners.core.construction.RehydratedComponents) 1
WindowingStrategyTranslation (org.apache.beam.runners.core.construction.WindowingStrategyTranslation) 1
DataflowRunner (org.apache.beam.runners.dataflow.DataflowRunner) 1
DataflowPipelineDebugOptions (org.apache.beam.runners.dataflow.options.DataflowPipelineDebugOptions) 1
CloudObject (org.apache.beam.runners.dataflow.util.CloudObject) 1