Search in sources :

Example 46 with Coder

use of org.apache.beam.sdk.coders.Coder in project beam by apache.

In the class IsmSideInputReaderTest, the method testIsmReaderReferenceCaching:

@Test
public void testIsmReaderReferenceCaching() throws Exception {
    // A singleton side-input view; its internal tag keys the reader's caches below.
    final PCollectionView<Long> view = Pipeline.create().apply(Create.empty(VarLongCoder.of())).apply(View.asSingleton());
    Coder<WindowedValue<Long>> windowedLongCoder = WindowedValue.getFullCoder(VarLongCoder.of(), GLOBAL_WINDOW_CODER);
    final WindowedValue<Long> singleElement = valueInGlobalWindow(42L);
    // Build two ISM input files: one holding a single record and one holding none.
    final Source nonEmptySource =
        initInputFile(
            fromValues(Arrays.asList(singleElement)),
            IsmRecordCoder.of(1, 0, ImmutableList.<Coder<?>>of(GLOBAL_WINDOW_CODER), windowedLongCoder));
    final Source emptySource =
        initInputFile(
            fromValues(Arrays.asList()),
            IsmRecordCoder.of(1, 0, ImmutableList.<Coder<?>>of(GLOBAL_WINDOW_CODER), windowedLongCoder));
    final IsmSideInputReader reader = sideInputReader(view.getTagInternal().getId(), nonEmptySource, emptySource);
    // The non-empty file yields exactly one cached IsmReader under the view's tag,
    // resolving to the file the source was initialized with.
    assertTrue(reader.tagToIsmReaderMap.containsKey(view.getTagInternal()));
    assertEquals(1, reader.tagToIsmReaderMap.get(view.getTagInternal()).size());
    assertEquals(
        FileSystems.matchSingleFileSpec(getString(nonEmptySource.getSpec(), WorkerPropertyNames.FILENAME)).resourceId(),
        reader.tagToIsmReaderMap.get(view.getTagInternal()).get(0).getResourceId());
    // The empty file is tracked in a separate map, also keyed by the same tag.
    assertTrue(reader.tagToEmptyIsmReaderMap.containsKey(view.getTagInternal()));
    assertEquals(1, reader.tagToEmptyIsmReaderMap.get(view.getTagInternal()).size());
    assertEquals(
        FileSystems.matchSingleFileSpec(getString(emptySource.getSpec(), WorkerPropertyNames.FILENAME)).resourceId(),
        reader.tagToEmptyIsmReaderMap.get(view.getTagInternal()).get(0).getResourceId());
}
Also used : KvCoder(org.apache.beam.sdk.coders.KvCoder) IterableCoder(org.apache.beam.sdk.coders.IterableCoder) IsmRecordCoder(org.apache.beam.runners.dataflow.internal.IsmFormat.IsmRecordCoder) MapCoder(org.apache.beam.sdk.coders.MapCoder) FullWindowedValueCoder(org.apache.beam.sdk.util.WindowedValue.FullWindowedValueCoder) VarLongCoder(org.apache.beam.sdk.coders.VarLongCoder) Coder(org.apache.beam.sdk.coders.Coder) StringUtf8Coder(org.apache.beam.sdk.coders.StringUtf8Coder) BigEndianLongCoder(org.apache.beam.sdk.coders.BigEndianLongCoder) ByteArrayCoder(org.apache.beam.sdk.coders.ByteArrayCoder) MetadataKeyCoder(org.apache.beam.runners.dataflow.internal.IsmFormat.MetadataKeyCoder) WindowedValue(org.apache.beam.sdk.util.WindowedValue) Source(com.google.api.services.dataflow.model.Source) Test(org.junit.Test)

Example 47 with Coder

use of org.apache.beam.sdk.coders.Coder in project beam by apache.

In the class IsmReaderFactoryTest, the method testFactory:

@Test
public void testFactory() throws Exception {
    String ismFilePath = tmpFolder.newFile().getPath();
    ResourceId expectedResourceId = FileSystems.matchSingleFileSpec(ismFilePath).resourceId();
    // The factory requires a windowed coder wrapping an ISM record coder.
    WindowedValueCoder<?> windowedCoder =
        WindowedValue.getFullCoder(
            IsmRecordCoder.of(1, 0, ImmutableList.<Coder<?>>of(StringUtf8Coder.of()), VarLongCoder.of()),
            GlobalWindow.Coder.INSTANCE);
    @SuppressWarnings("rawtypes")
    IsmReader<?> reader =
        (IsmReader)
            new IsmReaderFactory()
                .create(createSpecForFilename(ismFilePath), windowedCoder, options, executionContext, operationContext);
    // The created reader must expose the unwrapped value coder and resolve to the backing file.
    assertEquals(windowedCoder.getValueCoder(), reader.getCoder());
    assertEquals(expectedResourceId, reader.getResourceId());
}
Also used : WindowedValueCoder(org.apache.beam.sdk.util.WindowedValue.WindowedValueCoder) Coder(org.apache.beam.sdk.coders.Coder) StringUtf8Coder(org.apache.beam.sdk.coders.StringUtf8Coder) VarLongCoder(org.apache.beam.sdk.coders.VarLongCoder) IsmRecordCoder(org.apache.beam.runners.dataflow.internal.IsmFormat.IsmRecordCoder) ResourceId(org.apache.beam.sdk.io.fs.ResourceId) Test(org.junit.Test)

Example 48 with Coder

use of org.apache.beam.sdk.coders.Coder in project beam by apache.

In the class UserParDoFnFactory, the method create:

/**
 * Builds the {@link ParDoFn} that will execute the user's serialized {@code DoFn}.
 *
 * <p>Dispatches on the concrete {@code DoFn} type: the two special cases below are
 * marker classes installed by DataflowRunner overrides (see the HACK comments), and
 * everything else runs as a plain {@link SimpleParDoFn}.
 */
@Override
public ParDoFn create(PipelineOptions options, CloudObject cloudUserFn, @Nullable List<SideInputInfo> sideInputInfos, TupleTag<?> mainOutputTag, Map<TupleTag<?>, Integer> outputTupleTagsToReceiverIndices, DataflowExecutionContext<?> executionContext, DataflowOperationContext operationContext) throws Exception {
    // Cache a cloning pool of DoFn instances per step (keyed by system name) so the
    // user fn is deserialized at most once per step.
    DoFnInstanceManager instanceManager = fnCache.get(operationContext.nameContext().systemName(), () -> DoFnInstanceManagers.cloningPool(doFnExtractor.getDoFnInfo(cloudUserFn), options));
    // Peek (without removing) one instance to inspect its metadata for dispatch.
    DoFnInfo<?, ?> doFnInfo = instanceManager.peek();
    DataflowExecutionContext.DataflowStepContext stepContext = executionContext.getStepContext(operationContext);
    Iterable<PCollectionView<?>> sideInputViews = doFnInfo.getSideInputViews();
    SideInputReader sideInputReader = executionContext.getSideInputReader(sideInputInfos, sideInputViews, operationContext);
    if (doFnInfo.getDoFn() instanceof BatchStatefulParDoOverrides.BatchStatefulDoFn) {
        // HACK: BatchStatefulDoFn is a class from DataflowRunner's overrides
        // that just instructs the worker to execute it differently. This will
        // be replaced by metadata in the Runner API payload
        BatchStatefulParDoOverrides.BatchStatefulDoFn fn = (BatchStatefulParDoOverrides.BatchStatefulDoFn) doFnInfo.getDoFn();
        // Unwrap the marker and run the underlying fn, single-instance (stateful fns
        // must not be cloned across bundles), wrapped for batch ungrouping.
        DoFn underlyingFn = fn.getUnderlyingDoFn();
        return new BatchModeUngroupingParDoFn((BatchModeExecutionContext.StepContext) stepContext, new SimpleParDoFn(options, DoFnInstanceManagers.singleInstance(doFnInfo.withFn(underlyingFn)), sideInputReader, doFnInfo.getMainOutput(), outputTupleTagsToReceiverIndices, stepContext, operationContext, doFnInfo.getDoFnSchemaInformation(), doFnInfo.getSideInputMapping(), runnerFactory));
    } else if (doFnInfo.getDoFn() instanceof StreamingPCollectionViewWriterFn) {
        // HACK: StreamingPCollectionViewWriterFn is a class from
        // DataflowPipelineTranslator. Using the class as an indicator is a migration path
        // to simply having an indicator string.
        checkArgument(stepContext instanceof StreamingModeExecutionContext.StreamingModeStepContext, "stepContext must be a StreamingModeStepContext to use StreamingPCollectionViewWriterFn");
        DataflowRunner.StreamingPCollectionViewWriterFn<Object> writerFn = (StreamingPCollectionViewWriterFn<Object>) doFnInfo.getDoFn();
        return new StreamingPCollectionViewWriterParDoFn((StreamingModeExecutionContext.StreamingModeStepContext) stepContext, writerFn.getView().getTagInternal(), writerFn.getDataCoder(), (Coder<BoundedWindow>) doFnInfo.getWindowingStrategy().getWindowFn().windowCoder());
    } else {
        // Default path: a regular user DoFn, sharing the pooled instance manager.
        return new SimpleParDoFn(options, instanceManager, sideInputReader, doFnInfo.getMainOutput(), outputTupleTagsToReceiverIndices, stepContext, operationContext, doFnInfo.getDoFnSchemaInformation(), doFnInfo.getSideInputMapping(), runnerFactory);
    }
}
Also used : Coder(org.apache.beam.sdk.coders.Coder) SideInputReader(org.apache.beam.runners.core.SideInputReader) BatchStatefulParDoOverrides(org.apache.beam.runners.dataflow.BatchStatefulParDoOverrides) StreamingPCollectionViewWriterFn(org.apache.beam.runners.dataflow.DataflowRunner.StreamingPCollectionViewWriterFn) PCollectionView(org.apache.beam.sdk.values.PCollectionView) DoFn(org.apache.beam.sdk.transforms.DoFn) ParDoFn(org.apache.beam.runners.dataflow.worker.util.common.worker.ParDoFn) CloudObject(org.apache.beam.runners.dataflow.util.CloudObject)

Example 49 with Coder

use of org.apache.beam.sdk.coders.Coder in project beam by apache.

In the class IsmSinkFactory, the method create:

/**
 * Creates an {@link IsmSink} for the given spec, validating that the supplied
 * coder is a windowed ISM record coder before constructing the sink.
 */
@Override
public Sink<?> create(CloudObject spec, @Nullable Coder<?> coder, @Nullable PipelineOptions options, @Nullable DataflowExecutionContext executionContext, DataflowOperationContext operationContext) throws Exception {
    options = checkArgumentNotNull(options);
    coder = checkArgumentNotNull(coder);
    // The validity of this coder is checked in detail by the typed create, below
    @SuppressWarnings("unchecked")
    Coder<WindowedValue<IsmRecord<Object>>> recordCoder = (Coder<WindowedValue<IsmRecord<Object>>>) coder;
    // The coder must be a WindowedValueCoder wrapping an IsmRecordCoder.
    checkArgument(recordCoder instanceof WindowedValueCoder, "%s only supports using %s but got %s.", IsmSink.class, WindowedValueCoder.class, recordCoder);
    WindowedValueCoder<IsmRecord<Object>> windowed = (WindowedValueCoder<IsmRecord<Object>>) recordCoder;
    checkArgument(windowed.getValueCoder() instanceof IsmRecordCoder, "%s only supports using %s but got %s.", IsmSink.class, IsmRecordCoder.class, windowed.getValueCoder());
    @SuppressWarnings("unchecked")
    IsmRecordCoder<Object> ismRecordCoder = (IsmRecordCoder<Object>) windowed.getValueCoder();
    String outputFilename = getString(spec, WorkerPropertyNames.FILENAME);
    // Cap the bloom filter size at a fraction of the worker cache, with a fixed floor.
    long bloomFilterLimitBytes =
        Math.max(
            MIN_BLOOM_FILTER_SIZE_BYTES,
            DoubleMath.roundToLong(
                BLOOM_FILTER_SIZE_LIMIT_MULTIPLIER
                    * options.as(DataflowWorkerHarnessOptions.class).getWorkerCacheMb()
                    * // Note the conversion from MiB to bytes
                    1024
                    * 1024,
                RoundingMode.DOWN));
    return new IsmSink<>(FileSystems.matchNewResource(outputFilename, false), ismRecordCoder, bloomFilterLimitBytes);
}
Also used : WindowedValueCoder(org.apache.beam.sdk.util.WindowedValue.WindowedValueCoder) Coder(org.apache.beam.sdk.coders.Coder) IsmRecordCoder(org.apache.beam.runners.dataflow.internal.IsmFormat.IsmRecordCoder) IsmRecord(org.apache.beam.runners.dataflow.internal.IsmFormat.IsmRecord) Structs.getString(org.apache.beam.runners.dataflow.util.Structs.getString) IsmRecordCoder(org.apache.beam.runners.dataflow.internal.IsmFormat.IsmRecordCoder) WindowedValueCoder(org.apache.beam.sdk.util.WindowedValue.WindowedValueCoder) WindowedValue(org.apache.beam.sdk.util.WindowedValue) CloudObject(org.apache.beam.runners.dataflow.util.CloudObject)

Example 50 with Coder

use of org.apache.beam.sdk.coders.Coder in project beam by apache.

In the class FlinkStreamingPortablePipelineTranslator, the method translateFlatten:

/**
 * Translates a Flatten transform into a Flink union of its input streams.
 *
 * <p>With no inputs, registers a dummy source that emits nothing so downstream
 * operators still have an upstream. With inputs, unions them, first uniquifying
 * any stream that appears more than once (see comment below).
 */
private <T> void translateFlatten(String id, RunnerApi.Pipeline pipeline, StreamingTranslationContext context) {
    RunnerApi.PTransform transform = pipeline.getComponents().getTransformsOrThrow(id);
    Map<String, String> allInputs = transform.getInputsMap();
    if (allInputs.isEmpty()) {
        // create an empty dummy source to satisfy downstream operations
        // we cannot create an empty source in Flink, therefore we have to
        // add the flatMap that simply never forwards the single element
        long shutdownAfterIdleSourcesMs = context.getPipelineOptions().getShutdownSourcesAfterIdleMs();
        DataStreamSource<WindowedValue<byte[]>> dummySource = context.getExecutionEnvironment().addSource(new ImpulseSourceFunction(shutdownAfterIdleSourcesMs));
        // flatMap drops the impulse element; only the type information matters downstream.
        DataStream<WindowedValue<T>> result = dummySource.<WindowedValue<T>>flatMap((s, collector) -> {
        // never return anything
        }).returns(new CoderTypeInformation<>(WindowedValue.getFullCoder((Coder<T>) VoidCoder.of(), GlobalWindow.Coder.INSTANCE), context.getPipelineOptions()));
        context.addDataStream(Iterables.getOnlyElement(transform.getOutputsMap().values()), result);
    } else {
        DataStream<T> result = null;
        // Determine DataStreams that we use as input several times. For those, we need to uniquify
        // input streams because Flink seems to swallow watermarks when we have a union of one and
        // the same stream.
        HashMultiset<DataStream<T>> inputCounts = HashMultiset.create();
        for (String input : allInputs.values()) {
            DataStream<T> current = context.getDataStreamOrThrow(input);
            inputCounts.add(current, 1);
        }
        for (String input : allInputs.values()) {
            DataStream<T> current = context.getDataStreamOrThrow(input);
            final int timesRequired = inputCounts.count(current);
            if (timesRequired > 1) {
                // Insert an identity flatMap to make this occurrence a distinct stream
                // object, so the union below never unions a stream with itself.
                current = current.flatMap(new FlatMapFunction<T, T>() {

                    private static final long serialVersionUID = 1L;

                    @Override
                    public void flatMap(T t, Collector<T> collector) {
                        collector.collect(t);
                    }
                });
            }
            // Fold all inputs into a single union; the first input seeds the result.
            result = (result == null) ? current : result.union(current);
        }
        context.addDataStream(Iterables.getOnlyElement(transform.getOutputsMap().values()), result);
    }
}
Also used : SingletonKeyedWorkItemCoder(org.apache.beam.runners.flink.translation.wrappers.streaming.SingletonKeyedWorkItemCoder) WindowedValueCoder(org.apache.beam.sdk.util.WindowedValue.WindowedValueCoder) FlinkExecutableStageContextFactory(org.apache.beam.runners.flink.translation.functions.FlinkExecutableStageContextFactory) CoderUtils(org.apache.beam.sdk.util.CoderUtils) WireCoders(org.apache.beam.runners.fnexecution.wire.WireCoders) UnboundedSource(org.apache.beam.sdk.io.UnboundedSource) PCollectionViews(org.apache.beam.sdk.values.PCollectionViews) SdkHarnessClient(org.apache.beam.runners.fnexecution.control.SdkHarnessClient) ImmutableMap(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableMap) RunnerPCollectionView(org.apache.beam.runners.core.construction.RunnerPCollectionView) ImmutableSet(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableSet) TestStreamSource(org.apache.beam.runners.flink.translation.wrappers.streaming.io.TestStreamSource) Map(java.util.Map) TestStreamTranslation(org.apache.beam.runners.core.construction.TestStreamTranslation) GlobalWindow(org.apache.beam.sdk.transforms.windowing.GlobalWindow) JsonNode(com.fasterxml.jackson.databind.JsonNode) CoderTypeInformation(org.apache.beam.runners.flink.translation.types.CoderTypeInformation) KvCoder(org.apache.beam.sdk.coders.KvCoder) PTransformTranslation(org.apache.beam.runners.core.construction.PTransformTranslation) WindowDoFnOperator(org.apache.beam.runners.flink.translation.wrappers.streaming.WindowDoFnOperator) Set(java.util.Set) OutputTag(org.apache.flink.util.OutputTag) ExecutableStage(org.apache.beam.runners.core.construction.graph.ExecutableStage) ExecutableStageTranslation.generateNameFromStagePayload(org.apache.beam.runners.core.construction.ExecutableStageTranslation.generateNameFromStagePayload) FlatMapFunction(org.apache.flink.api.common.functions.FlatMapFunction) CoderException(org.apache.beam.sdk.coders.CoderException) 
WindowingStrategyTranslation(org.apache.beam.runners.core.construction.WindowingStrategyTranslation) TestStream(org.apache.beam.sdk.testing.TestStream) PipelineTranslatorUtils.instantiateCoder(org.apache.beam.runners.fnexecution.translation.PipelineTranslatorUtils.instantiateCoder) ValueWithRecordId(org.apache.beam.sdk.values.ValueWithRecordId) KV(org.apache.beam.sdk.values.KV) TypeDescriptor(org.apache.beam.sdk.values.TypeDescriptor) ArrayList(java.util.ArrayList) LinkedHashMap(java.util.LinkedHashMap) StreamingImpulseSource(org.apache.beam.runners.flink.translation.wrappers.streaming.io.StreamingImpulseSource) RichMapFunction(org.apache.flink.api.common.functions.RichMapFunction) Collector(org.apache.flink.util.Collector) TupleTag(org.apache.beam.sdk.values.TupleTag) Maps(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.Maps) BiMap(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.BiMap) InvalidProtocolBufferException(org.apache.beam.vendor.grpc.v1p43p2.com.google.protobuf.InvalidProtocolBufferException) RunnerApi(org.apache.beam.model.pipeline.v1.RunnerApi) QueryablePipeline(org.apache.beam.runners.core.construction.graph.QueryablePipeline) IterableCoder(org.apache.beam.sdk.coders.IterableCoder) SingleOutputStreamOperator(org.apache.flink.streaming.api.datastream.SingleOutputStreamOperator) IOException(java.io.IOException) DedupingOperator(org.apache.beam.runners.flink.translation.wrappers.streaming.io.DedupingOperator) BoundedSource(org.apache.beam.sdk.io.BoundedSource) TreeMap(java.util.TreeMap) PCollectionView(org.apache.beam.sdk.values.PCollectionView) AutoService(com.google.auto.service.AutoService) BoundedWindow(org.apache.beam.sdk.transforms.windowing.BoundedWindow) PipelineNode(org.apache.beam.runners.core.construction.graph.PipelineNode) VoidCoder(org.apache.beam.sdk.coders.VoidCoder) UnboundedSourceWrapper(org.apache.beam.runners.flink.translation.wrappers.streaming.io.UnboundedSourceWrapper) 
FileSystems(org.apache.beam.sdk.io.FileSystems) SystemReduceFn(org.apache.beam.runners.core.SystemReduceFn) SerializablePipelineOptions(org.apache.beam.runners.core.construction.SerializablePipelineOptions) WindowedValue(org.apache.beam.sdk.util.WindowedValue) PipelineTranslatorUtils.getWindowingStrategy(org.apache.beam.runners.fnexecution.translation.PipelineTranslatorUtils.getWindowingStrategy) WorkItemKeySelector(org.apache.beam.runners.flink.translation.wrappers.streaming.WorkItemKeySelector) KvToByteBufferKeySelector(org.apache.beam.runners.flink.translation.wrappers.streaming.KvToByteBufferKeySelector) SerializableFunction(org.apache.beam.sdk.transforms.SerializableFunction) RehydratedComponents(org.apache.beam.runners.core.construction.RehydratedComponents) DoFnOperator(org.apache.beam.runners.flink.translation.wrappers.streaming.DoFnOperator) ByteBuffer(java.nio.ByteBuffer) Sets(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.Sets) Locale(java.util.Locale) Iterables(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.Iterables) JobInfo(org.apache.beam.runners.fnexecution.provisioning.JobInfo) TypeInformation(org.apache.flink.api.common.typeinfo.TypeInformation) KeyedWorkItem(org.apache.beam.runners.core.KeyedWorkItem) KeySelector(org.apache.flink.api.java.functions.KeySelector) TwoInputTransformation(org.apache.flink.streaming.api.transformations.TwoInputTransformation) KeyedStream(org.apache.flink.streaming.api.datastream.KeyedStream) String.format(java.lang.String.format) ModelCoders(org.apache.beam.runners.core.construction.ModelCoders) UnionCoder(org.apache.beam.sdk.transforms.join.UnionCoder) JobExecutionResult(org.apache.flink.api.common.JobExecutionResult) List(java.util.List) TypeDescriptors(org.apache.beam.sdk.values.TypeDescriptors) WindowingStrategy(org.apache.beam.sdk.values.WindowingStrategy) StreamExecutionEnvironment(org.apache.flink.streaming.api.environment.StreamExecutionEnvironment) 
HashMultiset(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.HashMultiset) PipelineTranslatorUtils.createOutputMap(org.apache.beam.runners.fnexecution.translation.PipelineTranslatorUtils.createOutputMap) ReadTranslation(org.apache.beam.runners.core.construction.ReadTranslation) ExecutableStageDoFnOperator(org.apache.beam.runners.flink.translation.wrappers.streaming.ExecutableStageDoFnOperator) Coder(org.apache.beam.sdk.coders.Coder) HashMap(java.util.HashMap) DataStreamSource(org.apache.flink.streaming.api.datastream.DataStreamSource) RawUnionValue(org.apache.beam.sdk.transforms.join.RawUnionValue) ImpulseSourceFunction(org.apache.beam.runners.flink.translation.functions.ImpulseSourceFunction) ViewFn(org.apache.beam.sdk.transforms.ViewFn) PipelineOptions(org.apache.beam.sdk.options.PipelineOptions) SdfByteBufferKeySelector(org.apache.beam.runners.flink.translation.wrappers.streaming.SdfByteBufferKeySelector) NativeTransforms(org.apache.beam.runners.core.construction.NativeTransforms) ObjectMapper(com.fasterxml.jackson.databind.ObjectMapper) Configuration(org.apache.flink.configuration.Configuration) Lists(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.Lists) DataStream(org.apache.flink.streaming.api.datastream.DataStream) ByteArrayCoder(org.apache.beam.sdk.coders.ByteArrayCoder) SourceInputFormat(org.apache.beam.runners.flink.translation.wrappers.SourceInputFormat) Collections(java.util.Collections) DataStream(org.apache.flink.streaming.api.datastream.DataStream) ImpulseSourceFunction(org.apache.beam.runners.flink.translation.functions.ImpulseSourceFunction) RunnerApi(org.apache.beam.model.pipeline.v1.RunnerApi) WindowedValue(org.apache.beam.sdk.util.WindowedValue) FlatMapFunction(org.apache.flink.api.common.functions.FlatMapFunction) Collector(org.apache.flink.util.Collector)

Aggregations

Coder (org.apache.beam.sdk.coders.Coder)119 KvCoder (org.apache.beam.sdk.coders.KvCoder)75 WindowedValue (org.apache.beam.sdk.util.WindowedValue)55 StringUtf8Coder (org.apache.beam.sdk.coders.StringUtf8Coder)44 Test (org.junit.Test)43 HashMap (java.util.HashMap)42 ArrayList (java.util.ArrayList)38 Map (java.util.Map)36 BoundedWindow (org.apache.beam.sdk.transforms.windowing.BoundedWindow)35 List (java.util.List)32 KV (org.apache.beam.sdk.values.KV)30 RunnerApi (org.apache.beam.model.pipeline.v1.RunnerApi)28 IterableCoder (org.apache.beam.sdk.coders.IterableCoder)28 PCollection (org.apache.beam.sdk.values.PCollection)28 TupleTag (org.apache.beam.sdk.values.TupleTag)24 ByteString (org.apache.beam.vendor.grpc.v1p43p2.com.google.protobuf.ByteString)23 IOException (java.io.IOException)22 PCollectionView (org.apache.beam.sdk.values.PCollectionView)22 Instant (org.joda.time.Instant)21 WindowingStrategy (org.apache.beam.sdk.values.WindowingStrategy)20