Example 61 with Coder

use of org.apache.beam.sdk.coders.Coder in project beam by apache.

the class WriteFiles method createWrite.

/**
   * A write is performed as a sequence of three {@link ParDo}s.
   *
   * <p>In the first, do-once ParDo, a singleton collection containing the
   * {@link WriteOperation} is produced and the write operation is initialized.
   *
   * <p>This singleton collection containing the WriteOperation is then used as a side
   * input to a ParDo over the PCollection of elements to write. In this bundle-writing phase,
   * {@link WriteOperation#createWriter} is called to obtain a {@link Writer}.
   * {@link Writer#open} and {@link Writer#close} are called in
   * {@link DoFn.StartBundle} and {@link DoFn.FinishBundle}, respectively, and
   * the {@link Writer#write} method is called for every element in the bundle. The output
   * of this ParDo is a PCollection of <i>writer result</i> objects (see {@link FileBasedSink}
   * for a description of writer results), one for each bundle.
   *
   * <p>The final do-once ParDo uses a singleton collection as input and the collection of writer
   * results as a side input. In this ParDo, {@link WriteOperation#finalize} is called
   * to finalize the write.
   *
   * <p>If the write of any element in the PCollection fails, {@link Writer#close} will be
   * called before the exception that caused the write to fail is propagated and the write result
   * will be discarded.
   *
   * <p>Since the {@link WriteOperation} is serialized after the initialization ParDo and
   * deserialized in the bundle-writing and finalization phases, any state change to the
   * WriteOperation object that occurs during initialization is visible in the latter
   * phases. However, the WriteOperation is not serialized after the bundle-writing
   * phase. For this reason, implementations should guarantee that
   * {@link WriteOperation#createWriter} does not mutate the WriteOperation.
   */
private PDone createWrite(PCollection<T> input) {
    Pipeline p = input.getPipeline();
    if (!windowedWrites) {
        // Re-window the data into the global window and remove any existing triggers.
        input = input.apply(Window.<T>into(new GlobalWindows()).triggering(DefaultTrigger.of()).discardingFiredPanes());
    }
    // Perform the per-bundle writes as a ParDo on the input PCollection (with the
    // WriteOperation as a side input) and collect the results of the writes in a
    // PCollection. There is a dependency between this ParDo and the first (the
    // WriteOperation PCollection as a side input), so this will happen after the
    // initial ParDo.
    PCollection<FileResult> results;
    final PCollectionView<Integer> numShardsView;
    Coder<BoundedWindow> shardedWindowCoder = (Coder<BoundedWindow>) input.getWindowingStrategy().getWindowFn().windowCoder();
    if (computeNumShards == null && numShardsProvider == null) {
        numShardsView = null;
        results = input.apply("WriteBundles", ParDo.of(windowedWrites ? new WriteWindowedBundles() : new WriteUnwindowedBundles()));
    } else {
        List<PCollectionView<?>> sideInputs = Lists.newArrayList();
        if (computeNumShards != null) {
            numShardsView = input.apply(computeNumShards);
            sideInputs.add(numShardsView);
        } else {
            numShardsView = null;
        }
        PCollection<KV<Integer, Iterable<T>>> sharded = input.apply("ApplyShardLabel", ParDo.of(new ApplyShardingKey<T>(numShardsView, (numShardsView != null) ? null : numShardsProvider)).withSideInputs(sideInputs)).apply("GroupIntoShards", GroupByKey.<Integer, T>create());
        shardedWindowCoder = (Coder<BoundedWindow>) sharded.getWindowingStrategy().getWindowFn().windowCoder();
        results = sharded.apply("WriteShardedBundles", ParDo.of(new WriteShardedBundles()));
    }
    results.setCoder(FileResultCoder.of(shardedWindowCoder));
    if (windowedWrites) {
        // When processing streaming windowed writes, results will arrive multiple times. This
        // means we can't share the below implementation that turns the results into a side input,
        // as new data arriving into a side input does not trigger the listening DoFn. Instead
        // we aggregate the result set using a singleton GroupByKey, so the DoFn will be triggered
        // whenever new data arrives.
        PCollection<KV<Void, FileResult>> keyedResults = results.apply("AttachSingletonKey", WithKeys.<Void, FileResult>of((Void) null));
        keyedResults.setCoder(KvCoder.of(VoidCoder.of(), FileResultCoder.of(shardedWindowCoder)));
        // Is the continuation trigger sufficient?
        keyedResults.apply("FinalizeGroupByKey", GroupByKey.<Void, FileResult>create()).apply("Finalize", ParDo.of(new DoFn<KV<Void, Iterable<FileResult>>, Integer>() {

            @ProcessElement
            public void processElement(ProcessContext c) throws Exception {
                LOG.info("Finalizing write operation {}.", writeOperation);
                List<FileResult> results = Lists.newArrayList(c.element().getValue());
                writeOperation.finalize(results);
                LOG.debug("Done finalizing write operation");
            }
        }));
    } else {
        final PCollectionView<Iterable<FileResult>> resultsView = results.apply(View.<FileResult>asIterable());
        ImmutableList.Builder<PCollectionView<?>> sideInputs = ImmutableList.<PCollectionView<?>>builder().add(resultsView);
        if (numShardsView != null) {
            sideInputs.add(numShardsView);
        }
        // Finalize the write in another do-once ParDo on a singleton collection. The results
        // from the per-bundle writes are given as an Iterable side input.
        // The WriteOperation's state is the same as after its initialization in the first
        // do-once ParDo. There is a dependency between this ParDo and the parallel write (the writer
        // results collection as a side input), so it will happen after the parallel write.
        // For the non-windowed case, we guarantee that if no data is written but the user has
        // set numShards, then all shards will be written out as empty files. For this reason we
        // use a side input here.
        PCollection<Void> singletonCollection = p.apply(Create.of((Void) null));
        singletonCollection.apply("Finalize", ParDo.of(new DoFn<Void, Integer>() {

            @ProcessElement
            public void processElement(ProcessContext c) throws Exception {
                LOG.info("Finalizing write operation {}.", writeOperation);
                List<FileResult> results = Lists.newArrayList(c.sideInput(resultsView));
                LOG.debug("Side input initialized to finalize write operation {}.", writeOperation);
                // We must always output at least 1 shard, and honor user-specified numShards if
                // set.
                int minShardsNeeded;
                if (numShardsView != null) {
                    minShardsNeeded = c.sideInput(numShardsView);
                } else if (numShardsProvider != null) {
                    minShardsNeeded = numShardsProvider.get();
                } else {
                    minShardsNeeded = 1;
                }
                int extraShardsNeeded = minShardsNeeded - results.size();
                if (extraShardsNeeded > 0) {
                    LOG.info("Creating {} empty output shards in addition to {} written for a total of {}.", extraShardsNeeded, results.size(), minShardsNeeded);
                    for (int i = 0; i < extraShardsNeeded; ++i) {
                        Writer<T> writer = writeOperation.createWriter();
                        writer.openUnwindowed(UUID.randomUUID().toString(), UNKNOWN_SHARDNUM);
                        FileResult emptyWrite = writer.close();
                        results.add(emptyWrite);
                    }
                    LOG.debug("Done creating extra shards.");
                }
                writeOperation.finalize(results);
                LOG.debug("Done finalizing write operation {}", writeOperation);
            }
        }).withSideInputs(sideInputs.build()));
    }
    return PDone.in(input.getPipeline());
}
Also used : ImmutableList(com.google.common.collect.ImmutableList) BoundedWindow(org.apache.beam.sdk.transforms.windowing.BoundedWindow) List(java.util.List) Coder(org.apache.beam.sdk.coders.Coder) KvCoder(org.apache.beam.sdk.coders.KvCoder) FileResultCoder(org.apache.beam.sdk.io.FileBasedSink.FileResultCoder) VoidCoder(org.apache.beam.sdk.coders.VoidCoder) GlobalWindows(org.apache.beam.sdk.transforms.windowing.GlobalWindows) KV(org.apache.beam.sdk.values.KV) Pipeline(org.apache.beam.sdk.Pipeline) PCollectionView(org.apache.beam.sdk.values.PCollectionView) DoFn(org.apache.beam.sdk.transforms.DoFn) FileResult(org.apache.beam.sdk.io.FileBasedSink.FileResult) Writer(org.apache.beam.sdk.io.FileBasedSink.Writer)
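
For context, a minimal usage sketch of the transform this method backs. This is an illustration under assumptions, not part of the example above: mySink stands in for some concrete FileBasedSink<String>, and the pipeline options and input path are made up.

// Hypothetical driver: apply WriteFiles to a PCollection via a sketched-in sink.
Pipeline pipeline = Pipeline.create(options);
PCollection<String> lines = pipeline.apply(TextIO.read().from("/tmp/input*"));
// WriteFiles.to(...) sets up the ParDo sequence documented in createWrite above;
// withNumShards(...) selects the sharded (ApplyShardLabel/GroupIntoShards) branch.
lines.apply("WriteToSink", WriteFiles.to(mySink).withNumShards(3));
pipeline.run();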

Example 62 with Coder

use of org.apache.beam.sdk.coders.Coder in project beam by apache.

the class FlinkKeyGroupStateInternals method snapshotKeyGroupState.

/**
   * Snapshots the state {@code (stateName -> (valueCoder && (namespace -> value)))} for a given
   * {@code keyGroupIdx}.
   *
   * @param keyGroupIdx the id of the key-group to be put in the snapshot.
   * @param out the stream to write to.
   */
public void snapshotKeyGroupState(int keyGroupIdx, DataOutputStream out) throws Exception {
    int localIdx = getIndexForKeyGroup(keyGroupIdx);
    Map<String, Tuple2<Coder<?>, Map<String, ?>>> stateTable = stateTables[localIdx];
    Preconditions.checkState(stateTable.size() <= Short.MAX_VALUE, "Too many States: " + stateTable.size() + ". Currently at most " + Short.MAX_VALUE + " states are supported");
    out.writeShort(stateTable.size());
    for (Map.Entry<String, Tuple2<Coder<?>, Map<String, ?>>> entry : stateTable.entrySet()) {
        out.writeUTF(entry.getKey());
        Coder coder = entry.getValue().f0;
        InstantiationUtil.serializeObject(out, coder);
        Map<String, ?> map = entry.getValue().f1;
        out.writeInt(map.size());
        for (Map.Entry<String, ?> entry1 : map.entrySet()) {
            StringUtf8Coder.of().encode(entry1.getKey(), out);
            coder.encode(entry1.getValue(), out);
        }
    }
}
Also used : Coder(org.apache.beam.sdk.coders.Coder) ListCoder(org.apache.beam.sdk.coders.ListCoder) StringUtf8Coder(org.apache.beam.sdk.coders.StringUtf8Coder) Tuple2(org.apache.flink.api.java.tuple.Tuple2) HashMap(java.util.HashMap) Map(java.util.Map)
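
For context, a sketch of the matching read-back, derived field by field from the wire format written above (a short count of states, then per state a UTF name, a Java-serialized coder, and a string-keyed map of coder-encoded values). The method name restoreKeyGroupState and the userClassloader parameter are assumptions for illustration, not necessarily the actual Flink runner API.

// Hypothetical inverse of snapshotKeyGroupState, mirroring its format.
public void restoreKeyGroupState(DataInputStream in, ClassLoader userClassloader) throws Exception {
    // matches out.writeShort(stateTable.size())
    int numStates = in.readShort();
    for (int i = 0; i < numStates; i++) {
        // matches out.writeUTF(entry.getKey())
        String stateName = in.readUTF();
        // matches InstantiationUtil.serializeObject(out, coder)
        Coder<Object> coder = InstantiationUtil.deserializeObject(in, userClassloader);
        // matches out.writeInt(map.size())
        int mapSize = in.readInt();
        Map<String, Object> namespaceToValue = new HashMap<>(mapSize);
        for (int j = 0; j < mapSize; j++) {
            String namespace = StringUtf8Coder.of().decode(in);
            namespaceToValue.put(namespace, coder.decode(in));
        }
        // ... re-register (stateName -> (coder, namespaceToValue)) in the state table ...
    }
}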

Example 63 with Coder

use of org.apache.beam.sdk.coders.Coder in project beam by apache.

the class GroupIntoBatches method expand.

@Override
public PCollection<KV<K, Iterable<InputT>>> expand(PCollection<KV<K, InputT>> input) {
    Duration allowedLateness = input.getWindowingStrategy().getAllowedLateness();
    checkArgument(input.getCoder() instanceof KvCoder, "coder specified in the input PCollection is not a KvCoder");
    KvCoder inputCoder = (KvCoder) input.getCoder();
    Coder<K> keyCoder = (Coder<K>) inputCoder.getCoderArguments().get(0);
    Coder<InputT> valueCoder = (Coder<InputT>) inputCoder.getCoderArguments().get(1);
    return input.apply(ParDo.of(new GroupIntoBatchesDoFn<>(batchSize, allowedLateness, keyCoder, valueCoder)));
}
Also used : KvCoder(org.apache.beam.sdk.coders.KvCoder) Coder(org.apache.beam.sdk.coders.Coder) KvCoder(org.apache.beam.sdk.coders.KvCoder) Duration(org.joda.time.Duration)
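
For context, a minimal usage sketch of the transform whose expand() is shown above; keyedInput is a hypothetical PCollection<KV<String, String>> standing in for real input.

// Batch elements per key into groups of at most 100 elements each.
PCollection<KV<String, Iterable<String>>> batched =
    keyedInput.apply("BatchPerKey", GroupIntoBatches.<String, String>ofSize(100));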

Example 64 with Coder

use of org.apache.beam.sdk.coders.Coder in project beam by apache.

the class ApexParDoOperator method fireTimer.

@Override
public void fireTimer(Object key, Collection<TimerData> timerDataSet) {
    pushbackDoFnRunner.startBundle();
    @SuppressWarnings("unchecked") Coder<Object> keyCoder = (Coder) currentKeyStateInternals.getKeyCoder();
    ((StateInternalsProxy) currentKeyStateInternals).setKey(key);
    currentKeyTimerInternals.setContext(key, keyCoder, new Instant(this.currentInputWatermark), new Instant(this.currentOutputWatermark));
    for (TimerData timerData : timerDataSet) {
        StateNamespace namespace = timerData.getNamespace();
        checkArgument(namespace instanceof WindowNamespace);
        BoundedWindow window = ((WindowNamespace<?>) namespace).getWindow();
        pushbackDoFnRunner.onTimer(timerData.getTimerId(), window, timerData.getTimestamp(), timerData.getDomain());
    }
    pushbackDoFnRunner.finishBundle();
}
Also used : WindowedValueCoder(org.apache.beam.sdk.util.WindowedValue.WindowedValueCoder) KeyedWorkItemCoder(org.apache.beam.runners.core.KeyedWorkItemCoder) ListCoder(org.apache.beam.sdk.coders.ListCoder) KvCoder(org.apache.beam.sdk.coders.KvCoder) Coder(org.apache.beam.sdk.coders.Coder) StringUtf8Coder(org.apache.beam.sdk.coders.StringUtf8Coder) VoidCoder(org.apache.beam.sdk.coders.VoidCoder) StateInternalsProxy(org.apache.beam.runners.apex.translation.utils.StateInternalsProxy) Instant(org.joda.time.Instant) WindowNamespace(org.apache.beam.runners.core.StateNamespaces.WindowNamespace) TimerData(org.apache.beam.runners.core.TimerInternals.TimerData) BoundedWindow(org.apache.beam.sdk.transforms.windowing.BoundedWindow) StateNamespace(org.apache.beam.runners.core.StateNamespace)
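
For context, a sketch of how a window-scoped timer of the kind fireTimer consumes might be constructed elsewhere in a runner. The timer id and timestamp are made-up values, and the factory calls are an assumption based on the org.apache.beam.runners.core API rather than code from this operator.

// A WindowNamespace wraps the window, which fireTimer later recovers via getWindow().
StateNamespace namespace = StateNamespaces.window(windowCoder, window);
TimerInternals.TimerData timerData =
    TimerInternals.TimerData.of("cleanupTimer", namespace, new Instant(0L), TimeDomain.EVENT_TIME);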

Example 65 with Coder

use of org.apache.beam.sdk.coders.Coder in project beam by apache.

the class ApexParDoOperator method setup.

@Override
public void setup(OperatorContext context) {
    this.traceTuples = ApexStreamTuple.Logging.isDebugEnabled(pipelineOptions.get(), this);
    SideInputReader sideInputReader = NullSideInputReader.of(sideInputs);
    if (!sideInputs.isEmpty()) {
        sideInputHandler = new SideInputHandler(sideInputs, sideInputStateInternals);
        sideInputReader = sideInputHandler;
    }
    for (int i = 0; i < additionalOutputTags.size(); i++) {
        @SuppressWarnings("unchecked") DefaultOutputPort<ApexStreamTuple<?>> port = (DefaultOutputPort<ApexStreamTuple<?>>) additionalOutputPorts[i];
        additionalOutputPortMapping.put(additionalOutputTags.get(i), port);
    }
    NoOpStepContext stepContext = new NoOpStepContext() {

        @Override
        public StateInternals stateInternals() {
            return currentKeyStateInternals;
        }

        @Override
        public TimerInternals timerInternals() {
            return currentKeyTimerInternals;
        }
    };
    DoFnRunner<InputT, OutputT> doFnRunner = DoFnRunners.simpleRunner(pipelineOptions.get(), doFn, sideInputReader, this, mainOutputTag, additionalOutputTags, stepContext, windowingStrategy);
    doFnInvoker = DoFnInvokers.invokerFor(doFn);
    doFnInvoker.invokeSetup();
    if (this.currentKeyStateInternals != null) {
        StatefulDoFnRunner.CleanupTimer cleanupTimer = new StatefulDoFnRunner.TimeInternalsCleanupTimer(stepContext.timerInternals(), windowingStrategy);
        @SuppressWarnings({ "rawtypes" }) Coder windowCoder = windowingStrategy.getWindowFn().windowCoder();
        @SuppressWarnings({ "unchecked" }) StatefulDoFnRunner.StateCleaner<?> stateCleaner = new StatefulDoFnRunner.StateInternalsStateCleaner<>(doFn, stepContext.stateInternals(), windowCoder);
        doFnRunner = DoFnRunners.defaultStatefulDoFnRunner(doFn, doFnRunner, windowingStrategy, cleanupTimer, stateCleaner);
    }
    pushbackDoFnRunner = SimplePushbackSideInputDoFnRunner.create(doFnRunner, sideInputs, sideInputHandler);
    if (doFn instanceof ProcessFn) {
        @SuppressWarnings("unchecked") StateInternalsFactory<String> stateInternalsFactory = (StateInternalsFactory<String>) this.currentKeyStateInternals.getFactory();
        @SuppressWarnings({ "rawtypes", "unchecked" }) ProcessFn<InputT, OutputT, Object, RestrictionTracker<Object>> splittableDoFn = (ProcessFn) doFn;
        splittableDoFn.setStateInternalsFactory(stateInternalsFactory);
        TimerInternalsFactory<String> timerInternalsFactory = new TimerInternalsFactory<String>() {

            @Override
            public TimerInternals timerInternalsForKey(String key) {
                return currentKeyTimerInternals;
            }
        };
        splittableDoFn.setTimerInternalsFactory(timerInternalsFactory);
        splittableDoFn.setProcessElementInvoker(new OutputAndTimeBoundedSplittableProcessElementInvoker<>(doFn, pipelineOptions.get(), new OutputWindowedValue<OutputT>() {

            @Override
            public void outputWindowedValue(OutputT output, Instant timestamp, Collection<? extends BoundedWindow> windows, PaneInfo pane) {
                output(mainOutputTag, WindowedValue.of(output, timestamp, windows, pane));
            }

            @Override
            public <AdditionalOutputT> void outputWindowedValue(TupleTag<AdditionalOutputT> tag, AdditionalOutputT output, Instant timestamp, Collection<? extends BoundedWindow> windows, PaneInfo pane) {
                output(tag, WindowedValue.of(output, timestamp, windows, pane));
            }
        }, sideInputReader, Executors.newSingleThreadScheduledExecutor(Executors.defaultThreadFactory()), 10000, Duration.standardSeconds(10)));
    }
}
Also used : RestrictionTracker(org.apache.beam.sdk.transforms.splittabledofn.RestrictionTracker) ApexStreamTuple(org.apache.beam.runners.apex.translation.utils.ApexStreamTuple) ProcessFn(org.apache.beam.runners.core.SplittableParDoViaKeyedWorkItems.ProcessFn) SideInputHandler(org.apache.beam.runners.core.SideInputHandler) TupleTag(org.apache.beam.sdk.values.TupleTag) SideInputReader(org.apache.beam.runners.core.SideInputReader) NullSideInputReader(org.apache.beam.runners.core.NullSideInputReader) NoOpStepContext(org.apache.beam.runners.apex.translation.utils.NoOpStepContext) PaneInfo(org.apache.beam.sdk.transforms.windowing.PaneInfo) StatefulDoFnRunner(org.apache.beam.runners.core.StatefulDoFnRunner) BoundedWindow(org.apache.beam.sdk.transforms.windowing.BoundedWindow) DefaultOutputPort(com.datatorrent.api.DefaultOutputPort) WindowedValueCoder(org.apache.beam.sdk.util.WindowedValue.WindowedValueCoder) KeyedWorkItemCoder(org.apache.beam.runners.core.KeyedWorkItemCoder) ListCoder(org.apache.beam.sdk.coders.ListCoder) KvCoder(org.apache.beam.sdk.coders.KvCoder) Coder(org.apache.beam.sdk.coders.Coder) StringUtf8Coder(org.apache.beam.sdk.coders.StringUtf8Coder) VoidCoder(org.apache.beam.sdk.coders.VoidCoder) OutputWindowedValue(org.apache.beam.runners.core.OutputWindowedValue) TimerInternalsFactory(org.apache.beam.runners.core.TimerInternalsFactory) Instant(org.joda.time.Instant) StateInternalsFactory(org.apache.beam.runners.core.StateInternalsFactory) Collection(java.util.Collection)

Aggregations

Coder (org.apache.beam.sdk.coders.Coder) 119
KvCoder (org.apache.beam.sdk.coders.KvCoder) 75
WindowedValue (org.apache.beam.sdk.util.WindowedValue) 55
StringUtf8Coder (org.apache.beam.sdk.coders.StringUtf8Coder) 44
Test (org.junit.Test) 43
HashMap (java.util.HashMap) 42
ArrayList (java.util.ArrayList) 38
Map (java.util.Map) 36
BoundedWindow (org.apache.beam.sdk.transforms.windowing.BoundedWindow) 35
List (java.util.List) 32
KV (org.apache.beam.sdk.values.KV) 30
RunnerApi (org.apache.beam.model.pipeline.v1.RunnerApi) 28
IterableCoder (org.apache.beam.sdk.coders.IterableCoder) 28
PCollection (org.apache.beam.sdk.values.PCollection) 28
TupleTag (org.apache.beam.sdk.values.TupleTag) 24
ByteString (org.apache.beam.vendor.grpc.v1p43p2.com.google.protobuf.ByteString) 23
IOException (java.io.IOException) 22
PCollectionView (org.apache.beam.sdk.values.PCollectionView) 22
Instant (org.joda.time.Instant) 21
WindowingStrategy (org.apache.beam.sdk.values.WindowingStrategy) 20