Search in sources :

Example 11 with DoFnSchemaInformation

use of org.apache.beam.sdk.transforms.DoFnSchemaInformation in project beam by apache.

the class ParDoTranslatorBatch method translateTransform.

@Override
public void translateTransform(PTransform<PCollection<InputT>, PCollectionTuple> transform, AbstractTranslationContext context) {
    String stepName = context.getCurrentTransform().getFullName();
    // Check for not supported advanced features
    // TODO: add support of Splittable DoFn
    DoFn<InputT, OutputT> doFn = getDoFn(context);
    checkState(!DoFnSignatures.isSplittable(doFn), "Not expected to directly translate splittable DoFn, should have been overridden: %s", doFn);
    // TODO: add support of states and timers
    checkState(!DoFnSignatures.isStateful(doFn), "States and timers are not supported for the moment.");
    checkState(!DoFnSignatures.requiresTimeSortedInput(doFn), "@RequiresTimeSortedInput is not " + "supported for the moment");
    DoFnSchemaInformation doFnSchemaInformation = ParDoTranslation.getSchemaInformation(context.getCurrentTransform());
    // Init main variables
    PValue input = context.getInput();
    Dataset<WindowedValue<InputT>> inputDataSet = context.getDataset(input);
    Map<TupleTag<?>, PCollection<?>> outputs = context.getOutputs();
    TupleTag<?> mainOutputTag = getTupleTag(context);
    List<TupleTag<?>> outputTags = new ArrayList<>(outputs.keySet());
    WindowingStrategy<?, ?> windowingStrategy = ((PCollection<InputT>) input).getWindowingStrategy();
    Coder<InputT> inputCoder = ((PCollection<InputT>) input).getCoder();
    Coder<? extends BoundedWindow> windowCoder = windowingStrategy.getWindowFn().windowCoder();
    // construct a map from side input to WindowingStrategy so that
    // the DoFn runner can map main-input windows to side input windows
    List<PCollectionView<?>> sideInputs = getSideInputs(context);
    Map<PCollectionView<?>, WindowingStrategy<?, ?>> sideInputStrategies = new HashMap<>();
    for (PCollectionView<?> sideInput : sideInputs) {
        sideInputStrategies.put(sideInput, sideInput.getPCollection().getWindowingStrategy());
    }
    SideInputBroadcast broadcastStateData = createBroadcastSideInputs(sideInputs, context);
    Map<TupleTag<?>, Coder<?>> outputCoderMap = context.getOutputCoders();
    MetricsContainerStepMapAccumulator metricsAccum = MetricsAccumulator.getInstance();
    List<TupleTag<?>> additionalOutputTags = new ArrayList<>();
    for (TupleTag<?> tag : outputTags) {
        if (!tag.equals(mainOutputTag)) {
            additionalOutputTags.add(tag);
        }
    }
    Map<String, PCollectionView<?>> sideInputMapping = ParDoTranslation.getSideInputMapping(context.getCurrentTransform());
    @SuppressWarnings("unchecked") DoFnFunction<InputT, OutputT> doFnWrapper = new DoFnFunction(metricsAccum, stepName, doFn, windowingStrategy, sideInputStrategies, context.getSerializableOptions(), additionalOutputTags, mainOutputTag, inputCoder, outputCoderMap, broadcastStateData, doFnSchemaInformation, sideInputMapping);
    MultiOutputCoder multipleOutputCoder = MultiOutputCoder.of(SerializableCoder.of(TupleTag.class), outputCoderMap, windowCoder);
    Dataset<Tuple2<TupleTag<?>, WindowedValue<?>>> allOutputs = inputDataSet.mapPartitions(doFnWrapper, EncoderHelpers.fromBeamCoder(multipleOutputCoder));
    if (outputs.entrySet().size() > 1) {
        allOutputs.persist();
        for (Map.Entry<TupleTag<?>, PCollection<?>> output : outputs.entrySet()) {
            pruneOutputFilteredByTag(context, allOutputs, output, windowCoder);
        }
    } else {
        Coder<OutputT> outputCoder = ((PCollection<OutputT>) outputs.get(mainOutputTag)).getCoder();
        Coder<WindowedValue<?>> windowedValueCoder = (Coder<WindowedValue<?>>) (Coder<?>) WindowedValue.getFullCoder(outputCoder, windowCoder);
        Dataset<WindowedValue<?>> outputDataset = allOutputs.map((MapFunction<Tuple2<TupleTag<?>, WindowedValue<?>>, WindowedValue<?>>) value -> value._2, EncoderHelpers.fromBeamCoder(windowedValueCoder));
        context.putDatasetWildcard(outputs.entrySet().iterator().next().getValue(), outputDataset);
    }
}
Also used : SideInputBroadcast(org.apache.beam.runners.spark.structuredstreaming.translation.helpers.SideInputBroadcast) SerializableCoder(org.apache.beam.sdk.coders.SerializableCoder) WindowedValue(org.apache.beam.sdk.util.WindowedValue) Dataset(org.apache.spark.sql.Dataset) JavaSparkContext(org.apache.spark.api.java.JavaSparkContext) Coder(org.apache.beam.sdk.coders.Coder) HashMap(java.util.HashMap) AbstractTranslationContext(org.apache.beam.runners.spark.structuredstreaming.translation.AbstractTranslationContext) DoFnSchemaInformation(org.apache.beam.sdk.transforms.DoFnSchemaInformation) ArrayList(java.util.ArrayList) PTransform(org.apache.beam.sdk.transforms.PTransform) DoFnSignatures(org.apache.beam.sdk.transforms.reflect.DoFnSignatures) EncoderHelpers(org.apache.beam.runners.spark.structuredstreaming.translation.helpers.EncoderHelpers) TupleTag(org.apache.beam.sdk.values.TupleTag) Map(java.util.Map) PCollectionTuple(org.apache.beam.sdk.values.PCollectionTuple) MultiOutputCoder(org.apache.beam.runners.spark.structuredstreaming.translation.helpers.MultiOutputCoder) CoderHelpers(org.apache.beam.runners.spark.structuredstreaming.translation.helpers.CoderHelpers) MapFunction(org.apache.spark.api.java.function.MapFunction) ParDoTranslation(org.apache.beam.runners.core.construction.ParDoTranslation) DoFn(org.apache.beam.sdk.transforms.DoFn) MetricsAccumulator(org.apache.beam.runners.spark.structuredstreaming.metrics.MetricsAccumulator) IOException(java.io.IOException) PCollection(org.apache.beam.sdk.values.PCollection) Tuple2(scala.Tuple2) List(java.util.List) PValue(org.apache.beam.sdk.values.PValue) Preconditions.checkState(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.base.Preconditions.checkState) PCollectionView(org.apache.beam.sdk.values.PCollectionView) TransformTranslator(org.apache.beam.runners.spark.structuredstreaming.translation.TransformTranslator) BoundedWindow(org.apache.beam.sdk.transforms.windowing.BoundedWindow) MetricsContainerStepMapAccumulator(org.apache.beam.runners.spark.structuredstreaming.metrics.MetricsContainerStepMapAccumulator) WindowingStrategy(org.apache.beam.sdk.values.WindowingStrategy) FilterFunction(org.apache.spark.api.java.function.FilterFunction) HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) TupleTag(org.apache.beam.sdk.values.TupleTag) WindowingStrategy(org.apache.beam.sdk.values.WindowingStrategy) WindowedValue(org.apache.beam.sdk.util.WindowedValue) SerializableCoder(org.apache.beam.sdk.coders.SerializableCoder) Coder(org.apache.beam.sdk.coders.Coder) MultiOutputCoder(org.apache.beam.runners.spark.structuredstreaming.translation.helpers.MultiOutputCoder) SideInputBroadcast(org.apache.beam.runners.spark.structuredstreaming.translation.helpers.SideInputBroadcast) MultiOutputCoder(org.apache.beam.runners.spark.structuredstreaming.translation.helpers.MultiOutputCoder) PValue(org.apache.beam.sdk.values.PValue) MetricsContainerStepMapAccumulator(org.apache.beam.runners.spark.structuredstreaming.metrics.MetricsContainerStepMapAccumulator) PCollection(org.apache.beam.sdk.values.PCollection) PCollectionView(org.apache.beam.sdk.values.PCollectionView) DoFnSchemaInformation(org.apache.beam.sdk.transforms.DoFnSchemaInformation) Tuple2(scala.Tuple2) HashMap(java.util.HashMap) Map(java.util.Map)

Example 12 with DoFnSchemaInformation

use of org.apache.beam.sdk.transforms.DoFnSchemaInformation in project beam by apache.

the class TransformTranslator method parDo.

private static <InputT, OutputT> TransformEvaluator<ParDo.MultiOutput<InputT, OutputT>> parDo() {
    return new TransformEvaluator<ParDo.MultiOutput<InputT, OutputT>>() {

        @Override
        @SuppressWarnings("unchecked")
        public void evaluate(ParDo.MultiOutput<InputT, OutputT> transform, EvaluationContext context) {
            String stepName = context.getCurrentTransform().getFullName();
            DoFn<InputT, OutputT> doFn = transform.getFn();
            checkState(!DoFnSignatures.signatureForDoFn(doFn).processElement().isSplittable(), "Not expected to directly translate splittable DoFn, should have been overridden: %s", doFn);
            JavaRDD<WindowedValue<InputT>> inRDD = ((BoundedDataset<InputT>) context.borrowDataset(transform)).getRDD();
            WindowingStrategy<?, ?> windowingStrategy = context.getInput(transform).getWindowingStrategy();
            MetricsContainerStepMapAccumulator metricsAccum = MetricsAccumulator.getInstance();
            Coder<InputT> inputCoder = (Coder<InputT>) context.getInput(transform).getCoder();
            Map<TupleTag<?>, Coder<?>> outputCoders = context.getOutputCoders();
            JavaPairRDD<TupleTag<?>, WindowedValue<?>> all;
            DoFnSignature signature = DoFnSignatures.getSignature(transform.getFn().getClass());
            boolean stateful = signature.stateDeclarations().size() > 0 || signature.timerDeclarations().size() > 0;
            DoFnSchemaInformation doFnSchemaInformation;
            doFnSchemaInformation = ParDoTranslation.getSchemaInformation(context.getCurrentTransform());
            Map<String, PCollectionView<?>> sideInputMapping = ParDoTranslation.getSideInputMapping(context.getCurrentTransform());
            MultiDoFnFunction<InputT, OutputT> multiDoFnFunction = new MultiDoFnFunction<>(metricsAccum, stepName, doFn, context.getSerializableOptions(), transform.getMainOutputTag(), transform.getAdditionalOutputTags().getAll(), inputCoder, outputCoders, TranslationUtils.getSideInputs(transform.getSideInputs().values(), context), windowingStrategy, stateful, doFnSchemaInformation, sideInputMapping);
            if (stateful) {
                // Based on the fact that the signature is stateful, DoFnSignatures ensures
                // that it is also keyed
                all = statefulParDoTransform((KvCoder) context.getInput(transform).getCoder(), windowingStrategy.getWindowFn().windowCoder(), (JavaRDD) inRDD, getPartitioner(context), (MultiDoFnFunction) multiDoFnFunction, signature.processElement().requiresTimeSortedInput());
            } else {
                all = inRDD.mapPartitionsToPair(multiDoFnFunction);
            }
            Map<TupleTag<?>, PCollection<?>> outputs = context.getOutputs(transform);
            if (outputs.size() > 1) {
                StorageLevel level = StorageLevel.fromString(context.storageLevel());
                if (canAvoidRddSerialization(level)) {
                    // if it is memory only reduce the overhead of moving to bytes
                    all = all.persist(level);
                } else {
                    // Caching can cause Serialization, we need to code to bytes
                    // more details in https://issues.apache.org/jira/browse/BEAM-2669
                    Map<TupleTag<?>, Coder<WindowedValue<?>>> coderMap = TranslationUtils.getTupleTagCoders(outputs);
                    all = all.mapToPair(TranslationUtils.getTupleTagEncodeFunction(coderMap)).persist(level).mapToPair(TranslationUtils.getTupleTagDecodeFunction(coderMap));
                }
            }
            for (Map.Entry<TupleTag<?>, PCollection<?>> output : outputs.entrySet()) {
                JavaPairRDD<TupleTag<?>, WindowedValue<?>> filtered = all.filter(new TranslationUtils.TupleTagFilter(output.getKey()));
                // Object is the best we can do since different outputs can have different tags
                JavaRDD<WindowedValue<Object>> values = (JavaRDD<WindowedValue<Object>>) (JavaRDD<?>) filtered.values();
                context.putDataset(output.getValue(), new BoundedDataset<>(values));
            }
        }

        @Override
        public String toNativeString() {
            return "mapPartitions(new <fn>())";
        }
    };
}
Also used : TupleTag(org.apache.beam.sdk.values.TupleTag) WindowedValue(org.apache.beam.sdk.util.WindowedValue) StorageLevel(org.apache.spark.storage.StorageLevel) KvCoder(org.apache.beam.sdk.coders.KvCoder) Coder(org.apache.beam.sdk.coders.Coder) IterableCoder(org.apache.beam.sdk.coders.IterableCoder) ByteArrayCoder(org.apache.beam.sdk.coders.ByteArrayCoder) KvCoder(org.apache.beam.sdk.coders.KvCoder) MetricsContainerStepMapAccumulator(org.apache.beam.runners.spark.metrics.MetricsContainerStepMapAccumulator) JavaRDD(org.apache.spark.api.java.JavaRDD) PCollection(org.apache.beam.sdk.values.PCollection) PCollectionView(org.apache.beam.sdk.values.PCollectionView) DoFnSchemaInformation(org.apache.beam.sdk.transforms.DoFnSchemaInformation) ParDo(org.apache.beam.sdk.transforms.ParDo) SplittableParDo(org.apache.beam.runners.core.construction.SplittableParDo) Map(java.util.Map) HashMap(java.util.HashMap) DoFnSignature(org.apache.beam.sdk.transforms.reflect.DoFnSignature)

Example 13 with DoFnSchemaInformation

use of org.apache.beam.sdk.transforms.DoFnSchemaInformation in project beam by apache.

the class ParDoMultiOutputTranslatorBatch method translateNode.

@Override
public void translateNode(ParDo.MultiOutput<InputT, OutputT> transform, Twister2BatchTranslationContext context) {
    DoFn<InputT, OutputT> doFn;
    doFn = transform.getFn();
    if (DoFnSignatures.signatureForDoFn(doFn).processElement().isSplittable()) {
        throw new UnsupportedOperationException(String.format("Not expected to directly translate splittable DoFn, should have been overridden: %s", doFn));
    }
    BatchTSetImpl<WindowedValue<InputT>> inputTTSet = context.getInputDataSet(context.getInput(transform));
    WindowingStrategy<?, ?> windowingStrategy = context.getInput(transform).getWindowingStrategy();
    Coder<InputT> inputCoder = (Coder<InputT>) context.getInput(transform).getCoder();
    Map<String, PCollectionView<?>> sideInputMapping;
    Map<TupleTag<?>, PCollection<?>> outputs = context.getOutputs();
    Map<TupleTag<?>, Coder<?>> outputCoders = context.getOutputCoders();
    // DoFnSignature signature = DoFnSignatures.getSignature(transform.getFn().getClass());
    DoFnSchemaInformation doFnSchemaInformation;
    doFnSchemaInformation = ParDoTranslation.getSchemaInformation(context.getCurrentTransform());
    sideInputMapping = ParDoTranslation.getSideInputMapping(context.getCurrentTransform());
    TupleTag<OutputT> mainOutput = transform.getMainOutputTag();
    List<TupleTag<?>> additionalOutputTags = new ArrayList<>(transform.getAdditionalOutputTags().getAll());
    Map<String, PCollectionView<?>> sideInputs = transform.getSideInputs();
    // TODO : note change from List to map in sideinputs
    // construct a map from side input to WindowingStrategy so that
    // the DoFn runner can map main-input windows to side input windows
    Map<PCollectionView<?>, WindowingStrategy<?, ?>> sideInputStrategies = new HashMap<>();
    for (PCollectionView<?> sideInput : sideInputs.values()) {
        sideInputStrategies.put(sideInput, sideInput.getWindowingStrategyInternal());
    }
    TupleTag<?> mainOutputTag;
    try {
        mainOutputTag = ParDoTranslation.getMainOutputTag(context.getCurrentTransform());
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
    Map<TupleTag<?>, Integer> outputMap = Maps.newHashMap();
    outputMap.put(mainOutputTag, 0);
    int count = 1;
    for (TupleTag<?> tag : outputs.keySet()) {
        if (!outputMap.containsKey(tag)) {
            outputMap.put(tag, count++);
        }
    }
    ComputeTSet<RawUnionValue, Iterator<WindowedValue<InputT>>> outputTSet = inputTTSet.direct().<RawUnionValue>compute(new DoFnFunction<OutputT, InputT>(context, doFn, inputCoder, outputCoders, additionalOutputTags, windowingStrategy, sideInputStrategies, mainOutput, doFnSchemaInformation, outputMap, sideInputMapping));
    for (Map.Entry<TupleTag<?>, PCollection<?>> output : outputs.entrySet()) {
        ComputeTSet<WindowedValue<OutputT>, Iterator<RawUnionValue>> tempTSet = outputTSet.direct().compute(new OutputTagFilter(outputMap.get(output.getKey())));
        context.setOutputDataSet((PCollection) output.getValue(), tempTSet);
    }
}
Also used : HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) TupleTag(org.apache.beam.sdk.values.TupleTag) WindowingStrategy(org.apache.beam.sdk.values.WindowingStrategy) WindowedValue(org.apache.beam.sdk.util.WindowedValue) Iterator(java.util.Iterator) OutputTagFilter(org.apache.beam.runners.twister2.translators.functions.OutputTagFilter) Coder(org.apache.beam.sdk.coders.Coder) RawUnionValue(org.apache.beam.sdk.transforms.join.RawUnionValue) IOException(java.io.IOException) PCollection(org.apache.beam.sdk.values.PCollection) PCollectionView(org.apache.beam.sdk.values.PCollectionView) DoFnSchemaInformation(org.apache.beam.sdk.transforms.DoFnSchemaInformation) HashMap(java.util.HashMap) Map(java.util.Map)

Example 14 with DoFnSchemaInformation

use of org.apache.beam.sdk.transforms.DoFnSchemaInformation in project twister2 by DSC-SPIDAL.

the class ParDoMultiOutputTranslatorBatch method translateNode.

@Override
public void translateNode(ParDo.MultiOutput<IT, OT> transform, Twister2BatchTranslationContext context) {
    DoFn<IT, OT> doFn;
    doFn = transform.getFn();
    BatchTSetImpl<WindowedValue<IT>> inputTTSet = context.getInputDataSet(context.getInput(transform));
    WindowingStrategy<?, ?> windowingStrategy = context.getInput(transform).getWindowingStrategy();
    Coder<IT> inputCoder = (Coder<IT>) context.getInput(transform).getCoder();
    Map<TupleTag<?>, PValue> outputs = context.getOutputs();
    Map<TupleTag<?>, Coder<?>> outputCoders = context.getOutputCoders();
    DoFnSignature signature = DoFnSignatures.getSignature(transform.getFn().getClass());
    DoFnSchemaInformation doFnSchemaInformation;
    doFnSchemaInformation = ParDoTranslation.getSchemaInformation(context.getCurrentTransform());
    TupleTag<OT> mainOutput = transform.getMainOutputTag();
    List<TupleTag<?>> additionalOutputTags = new ArrayList<>(outputs.size() - 1);
    Collection<PCollectionView<?>> sideInputs = transform.getSideInputs();
    // construct a map from side input to WindowingStrategy so that
    // the DoFn runner can map main-input windows to side input windows
    Map<PCollectionView<?>, WindowingStrategy<?, ?>> sideInputStrategies = new HashMap<>();
    for (PCollectionView<?> sideInput : sideInputs) {
        sideInputStrategies.put(sideInput, sideInput.getWindowingStrategyInternal());
    }
    TupleTag<?> mainOutputTag;
    try {
        mainOutputTag = ParDoTranslation.getMainOutputTag(context.getCurrentTransform());
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
    Map<TupleTag<?>, Integer> outputMap = Maps.newHashMap();
    // put the main output at index 0, FlinkMultiOutputDoFnFunction  expects this
    outputMap.put(mainOutputTag, 0);
    int count = 1;
    for (TupleTag<?> tag : outputs.keySet()) {
        if (!outputMap.containsKey(tag)) {
            outputMap.put(tag, count++);
        }
    }
    ComputeTSet<RawUnionValue> outputTSet = inputTTSet.direct().<RawUnionValue>compute(new DoFnFunction<OT, IT>(context, doFn, inputCoder, outputCoders, additionalOutputTags, windowingStrategy, sideInputStrategies, mainOutput, doFnSchemaInformation, outputMap));
    for (Map.Entry<TupleTag<?>, PValue> output : outputs.entrySet()) {
        ComputeTSet<WindowedValue<OT>> tempTSet = outputTSet.direct().compute(new OutputTagFilter<>(outputMap.get(output.getKey())));
        context.setOutputDataSet((PCollection) output.getValue(), tempTSet);
    }
}
Also used : HashMap(java.util.HashMap) ArrayList(java.util.ArrayList) TupleTag(org.apache.beam.sdk.values.TupleTag) WindowingStrategy(org.apache.beam.sdk.values.WindowingStrategy) WindowedValue(org.apache.beam.sdk.util.WindowedValue) Coder(org.apache.beam.sdk.coders.Coder) RawUnionValue(org.apache.beam.sdk.transforms.join.RawUnionValue) IOException(java.io.IOException) PValue(org.apache.beam.sdk.values.PValue) PCollectionView(org.apache.beam.sdk.values.PCollectionView) DoFnSchemaInformation(org.apache.beam.sdk.transforms.DoFnSchemaInformation) HashMap(java.util.HashMap) Map(java.util.Map) DoFnSignature(org.apache.beam.sdk.transforms.reflect.DoFnSignature)

Aggregations

DoFnSchemaInformation (org.apache.beam.sdk.transforms.DoFnSchemaInformation)14 PCollection (org.apache.beam.sdk.values.PCollection)12 HashMap (java.util.HashMap)10 Map (java.util.Map)10 Coder (org.apache.beam.sdk.coders.Coder)10 PCollectionView (org.apache.beam.sdk.values.PCollectionView)10 TupleTag (org.apache.beam.sdk.values.TupleTag)10 ParDo (org.apache.beam.sdk.transforms.ParDo)9 WindowedValue (org.apache.beam.sdk.util.WindowedValue)8 IOException (java.io.IOException)7 ArrayList (java.util.ArrayList)7 WindowingStrategy (org.apache.beam.sdk.values.WindowingStrategy)7 List (java.util.List)6 Collectors (java.util.stream.Collectors)5 KvCoder (org.apache.beam.sdk.coders.KvCoder)5 DoFn (org.apache.beam.sdk.transforms.DoFn)5 RawUnionValue (org.apache.beam.sdk.transforms.join.RawUnionValue)5 DoFnSignature (org.apache.beam.sdk.transforms.reflect.DoFnSignature)5 DoFnSignatures (org.apache.beam.sdk.transforms.reflect.DoFnSignatures)5 BoundedWindow (org.apache.beam.sdk.transforms.windowing.BoundedWindow)5