Search in sources :

Example 21 with DoFnSignature

use of org.apache.beam.sdk.transforms.reflect.DoFnSignature in project beam by apache.

the class PTransformMatchers method splittableProcessKeyedBounded.

/**
 * A {@link PTransformMatcher} that matches a {@link SplittableParDo.ProcessKeyedElements} whose
 * {@link DoFn} is splittable, as signified by {@link ProcessElementMethod#isSplittable()}, and
 * whose signature reports {@code IsBounded.BOUNDED} for {@code isBoundedPerElement()}.
 *
 * <p>Any other transform type, a non-splittable {@link DoFn}, or a {@link DoFn} that is not
 * bounded per element does not match.
 */
public static PTransformMatcher splittableProcessKeyedBounded() {
    return new PTransformMatcher() {

        @Override
        public boolean matches(AppliedPTransform<?, ?, ?> application) {
            PTransform<?, ?> candidate = application.getTransform();
            // Only ProcessKeyedElements transforms are of interest here.
            if (!(candidate instanceof SplittableParDo.ProcessKeyedElements)) {
                return false;
            }
            DoFn<?, ?> keyedFn = ((SplittableParDo.ProcessKeyedElements) candidate).getFn();
            DoFnSignature fnSignature = DoFnSignatures.signatureForDoFn(keyedFn);
            if (!fnSignature.processElement().isSplittable()) {
                return false;
            }
            return fnSignature.isBoundedPerElement() == IsBounded.BOUNDED;
        }

        @Override
        public String toString() {
            return MoreObjects.toStringHelper("SplittableProcessKeyedBoundedMatcher").toString();
        }
    };
}
Also used : PTransformMatcher(org.apache.beam.sdk.runners.PTransformMatcher) AppliedPTransform(org.apache.beam.sdk.runners.AppliedPTransform) DoFnSignature(org.apache.beam.sdk.transforms.reflect.DoFnSignature)

Example 22 with DoFnSignature

use of org.apache.beam.sdk.transforms.reflect.DoFnSignature in project beam by apache.

the class DoFnOperator method earlyBindStateIfNeeded.

/**
 * Binds the state declared by {@code doFn} to a {@link FlinkStateInternals.EarlyBinder} ahead of
 * element processing.
 *
 * <p>Does nothing when either {@code keyCoder} or {@code doFn} is {@code null}. Otherwise every
 * {@link StateSpec} held in a state-declaration field of {@code doFn} is bound under its
 * declaration id, and, if {@code doFnRunner} is a {@link StatefulDoFnRunner}, each of its system
 * state tags is bound as well.
 *
 * @throws IllegalArgumentException if a state-declaration field cannot be read from {@code doFn}
 * @throws IllegalAccessException if a state-declaration field is not accessible
 */
private void earlyBindStateIfNeeded() throws IllegalArgumentException, IllegalAccessException {
    // Nothing to bind unless both a key coder and a DoFn are present.
    if (keyCoder == null || doFn == null) {
        return;
    }
    DoFnSignature signature = DoFnSignatures.getSignature(doFn.getClass());
    FlinkStateInternals.EarlyBinder earlyBinder = new FlinkStateInternals.EarlyBinder(getKeyedStateBackend(), serializedOptions);
    for (DoFnSignature.StateDeclaration declaration : signature.stateDeclarations().values()) {
        // The declaration being iterated already carries its field; looking it up again by id in
        // the same map is redundant.
        StateSpec<?> spec = (StateSpec<?>) declaration.field().get(doFn);
        spec.bind(declaration.id(), earlyBinder);
    }
    if (doFnRunner instanceof StatefulDoFnRunner) {
        ((StatefulDoFnRunner<InputT, OutputT, BoundedWindow>) doFnRunner).getSystemStateTags().forEach(tag -> tag.getSpec().bind(tag.getId(), earlyBinder));
    }
}
Also used : StateSpec(org.apache.beam.sdk.state.StateSpec) FlinkStateInternals(org.apache.beam.runners.flink.translation.wrappers.streaming.state.FlinkStateInternals) StatefulDoFnRunner(org.apache.beam.runners.core.StatefulDoFnRunner) BoundedWindow(org.apache.beam.sdk.transforms.windowing.BoundedWindow) DoFnSignature(org.apache.beam.sdk.transforms.reflect.DoFnSignature)

Example 23 with DoFnSignature

use of org.apache.beam.sdk.transforms.reflect.DoFnSignature in project beam by apache.

the class BatchStatefulParDoOverrides method verifyFnIsStateful.

/**
 * Asserts that the given {@link DoFn} declares state or timers.
 *
 * @param fn the function whose signature is inspected
 * @throws IllegalStateException (via {@code checkState}) if the signature uses neither state nor
 *     timers
 */
private static <InputT, OutputT> void verifyFnIsStateful(DoFn<InputT, OutputT> fn) {
    final DoFnSignature fnSignature = DoFnSignatures.getSignature(fn.getClass());
    final boolean usesStateOrTimers = fnSignature.usesState() || fnSignature.usesTimers();
    // Applying this override to a stateless, timerless DoFn would still be correct, just
    // pointless. The class is internal, so misuse is a programming error and crashing is fine.
    checkState(
        usesStateOrTimers,
        "%s used for %s that does not use state or timers.",
        BatchStatefulParDoOverrides.class.getSimpleName(),
        ParDo.class.getSimpleName());
}
Also used : ParDo(org.apache.beam.sdk.transforms.ParDo) DoFnSignature(org.apache.beam.sdk.transforms.reflect.DoFnSignature)

Example 24 with DoFnSignature

use of org.apache.beam.sdk.transforms.reflect.DoFnSignature in project beam by apache.

the class SamzaDoFnRunners method create.

/**
 * Creates the {@link DoFnRunner} used by the Java runner for the given {@link DoFn}.
 *
 * <p>A simple runner is always built first and, when {@code pipelineOptions.getEnableMetrics()}
 * is set, wrapped with {@link DoFnRunnerWithMetrics}. If {@code StateUtils.isStateful(doFn)}
 * holds, the result is further wrapped in a default stateful runner and a
 * {@link DoFnRunnerWithKeyedInternals}; otherwise state and timer internals are obtained for a
 * {@code null} key and the (possibly metrics-wrapped) simple runner is returned directly.
 */
public static <InT, FnOutT> DoFnRunner<InT, FnOutT> create(
    SamzaPipelineOptions pipelineOptions,
    DoFn<InT, FnOutT> doFn,
    WindowingStrategy<?, ?> windowingStrategy,
    String transformFullName,
    String transformId,
    Context context,
    TupleTag<FnOutT> mainOutputTag,
    SideInputHandler sideInputHandler,
    SamzaTimerInternalsFactory<?> timerInternalsFactory,
    Coder<?> keyCoder,
    DoFnRunners.OutputManager outputManager,
    Coder<InT> inputCoder,
    List<TupleTag<?>> sideOutputTags,
    Map<TupleTag<?>, Coder<?>> outputCoders,
    DoFnSchemaInformation doFnSchemaInformation,
    Map<String, PCollectionView<?>> sideInputMapping) {
    final DoFnSignature signature = DoFnSignatures.getSignature(doFn.getClass());
    final SamzaStoreStateInternals.Factory<?> stateInternalsFactory =
        SamzaStoreStateInternals.createStateInternalsFactory(
            transformId, keyCoder, context.getTaskContext(), pipelineOptions, signature);
    final SamzaExecutionContext executionContext =
        (SamzaExecutionContext) context.getApplicationContainerContext();

    // Stateful functions get keyed internals; everything else uses internals for a null key.
    final KeyedInternals keyedInternals;
    final StateInternals stateInternals;
    final TimerInternals timerInternals;
    if (!StateUtils.isStateful(doFn)) {
        keyedInternals = null;
        stateInternals = stateInternalsFactory.stateInternalsForKey(null);
        timerInternals = timerInternalsFactory.timerInternalsForKey(null);
    } else {
        keyedInternals = new KeyedInternals(stateInternalsFactory, timerInternalsFactory);
        stateInternals = keyedInternals.stateInternals();
        timerInternals = keyedInternals.timerInternals();
    }

    final StepContext stepContext = createStepContext(stateInternals, timerInternals);
    final DoFnRunner<InT, FnOutT> baseRunner =
        DoFnRunners.simpleRunner(
            pipelineOptions,
            doFn,
            sideInputHandler,
            outputManager,
            mainOutputTag,
            sideOutputTags,
            stepContext,
            inputCoder,
            outputCoders,
            windowingStrategy,
            doFnSchemaInformation,
            sideInputMapping);

    final DoFnRunner<InT, FnOutT> meteredRunner;
    if (pipelineOptions.getEnableMetrics()) {
        meteredRunner =
            DoFnRunnerWithMetrics.wrap(
                baseRunner, executionContext.getMetricsContainer(), transformFullName);
    } else {
        meteredRunner = baseRunner;
    }

    if (keyedInternals == null) {
        return meteredRunner;
    }

    final DoFnRunner<InT, FnOutT> statefulDoFnRunner =
        DoFnRunners.defaultStatefulDoFnRunner(
            doFn,
            inputCoder,
            meteredRunner,
            stepContext,
            windowingStrategy,
            new StatefulDoFnRunner.TimeInternalsCleanupTimer(timerInternals, windowingStrategy),
            createStateCleaner(doFn, windowingStrategy, keyedInternals.stateInternals()));
    return new DoFnRunnerWithKeyedInternals<>(statefulDoFnRunner, keyedInternals);
}
Also used : SamzaExecutionContext(org.apache.beam.runners.samza.SamzaExecutionContext) StepContext(org.apache.beam.runners.core.StepContext) TimerInternals(org.apache.beam.runners.core.TimerInternals) StateInternals(org.apache.beam.runners.core.StateInternals) StatefulDoFnRunner(org.apache.beam.runners.core.StatefulDoFnRunner) DoFnSignature(org.apache.beam.sdk.transforms.reflect.DoFnSignature)

Example 25 with DoFnSignature

use of org.apache.beam.sdk.transforms.reflect.DoFnSignature in project beam by apache.

the class TransformTranslator method parDo.

/**
 * Returns the evaluator that translates a {@link ParDo.MultiOutput} transform.
 *
 * <p>The evaluator rejects splittable functions, wraps the {@link DoFn} in a
 * {@link MultiDoFnFunction}, and applies it either through {@code statefulParDoTransform} (when
 * the signature declares state or timers) or through {@code mapPartitionsToPair}. When the
 * transform has more than one output the tagged RDD is persisted before being filtered per
 * output tag, and each filtered result is registered as a {@link BoundedDataset}.
 */
private static <InputT, OutputT> TransformEvaluator<ParDo.MultiOutput<InputT, OutputT>> parDo() {
    return new TransformEvaluator<ParDo.MultiOutput<InputT, OutputT>>() {

        @Override
        @SuppressWarnings("unchecked")
        public void evaluate(ParDo.MultiOutput<InputT, OutputT> transform, EvaluationContext context) {
            String stepName = context.getCurrentTransform().getFullName();
            DoFn<InputT, OutputT> doFn = transform.getFn();
            checkState(
                !DoFnSignatures.signatureForDoFn(doFn).processElement().isSplittable(),
                "Not expected to directly translate splittable DoFn, should have been overridden: %s",
                doFn);

            JavaRDD<WindowedValue<InputT>> inputRdd =
                ((BoundedDataset<InputT>) context.borrowDataset(transform)).getRDD();
            WindowingStrategy<?, ?> windowingStrategy =
                context.getInput(transform).getWindowingStrategy();
            MetricsContainerStepMapAccumulator metricsAccum = MetricsAccumulator.getInstance();
            Coder<InputT> inputCoder = (Coder<InputT>) context.getInput(transform).getCoder();
            Map<TupleTag<?>, Coder<?>> outputCoders = context.getOutputCoders();

            DoFnSignature signature = DoFnSignatures.getSignature(doFn.getClass());
            // A function counts as stateful when it declares any state cell or any timer.
            boolean stateful =
                !signature.stateDeclarations().isEmpty() || !signature.timerDeclarations().isEmpty();

            DoFnSchemaInformation doFnSchemaInformation =
                ParDoTranslation.getSchemaInformation(context.getCurrentTransform());
            Map<String, PCollectionView<?>> sideInputMapping =
                ParDoTranslation.getSideInputMapping(context.getCurrentTransform());
            MultiDoFnFunction<InputT, OutputT> multiDoFnFunction =
                new MultiDoFnFunction<>(
                    metricsAccum,
                    stepName,
                    doFn,
                    context.getSerializableOptions(),
                    transform.getMainOutputTag(),
                    transform.getAdditionalOutputTags().getAll(),
                    inputCoder,
                    outputCoders,
                    TranslationUtils.getSideInputs(transform.getSideInputs().values(), context),
                    windowingStrategy,
                    stateful,
                    doFnSchemaInformation,
                    sideInputMapping);

            JavaPairRDD<TupleTag<?>, WindowedValue<?>> tagged;
            if (!stateful) {
                tagged = inputRdd.mapPartitionsToPair(multiDoFnFunction);
            } else {
                // Based on the fact that the signature is stateful, DoFnSignatures ensures
                // that it is also keyed
                tagged =
                    statefulParDoTransform(
                        (KvCoder) context.getInput(transform).getCoder(),
                        windowingStrategy.getWindowFn().windowCoder(),
                        (JavaRDD) inputRdd,
                        getPartitioner(context),
                        (MultiDoFnFunction) multiDoFnFunction,
                        signature.processElement().requiresTimeSortedInput());
            }

            Map<TupleTag<?>, PCollection<?>> outputs = context.getOutputs(transform);
            if (outputs.size() > 1) {
                StorageLevel level = StorageLevel.fromString(context.storageLevel());
                if (canAvoidRddSerialization(level)) {
                    // if it is memory only reduce the overhead of moving to bytes
                    tagged = tagged.persist(level);
                } else {
                    // Caching can cause Serialization, we need to code to bytes
                    // more details in https://issues.apache.org/jira/browse/BEAM-2669
                    Map<TupleTag<?>, Coder<WindowedValue<?>>> coderMap =
                        TranslationUtils.getTupleTagCoders(outputs);
                    tagged =
                        tagged
                            .mapToPair(TranslationUtils.getTupleTagEncodeFunction(coderMap))
                            .persist(level)
                            .mapToPair(TranslationUtils.getTupleTagDecodeFunction(coderMap));
                }
            }

            for (Map.Entry<TupleTag<?>, PCollection<?>> taggedOutput : outputs.entrySet()) {
                JavaPairRDD<TupleTag<?>, WindowedValue<?>> onlyThisTag =
                    tagged.filter(new TranslationUtils.TupleTagFilter(taggedOutput.getKey()));
                // Object is the best we can do since different outputs can have different tags
                JavaRDD<WindowedValue<Object>> values =
                    (JavaRDD<WindowedValue<Object>>) (JavaRDD<?>) onlyThisTag.values();
                context.putDataset(taggedOutput.getValue(), new BoundedDataset<>(values));
            }
        }

        @Override
        public String toNativeString() {
            return "mapPartitions(new <fn>())";
        }
    };
}
Also used : TupleTag(org.apache.beam.sdk.values.TupleTag) WindowedValue(org.apache.beam.sdk.util.WindowedValue) StorageLevel(org.apache.spark.storage.StorageLevel) KvCoder(org.apache.beam.sdk.coders.KvCoder) Coder(org.apache.beam.sdk.coders.Coder) IterableCoder(org.apache.beam.sdk.coders.IterableCoder) ByteArrayCoder(org.apache.beam.sdk.coders.ByteArrayCoder) KvCoder(org.apache.beam.sdk.coders.KvCoder) MetricsContainerStepMapAccumulator(org.apache.beam.runners.spark.metrics.MetricsContainerStepMapAccumulator) JavaRDD(org.apache.spark.api.java.JavaRDD) PCollection(org.apache.beam.sdk.values.PCollection) PCollectionView(org.apache.beam.sdk.values.PCollectionView) DoFnSchemaInformation(org.apache.beam.sdk.transforms.DoFnSchemaInformation) ParDo(org.apache.beam.sdk.transforms.ParDo) SplittableParDo(org.apache.beam.runners.core.construction.SplittableParDo) Map(java.util.Map) HashMap(java.util.HashMap) DoFnSignature(org.apache.beam.sdk.transforms.reflect.DoFnSignature)

Aggregations

DoFnSignature (org.apache.beam.sdk.transforms.reflect.DoFnSignature): 26; AppliedPTransform (org.apache.beam.sdk.runners.AppliedPTransform): 8; PTransformMatcher (org.apache.beam.sdk.runners.PTransformMatcher): 8; HashMap (java.util.HashMap): 5; Map (java.util.Map): 5; Coder (org.apache.beam.sdk.coders.Coder): 4; StateSpec (org.apache.beam.sdk.state.StateSpec): 4; PCollectionView (org.apache.beam.sdk.values.PCollectionView): 4; KvCoder (org.apache.beam.sdk.coders.KvCoder): 3; TupleTag (org.apache.beam.sdk.values.TupleTag): 3; IOException (java.io.IOException): 2; StatefulDoFnRunner (org.apache.beam.runners.core.StatefulDoFnRunner): 2; SplittableParDo (org.apache.beam.runners.core.construction.SplittableParDo): 2; SamzaExecutionContext (org.apache.beam.runners.samza.SamzaExecutionContext): 2; SchemaCoder (org.apache.beam.sdk.schemas.SchemaCoder): 2; DoFn (org.apache.beam.sdk.transforms.DoFn): 2; DoFnSchemaInformation (org.apache.beam.sdk.transforms.DoFnSchemaInformation): 2; ParDo (org.apache.beam.sdk.transforms.ParDo): 2; BoundedWindow (org.apache.beam.sdk.transforms.windowing.BoundedWindow): 2; WindowedValue (org.apache.beam.sdk.util.WindowedValue): 2