Search in sources:

Example 11 with PValue

use of org.apache.beam.sdk.values.PValue in project beam by apache.

The class StreamingTransformTranslator, method parDo.

/**
 * Creates the streaming evaluator for {@link ParDo.MultiOutput}.
 *
 * <p>The evaluator first hands the transform's {@link DoFn} to {@code rejectSplittable} and
 * {@code rejectStateAndTimers}. It then maps the partitions of every RDD supplied by
 * {@code transformToPair} through a {@link MultiDoFnFunction}, which yields one stream of
 * (tag, value) pairs, and finally registers one tag-filtered {@link UnboundedDataset} for each
 * output reported by the context.
 *
 * @return an evaluator that translates a multi-output ParDo onto a DStream
 */
private static <InputT, OutputT> TransformEvaluator<ParDo.MultiOutput<InputT, OutputT>> parDo() {
    return new TransformEvaluator<ParDo.MultiOutput<InputT, OutputT>>() {

        @Override
        public void evaluate(final ParDo.MultiOutput<InputT, OutputT> transform, final EvaluationContext context) {
            final DoFn<InputT, OutputT> doFn = transform.getFn();
            // Validate the DoFn before any translation work is performed.
            rejectSplittable(doFn);
            rejectStateAndTimers(doFn);

            // Everything captured by the anonymous function below has to be final.
            final SparkRuntimeContext runtimeContext = context.getRuntimeContext();
            final SparkPCollectionView pviews = context.getPViews();
            final WindowingStrategy<?, ?> windowingStrategy = context.getInput(transform).getWindowingStrategy();
            @SuppressWarnings("unchecked")
            UnboundedDataset<InputT> unboundedDataset = ((UnboundedDataset<InputT>) context.borrowDataset(transform));
            JavaDStream<WindowedValue<InputT>> dStream = unboundedDataset.getDStream();
            final String stepName = context.getCurrentTransform().getFullName();

            JavaPairDStream<TupleTag<?>, WindowedValue<?>> all = dStream.transformToPair(new Function<JavaRDD<WindowedValue<InputT>>, JavaPairRDD<TupleTag<?>, WindowedValue<?>>>() {

                @Override
                public JavaPairRDD<TupleTag<?>, WindowedValue<?>> call(JavaRDD<WindowedValue<InputT>> rdd) throws Exception {
                    // Accumulators and side inputs are looked up inside call(), i.e. once per RDD.
                    final Accumulator<NamedAggregators> aggAccum = AggregatorsAccumulator.getInstance();
                    final Accumulator<MetricsContainerStepMap> metricsAccum = MetricsAccumulator.getInstance();
                    final Map<TupleTag<?>, KV<WindowingStrategy<?, ?>, SideInputBroadcast<?>>> sideInputs = TranslationUtils.getSideInputs(transform.getSideInputs(), JavaSparkContext.fromSparkContext(rdd.context()), pviews);
                    return rdd.mapPartitionsToPair(new MultiDoFnFunction<>(aggAccum, metricsAccum, stepName, doFn, runtimeContext, transform.getMainOutputTag(), transform.getAdditionalOutputTags().getAll(), sideInputs, windowingStrategy, false));
                }
            });

            Map<TupleTag<?>, PValue> outputs = context.getOutputs(transform);
            if (outputs.size() > 1) {
                // cache the DStream if we're going to filter it more than once.
                all.cache();
            }
            for (Map.Entry<TupleTag<?>, PValue> output : outputs.entrySet()) {
                // Keep only the pairs whose tag belongs to this output.
                @SuppressWarnings("unchecked")
                JavaPairDStream<TupleTag<?>, WindowedValue<?>> filtered = all.filter(new TranslationUtils.TupleTagFilter(output.getKey()));
                // Object is the best we can do since different outputs can have different tags
                @SuppressWarnings("unchecked")
                JavaDStream<WindowedValue<Object>> values = (JavaDStream<WindowedValue<Object>>) (JavaDStream<?>) TranslationUtils.dStreamValues(filtered);
                context.putDataset(output.getValue(), new UnboundedDataset<>(values, unboundedDataset.getStreamSources()));
            }
        }

        @Override
        public String toNativeString() {
            return "mapPartitions(new <fn>())";
        }
    };
}
Also used : MetricsAccumulator(org.apache.beam.runners.spark.metrics.MetricsAccumulator) AggregatorsAccumulator(org.apache.beam.runners.spark.aggregators.AggregatorsAccumulator) Accumulator(org.apache.spark.Accumulator) TupleTag(org.apache.beam.sdk.values.TupleTag) JavaDStream(org.apache.spark.streaming.api.java.JavaDStream) WindowingStrategy(org.apache.beam.sdk.values.WindowingStrategy) WindowedValue(org.apache.beam.sdk.util.WindowedValue) SparkRuntimeContext(org.apache.beam.runners.spark.translation.SparkRuntimeContext) JavaPairRDD(org.apache.spark.api.java.JavaPairRDD) SideInputBroadcast(org.apache.beam.runners.spark.util.SideInputBroadcast) MultiDoFnFunction(org.apache.beam.runners.spark.translation.MultiDoFnFunction) PValue(org.apache.beam.sdk.values.PValue) TransformEvaluator(org.apache.beam.runners.spark.translation.TransformEvaluator) JavaRDD(org.apache.spark.api.java.JavaRDD) TranslationUtils(org.apache.beam.runners.spark.translation.TranslationUtils) ParDo(org.apache.beam.sdk.transforms.ParDo) EvaluationContext(org.apache.beam.runners.spark.translation.EvaluationContext) MetricsContainerStepMap(org.apache.beam.runners.core.metrics.MetricsContainerStepMap) Map(java.util.Map) ImmutableMap(com.google.common.collect.ImmutableMap) SparkPCollectionView(org.apache.beam.runners.spark.translation.SparkPCollectionView)

Example 12 with PValue

use of org.apache.beam.sdk.values.PValue in project beam by apache.

The class TransformTranslator, method flattenPColl.

/**
 * Creates the batch evaluator for {@link Flatten.PCollections}.
 *
 * <p>With no inputs the result is an empty RDD; otherwise the RDDs borrowed for every input are
 * combined through the Spark context's {@code union}. The result is stored as a
 * {@link BoundedDataset} for the transform.
 *
 * @return an evaluator that flattens bounded inputs into one RDD
 */
private static <T> TransformEvaluator<Flatten.PCollections<T>> flattenPColl() {
    return new TransformEvaluator<Flatten.PCollections<T>>() {

        @SuppressWarnings("unchecked")
        @Override
        public void evaluate(Flatten.PCollections<T> transform, EvaluationContext context) {
            Collection<PValue> inputs = context.getInputs(transform).values();
            JavaRDD<WindowedValue<T>> flattened;
            if (!inputs.isEmpty()) {
                // Generic array creation is not allowed, hence the raw array and the
                // method-level unchecked suppression.
                JavaRDD<WindowedValue<T>>[] inputRdds = new JavaRDD[inputs.size()];
                int slot = 0;
                for (PValue input : inputs) {
                    checkArgument(input instanceof PCollection, "Flatten had non-PCollection value in input: %s of type %s", input, input.getClass().getSimpleName());
                    BoundedDataset<T> dataset = (BoundedDataset<T>) context.borrowDataset(input);
                    inputRdds[slot] = dataset.getRDD();
                    slot++;
                }
                flattened = context.getSparkContext().union(inputRdds);
            } else {
                flattened = context.getSparkContext().emptyRDD();
            }
            context.putDataset(transform, new BoundedDataset<>(flattened));
        }

        @Override
        public String toNativeString() {
            return "sparkContext.union(...)";
        }
    };
}
Also used : Flatten(org.apache.beam.sdk.transforms.Flatten) PValue(org.apache.beam.sdk.values.PValue) JavaRDD(org.apache.spark.api.java.JavaRDD) PCollection(org.apache.beam.sdk.values.PCollection) WindowedValue(org.apache.beam.sdk.util.WindowedValue)

Example 13 with PValue

use of org.apache.beam.sdk.values.PValue in project beam by apache.

The class ParDoTranslator, method translate.

/**
 * Translates a {@link ParDo.MultiOutput} into an {@link ApexParDoOperator} and registers the
 * operator, its output ports, its input stream and (when present) its side inputs with the
 * translation context.
 *
 * @param transform the multi-output ParDo being translated
 * @param context the translation context that supplies inputs/outputs and receives the operator
 * @throws UnsupportedOperationException if the DoFn signature is splittable or declares state
 *     or timers
 */
@Override
public void translate(ParDo.MultiOutput<InputT, OutputT> transform, TranslationContext context) {
    DoFn<InputT, OutputT> doFn = transform.getFn();
    DoFnSignature signature = DoFnSignatures.getSignature(doFn.getClass());

    // Features this translator does not handle are rejected up front.
    if (signature.processElement().isSplittable()) {
        String message = String.format("%s does not support splittable DoFn: %s", ApexRunner.class.getSimpleName(), doFn);
        throw new UnsupportedOperationException(message);
    }
    if (!signature.stateDeclarations().isEmpty()) {
        String message = String.format("Found %s annotations on %s, but %s cannot yet be used with state in the %s.", DoFn.StateId.class.getSimpleName(), doFn.getClass().getName(), DoFn.class.getSimpleName(), ApexRunner.class.getSimpleName());
        throw new UnsupportedOperationException(message);
    }
    if (!signature.timerDeclarations().isEmpty()) {
        String message = String.format("Found %s annotations on %s, but %s cannot yet be used with timers in the %s.", DoFn.TimerId.class.getSimpleName(), doFn.getClass().getName(), DoFn.class.getSimpleName(), ApexRunner.class.getSimpleName());
        throw new UnsupportedOperationException(message);
    }

    Map<TupleTag<?>, PValue> outputs = context.getOutputs();
    PCollection<InputT> input = context.getInput();
    List<PCollectionView<?>> sideInputs = transform.getSideInputs();
    Coder<InputT> inputCoder = input.getCoder();
    WindowedValueCoder<InputT> wvInputCoder = FullWindowedValueCoder.of(inputCoder, input.getWindowingStrategy().getWindowFn().windowCoder());
    ApexParDoOperator<InputT, OutputT> operator = new ApexParDoOperator<>(context.getPipelineOptions(), doFn, transform.getMainOutputTag(), transform.getAdditionalOutputTags().getAll(), input.getWindowingStrategy(), sideInputs, wvInputCoder, context.getStateBackend());

    // Map every output PCollection to the operator port that produces it.
    Map<PCollection<?>, OutputPort<?>> ports = Maps.newHashMapWithExpectedSize(outputs.size());
    for (Entry<TupleTag<?>, PValue> output : outputs.entrySet()) {
        TupleTag<?> outputTag = output.getKey();
        PValue outputValue = output.getValue();
        checkArgument(outputValue instanceof PCollection, "%s %s outputs non-PCollection %s of type %s", ParDo.MultiOutput.class.getSimpleName(), context.getFullName(), outputValue, outputValue.getClass().getSimpleName());
        PCollection<?> pc = (PCollection<?>) outputValue;

        if (outputTag.equals(transform.getMainOutputTag())) {
            ports.put(pc, operator.output);
            continue;
        }

        // An additional output uses the port at the position of its tag in the additional-tags
        // list; a tag that is not in the list gets no port.
        List<TupleTag<?>> additionalTags = transform.getAdditionalOutputTags().getAll();
        for (int portIndex = 0; portIndex < additionalTags.size(); portIndex++) {
            if (additionalTags.get(portIndex).equals(outputTag)) {
                ports.put(pc, operator.additionalOutputPorts[portIndex]);
                break;
            }
        }
    }

    context.addOperator(operator, ports);
    context.addStream(context.getInput(), operator.input);
    if (!sideInputs.isEmpty()) {
        addSideInputs(operator.sideInput1, sideInputs, context);
    }
}
Also used : OutputPort(com.datatorrent.api.Operator.OutputPort) TupleTag(org.apache.beam.sdk.values.TupleTag) ApexParDoOperator(org.apache.beam.runners.apex.translation.operators.ApexParDoOperator) PValue(org.apache.beam.sdk.values.PValue) PCollection(org.apache.beam.sdk.values.PCollection) PCollectionView(org.apache.beam.sdk.values.PCollectionView) DoFn(org.apache.beam.sdk.transforms.DoFn) DoFnSignature(org.apache.beam.sdk.transforms.reflect.DoFnSignature)

Example 14 with PValue

use of org.apache.beam.sdk.values.PValue in project beam by apache.

The class PTransformTranslation, method toProto.

/**
 * Converts an {@link AppliedPTransform} into its runner API proto representation.
 *
 * <p>Inputs and outputs are registered as PCollections with {@code components}; the ids of the
 * given subtransforms are looked up (not created) there. The {@code appliedPTransform} itself
 * is not registered within the provided {@link SdkComponents}.
 *
 * @param appliedPTransform the transform application to translate
 * @param subtransforms the applications nested under {@code appliedPTransform}
 * @param components the component registry used for PCollection and subtransform ids
 * @return the built proto
 * @throws IOException declared by this method; which call raises it is not visible here
 */
static RunnerApi.PTransform toProto(AppliedPTransform<?, ?, ?> appliedPTransform, List<AppliedPTransform<?, ?, ?>> subtransforms, SdkComponents components) throws IOException {
    RunnerApi.PTransform.Builder proto = RunnerApi.PTransform.newBuilder();

    for (Map.Entry<TupleTag<?>, PValue> input : appliedPTransform.getInputs().entrySet()) {
        PValue inputValue = input.getValue();
        checkArgument(inputValue instanceof PCollection, "Unexpected input type %s", inputValue.getClass());
        proto.putInputs(toProto(input.getKey()), components.registerPCollection((PCollection<?>) inputValue));
    }

    for (Map.Entry<TupleTag<?>, PValue> output : appliedPTransform.getOutputs().entrySet()) {
        PValue outputValue = output.getValue();
        // TODO: Remove gating
        if (!(outputValue instanceof PCollection)) {
            continue;
        }
        // Redundant while the gate above exists; kept so the check survives its removal.
        checkArgument(outputValue instanceof PCollection, "Unexpected output type %s", outputValue.getClass());
        proto.putOutputs(toProto(output.getKey()), components.registerPCollection((PCollection<?>) outputValue));
    }

    for (AppliedPTransform<?, ?, ?> child : subtransforms) {
        proto.addSubtransforms(components.getExistingPTransformId(child));
    }

    proto.setUniqueName(appliedPTransform.getFullName());

    // TODO: Display Data
    PTransform<?, ?> transform = appliedPTransform.getTransform();
    if (!KNOWN_PAYLOAD_TRANSLATORS.containsKey(transform.getClass())) {
        // No translator is known for this transform class, so the spec is left unset.
        return proto.build();
    }
    FunctionSpec payload = KNOWN_PAYLOAD_TRANSLATORS.get(transform.getClass()).translate(appliedPTransform, components);
    proto.setSpec(payload);
    return proto.build();
}
Also used : PCollection(org.apache.beam.sdk.values.PCollection) FunctionSpec(org.apache.beam.sdk.common.runner.v1.RunnerApi.FunctionSpec) TupleTag(org.apache.beam.sdk.values.TupleTag) PValue(org.apache.beam.sdk.values.PValue) ImmutableMap(com.google.common.collect.ImmutableMap) Map(java.util.Map) PTransform(org.apache.beam.sdk.transforms.PTransform) AppliedPTransform(org.apache.beam.sdk.runners.AppliedPTransform)

Example 15 with PValue

use of org.apache.beam.sdk.values.PValue in project beam by apache.

The class PTransformMatchersTest, method flattenWithDuplicateInputsWithDuplicates.

/**
 * Checks that {@code PTransformMatchers.flattenWithDuplicateInputs()} matches a
 * {@link Flatten.PCollections} application whose input map holds the same {@link PCollection}
 * under two separately created tags.
 */
@Test
public void flattenWithDuplicateInputsWithDuplicates() {
    PCollection<Object> duplicate = PCollection.createPrimitiveOutputInternal(p, WindowingStrategy.globalDefault(), IsBounded.BOUNDED);
    // Declared with wildcards rather than as a raw type, so generic type checking stays enabled.
    AppliedPTransform<?, ?, ?> application =
        AppliedPTransform.<PCollectionList<Object>, PCollection<Object>, Flatten.PCollections<Object>>of(
            "Flatten",
            // Inputs: the same PCollection registered twice.
            ImmutableMap.<TupleTag<?>, PValue>builder()
                .put(new TupleTag<Object>(), duplicate)
                .put(new TupleTag<Object>(), duplicate)
                .build(),
            // Outputs: a single fresh PCollection.
            Collections.<TupleTag<?>, PValue>singletonMap(new TupleTag<Object>(), PCollection.createPrimitiveOutputInternal(p, WindowingStrategy.globalDefault(), IsBounded.BOUNDED)),
            Flatten.pCollections(),
            p);
    assertThat(PTransformMatchers.flattenWithDuplicateInputs().matches(application), is(true));
}
Also used : PCollectionList(org.apache.beam.sdk.values.PCollectionList) PCollection(org.apache.beam.sdk.values.PCollection) AppliedPTransform(org.apache.beam.sdk.runners.AppliedPTransform) TupleTag(org.apache.beam.sdk.values.TupleTag) PValue(org.apache.beam.sdk.values.PValue) Test(org.junit.Test)

Aggregations

PValue (org.apache.beam.sdk.values.PValue)28 TupleTag (org.apache.beam.sdk.values.TupleTag)13 PCollection (org.apache.beam.sdk.values.PCollection)12 Test (org.junit.Test)9 TaggedPValue (org.apache.beam.sdk.values.TaggedPValue)7 HashSet (java.util.HashSet)5 Map (java.util.Map)5 Node (org.apache.beam.sdk.runners.TransformHierarchy.Node)5 WindowedValue (org.apache.beam.sdk.util.WindowedValue)5 ImmutableMap (com.google.common.collect.ImmutableMap)4 ReplacementOutput (org.apache.beam.sdk.runners.PTransformOverrideFactory.ReplacementOutput)4 PTransform (org.apache.beam.sdk.transforms.PTransform)4 PCollectionTuple (org.apache.beam.sdk.values.PCollectionTuple)4 JavaRDD (org.apache.spark.api.java.JavaRDD)4 DoFn (org.apache.beam.sdk.transforms.DoFn)3 ParDo (org.apache.beam.sdk.transforms.ParDo)3 ImmutableList (com.google.common.collect.ImmutableList)2 HashMap (java.util.HashMap)2 MetricsContainerStepMap (org.apache.beam.runners.core.metrics.MetricsContainerStepMap)2 EvaluationContext (org.apache.beam.runners.spark.translation.EvaluationContext)2