Search in sources :

Example 76 with PCollection

use of org.apache.beam.sdk.values.PCollection in project beam by apache.

the class ParDoTranslation method translateParDo.

/**
 * Translates an applied multi-output {@link ParDo} into a {@link ParDoPayload}.
 *
 * <p>The main input is located by taking the ids of all input tags of the application and
 * removing the ids of the transform's side-input tags; the single remaining id names the main
 * input. The work is then delegated to the five-argument {@code translateParDo} overload.
 *
 * @param appliedPTransform the application of the {@code ParDo.MultiOutput} to translate
 * @param components passed through unchanged to the delegate overload
 * @return whatever the delegate overload returns
 * @throws IOException declared by this method; nothing in the body throws it directly
 */
public static ParDoPayload translateParDo(AppliedPTransform<?, ?, ParDo.MultiOutput<?, ?>> appliedPTransform, SdkComponents components) throws IOException {
    final ParDo.MultiOutput<?, ?> multiOutput = appliedPTransform.getTransform();

    // Ids of every side-input tag declared on the transform.
    Set<String> sideInputIds =
        multiOutput.getSideInputs().values().stream()
            .map(view -> view.getTagInternal().getId())
            .collect(Collectors.toSet());
    // Ids of every input tag (main and side) of the application.
    Set<String> inputIds =
        appliedPTransform.getInputs().keySet().stream()
            .map(TupleTag::getId)
            .collect(Collectors.toSet());

    // Whatever is left after removing the side inputs must be the one main input.
    String mainInputId = Iterables.getOnlyElement(Sets.difference(inputIds, sideInputIds));
    PCollection<?> mainInput = (PCollection<?>) appliedPTransform.getInputs().get(new TupleTag<>(mainInputId));

    final DoFn<?, ?> fn = multiOutput.getFn();
    final DoFnSchemaInformation schemaInformation = ParDo.getDoFnSchemaInformation(fn, mainInput);
    final Pipeline pipeline = appliedPTransform.getPipeline();
    // The raw cast is kept from the original call site.
    return translateParDo((ParDo.MultiOutput) multiOutput, mainInput, schemaInformation, pipeline, components);
}
Also used : StateSpec(org.apache.beam.sdk.state.StateSpec) StateDeclaration(org.apache.beam.sdk.transforms.reflect.DoFnSignature.StateDeclaration) DoFnSignatures.getTimerSpecOrThrow(org.apache.beam.sdk.transforms.reflect.DoFnSignatures.getTimerSpecOrThrow) Parameter(org.apache.beam.sdk.transforms.reflect.DoFnSignature.Parameter) DoFnSchemaInformation(org.apache.beam.sdk.transforms.DoFnSchemaInformation) DoFnSignatures(org.apache.beam.sdk.transforms.reflect.DoFnSignatures) Sets(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.Sets) FunctionSpec(org.apache.beam.model.pipeline.v1.RunnerApi.FunctionSpec) Map(java.util.Map) Iterables(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.Iterables) Components(org.apache.beam.model.pipeline.v1.RunnerApi.Components) DoFnInvoker(org.apache.beam.sdk.transforms.reflect.DoFnInvoker) SPLITTABLE_PROCESS_ELEMENTS_URN(org.apache.beam.runners.core.construction.PTransformTranslation.SPLITTABLE_PROCESS_ELEMENTS_URN) SPLITTABLE_SPLIT_AND_SIZE_RESTRICTIONS_URN(org.apache.beam.runners.core.construction.PTransformTranslation.SPLITTABLE_SPLIT_AND_SIZE_RESTRICTIONS_URN) KvCoder(org.apache.beam.sdk.coders.KvCoder) PAR_DO_TRANSFORM_URN(org.apache.beam.runners.core.construction.PTransformTranslation.PAR_DO_TRANSFORM_URN) Set(java.util.Set) DoFnWithExecutionInformation(org.apache.beam.sdk.util.DoFnWithExecutionInformation) StandardUserStateTypes(org.apache.beam.model.pipeline.v1.RunnerApi.StandardUserStateTypes) Collectors(java.util.stream.Collectors) SPLITTABLE_PROCESS_SIZED_ELEMENTS_AND_RESTRICTIONS_URN(org.apache.beam.runners.core.construction.PTransformTranslation.SPLITTABLE_PROCESS_SIZED_ELEMENTS_AND_RESTRICTIONS_URN) TransformTranslator(org.apache.beam.runners.core.construction.PTransformTranslation.TransformTranslator) List(java.util.List) StandardRequirements(org.apache.beam.model.pipeline.v1.RunnerApi.StandardRequirements) ParDo(org.apache.beam.sdk.transforms.ParDo) 
SerializableUtils(org.apache.beam.sdk.util.SerializableUtils) VisibleForTesting(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.annotations.VisibleForTesting) AutoValue(com.google.auto.value.AutoValue) DoFnInvokers(org.apache.beam.sdk.transforms.reflect.DoFnInvokers) DoFnSignatures.getStateSpecOrThrow(org.apache.beam.sdk.transforms.reflect.DoFnSignatures.getStateSpecOrThrow) SideInput(org.apache.beam.model.pipeline.v1.RunnerApi.SideInput) Preconditions.checkNotNull(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.base.Preconditions.checkNotNull) Combine(org.apache.beam.sdk.transforms.Combine) Coder(org.apache.beam.sdk.coders.Coder) HashMap(java.util.HashMap) TupleTagList(org.apache.beam.sdk.values.TupleTagList) ArrayList(java.util.ArrayList) PTransform(org.apache.beam.sdk.transforms.PTransform) SPLITTABLE_PAIR_WITH_RESTRICTION_URN(org.apache.beam.runners.core.construction.PTransformTranslation.SPLITTABLE_PAIR_WITH_RESTRICTION_URN) WindowMappingFn(org.apache.beam.sdk.transforms.windowing.WindowMappingFn) ParDoPayload(org.apache.beam.model.pipeline.v1.RunnerApi.ParDoPayload) MultiOutput(org.apache.beam.sdk.transforms.ParDo.MultiOutput) TimerSpec(org.apache.beam.sdk.state.TimerSpec) ViewFn(org.apache.beam.sdk.transforms.ViewFn) TupleTag(org.apache.beam.sdk.values.TupleTag) Preconditions.checkArgument(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.base.Preconditions.checkArgument) ByteString(org.apache.beam.vendor.grpc.v1p43p2.com.google.protobuf.ByteString) Pipeline(org.apache.beam.sdk.Pipeline) Nullable(javax.annotation.Nullable) AppliedPTransform(org.apache.beam.sdk.runners.AppliedPTransform) InvalidProtocolBufferException(org.apache.beam.vendor.grpc.v1p43p2.com.google.protobuf.InvalidProtocolBufferException) RunnerApi(org.apache.beam.model.pipeline.v1.RunnerApi) DoFn(org.apache.beam.sdk.transforms.DoFn) DoFnSignature(org.apache.beam.sdk.transforms.reflect.DoFnSignature) IOException(java.io.IOException) 
PCollection(org.apache.beam.sdk.values.PCollection) TransformPayloadTranslator(org.apache.beam.runners.core.construction.PTransformTranslation.TransformPayloadTranslator) StateSpecs(org.apache.beam.sdk.state.StateSpecs) TimerDeclaration(org.apache.beam.sdk.transforms.reflect.DoFnSignature.TimerDeclaration) Preconditions.checkState(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.base.Preconditions.checkState) PCollectionView(org.apache.beam.sdk.values.PCollectionView) BoundedWindow(org.apache.beam.sdk.transforms.windowing.BoundedWindow) SPLITTABLE_TRUNCATE_SIZED_RESTRICTION_URN(org.apache.beam.runners.core.construction.PTransformTranslation.SPLITTABLE_TRUNCATE_SIZED_RESTRICTION_URN) Collections(java.util.Collections) TimeDomain(org.apache.beam.sdk.state.TimeDomain) BeamUrns.getUrn(org.apache.beam.runners.core.construction.BeamUrns.getUrn) PCollection(org.apache.beam.sdk.values.PCollection) DoFnSchemaInformation(org.apache.beam.sdk.transforms.DoFnSchemaInformation) ParDo(org.apache.beam.sdk.transforms.ParDo) TupleTag(org.apache.beam.sdk.values.TupleTag) ByteString(org.apache.beam.vendor.grpc.v1p43p2.com.google.protobuf.ByteString) Pipeline(org.apache.beam.sdk.Pipeline)

Example 77 with PCollection

use of org.apache.beam.sdk.values.PCollection in project beam by apache.

the class ReplacementOutputs method tagged.

/**
 * Pairs every output of {@code replacement} with the output of {@code original} that carries the
 * same {@link TupleTag}.
 *
 * <p>Both directions are validated with {@code checkArgument}: each replacement tag must exist
 * among the originals, and no original tag may be left without a replacement.
 *
 * @param original the original outputs, keyed by tag
 * @param replacement the replacement output, expanded via {@code PValues.expandOutput}
 * @return a map from each replacement {@link PCollection} to its {@link ReplacementOutput}
 */
public static Map<PCollection<?>, ReplacementOutput> tagged(Map<TupleTag<?>, PCollection<?>> original, POutput replacement) {
    // Index the original outputs as tagged values so they can be looked up by tag.
    Map<TupleTag<?>, TaggedPValue> originalsByTag = new HashMap<>();
    for (Map.Entry<TupleTag<?>, PCollection<?>> entry : original.entrySet()) {
        TupleTag<?> tag = entry.getKey();
        originalsByTag.put(tag, TaggedPValue.of(tag, entry.getValue()));
    }

    ImmutableMap.Builder<PCollection<?>, ReplacementOutput> pairs = ImmutableMap.builder();
    // Tracks originals that have not been matched yet; must be empty at the end.
    Map<TupleTag<?>, PCollection<?>> unmatchedOriginals = new HashMap<>(original);
    Map<TupleTag<?>, PCollection<?>> replacementsByTag = PValues.expandOutput(replacement);

    for (Map.Entry<TupleTag<?>, PCollection<?>> entry : replacementsByTag.entrySet()) {
        TupleTag<?> tag = entry.getKey();
        PCollection<?> replacementValue = entry.getValue();
        TaggedPValue originalValue = originalsByTag.get(tag);
        checkArgument(originalValue != null, "Missing original output for Tag %s and Value %s Between original %s and replacement %s", tag, replacementValue, original, replacement.expand());
        pairs.put(replacementValue, ReplacementOutput.of(originalValue, TaggedPValue.of(tag, replacementValue)));
        unmatchedOriginals.remove(tag);
    }

    checkArgument(unmatchedOriginals.isEmpty(), "Missing replacement for tagged values %s. Replacement was: %s", unmatchedOriginals, replacementsByTag);
    return pairs.build();
}
Also used : PCollection(org.apache.beam.sdk.values.PCollection) ReplacementOutput(org.apache.beam.sdk.runners.PTransformOverrideFactory.ReplacementOutput) HashMap(java.util.HashMap) TaggedPValue(org.apache.beam.sdk.values.TaggedPValue) TupleTag(org.apache.beam.sdk.values.TupleTag) ImmutableMap(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableMap) Map(java.util.Map)

Example 78 with PCollection

use of org.apache.beam.sdk.values.PCollection in project beam by apache.

the class ParDoEvaluatorTest method createEvaluator.

/**
 * Builds a {@link ParDoEvaluator} for {@code fn} backed by mocked execution and step contexts.
 *
 * <p>The shared {@code evaluationContext} is stubbed to hand out a
 * {@code ReadyInGlobalWindowReader} for {@code singletonView} and the mocked execution context
 * for any transform/key pair. The producer of {@code output} is looked up after direct-runner
 * overrides have been applied to the pipeline {@code p}.
 *
 * @param singletonView the only side input given to the evaluator
 * @param fn the function the evaluator runs
 * @param input supplies the input coder and windowing strategy
 * @param output the main output; its producer is the transform being evaluated
 * @return the evaluator returned by {@code ParDoEvaluator.create}
 */
private ParDoEvaluator<Integer> createEvaluator(PCollectionView<Integer> singletonView, RecorderFn fn, PCollection<Integer> input, PCollection<Integer> output) {
    // Step context: only the timer update is stubbed.
    DirectStepContext stepCtx = mock(DirectStepContext.class);
    when(stepCtx.getTimerUpdate()).thenReturn(TimerUpdate.empty());

    // Execution context: hands out the step context for any step name.
    DirectExecutionContext execCtx = mock(DirectExecutionContext.class);
    when(execCtx.getStepContext(Mockito.any(String.class))).thenReturn(stepCtx);

    // Evaluation context: side-input reader plus the execution context above.
    when(evaluationContext.createSideInputReader(ImmutableList.of(singletonView))).thenReturn(new ReadyInGlobalWindowReader());
    when(evaluationContext.getExecutionContext(Mockito.any(AppliedPTransform.class), Mockito.any(StructuralKey.class))).thenReturn(execCtx);

    DirectGraphs.performDirectOverrides(p);
    @SuppressWarnings("unchecked")
    AppliedPTransform<PCollection<Integer>, ?, ?> producer = (AppliedPTransform<PCollection<Integer>, ?, ?>) DirectGraphs.getProducer(output);

    return ParDoEvaluator.create(
        evaluationContext,
        PipelineOptionsFactory.create(),
        stepCtx,
        producer,
        input.getCoder(),
        input.getWindowingStrategy(),
        fn,
        // key
        null,
        ImmutableList.of(singletonView),
        mainOutputTag,
        additionalOutputTags,
        ImmutableMap.of(mainOutputTag, output),
        DoFnSchemaInformation.create(),
        Collections.emptyMap(),
        ParDoEvaluator.defaultRunnerFactory());
}
Also used : PCollection(org.apache.beam.sdk.values.PCollection) AppliedPTransform(org.apache.beam.sdk.runners.AppliedPTransform) DirectStepContext(org.apache.beam.runners.direct.DirectExecutionContext.DirectStepContext) StructuralKey(org.apache.beam.runners.local.StructuralKey)

Example 79 with PCollection

use of org.apache.beam.sdk.values.PCollection in project beam by apache.

the class StatefulParDoEvaluatorFactoryTest method testUnprocessedElements.

/**
 * A test that explicitly delays a side input so that the main input will have to be reprocessed,
 * testing that {@code finishBundle()} re-assembles the GBK outputs correctly.
 *
 * <p>The mocked side-input reader always reports "not ready", so the three values fed to the
 * evaluator are expected to come back through {@code getUnprocessedElements()} with their
 * original key and window.
 */
@Test
public void testUnprocessedElements() throws Exception {
    // To test the factory, first we set up a pipeline and then we use the constructed
    // pipeline to create the right parameters to pass to the factory
    final String stateId = "my-state-id";
    // For consistency, window it into FixedWindows. Actually we will fabricate an input bundle.
    PCollection<KV<String, Integer>> mainInput = pipeline.apply(Create.of(KV.of("hello", 1), KV.of("hello", 2))).apply(Window.into(FixedWindows.of(Duration.millis(10))));
    // Side input: a list view over a single element, windowed the same way as the main input.
    final PCollectionView<List<Integer>> sideInput = pipeline.apply("Create side input", Create.of(42)).apply("Window side input", Window.into(FixedWindows.of(Duration.millis(10)))).apply("View side input", View.asList());
    TupleTag<Integer> mainOutput = new TupleTag<>();
    // Apply the GBK-then-stateful-ParDo expansion with a DoFn that declares one state cell and
    // takes the side input.
    PCollection<Integer> produced = mainInput.apply(new ParDoMultiOverrideFactory.GbkThenStatefulParDo<>(new DoFn<KV<String, Integer>, Integer>() {

        @StateId(stateId)
        private final StateSpec<ValueState<String>> spec = StateSpecs.value(StringUtf8Coder.of());

        @ProcessElement
        public void process(ProcessContext c) {
            // No-op body.
        }
    }, mainOutput, TupleTagList.empty(), Collections.singletonList(sideInput), DoFnSchemaInformation.create(), Collections.emptyMap())).get(mainOutput).setCoder(VarIntCoder.of());
    StatefulParDoEvaluatorFactory<String, Integer, Integer> factory = new StatefulParDoEvaluatorFactory<>(mockEvaluationContext, options);
    // This will be the stateful ParDo from the expansion
    AppliedPTransform<PCollection<KeyedWorkItem<String, KV<String, Integer>>>, PCollectionTuple, StatefulParDo<String, Integer, Integer>> producingTransform = (AppliedPTransform) DirectGraphs.getProducer(produced);
    // Then there will be a digging down to the step context to get the state internals
    when(mockEvaluationContext.getExecutionContext(eq(producingTransform), Mockito.<StructuralKey>any())).thenReturn(mockExecutionContext);
    when(mockExecutionContext.getStepContext(any())).thenReturn(mockStepContext);
    when(mockEvaluationContext.createBundle(Matchers.<PCollection<Integer>>any())).thenReturn(mockUncommittedBundle);
    when(mockStepContext.getTimerUpdate()).thenReturn(TimerUpdate.empty());
    // And digging to check whether the window is ready: the reader always answers "not ready"
    when(mockEvaluationContext.createSideInputReader(anyList())).thenReturn(mockSideInputReader);
    when(mockSideInputReader.isReady(Matchers.any(), Matchers.any())).thenReturn(false);
    // NOTE(review): this window ends at 9 while the pipeline above uses 10 ms fixed windows;
    // the input bundle is fabricated, so presumably the exact bounds do not matter - confirm.
    IntervalWindow firstWindow = new IntervalWindow(new Instant(0), new Instant(9));
    // Fabricate the input: a single keyed work item holding three values (1, 13, 15) for the
    // same key, all placed in firstWindow (an IntervalWindow, not the global window).
    String key = "hello";
    WindowedValue<KV<String, Integer>> firstKv = WindowedValue.of(KV.of(key, 1), new Instant(3), firstWindow, PaneInfo.NO_FIRING);
    WindowedValue<KeyedWorkItem<String, KV<String, Integer>>> gbkOutputElement = firstKv.withValue(KeyedWorkItems.elementsWorkItem("hello", ImmutableList.of(firstKv, firstKv.withValue(KV.of(key, 13)), firstKv.withValue(KV.of(key, 15)))));
    // The bundle is created on the transform's only non-additional (main) input.
    CommittedBundle<KeyedWorkItem<String, KV<String, Integer>>> inputBundle = BUNDLE_FACTORY.createBundle((PCollection<KeyedWorkItem<String, KV<String, Integer>>>) Iterables.getOnlyElement(TransformInputs.nonAdditionalInputs(producingTransform))).add(gbkOutputElement).commit(Instant.now());
    TransformEvaluator<KeyedWorkItem<String, KV<String, Integer>>> evaluator = factory.forApplication(producingTransform, inputBundle);
    evaluator.processElement(gbkOutputElement);
    // Because the side input is never ready, every element should be pushed back as an
    // unprocessed KeyedWorkItem, still in firstWindow and under the same key.
    TransformResult<KeyedWorkItem<String, KV<String, Integer>>> result = evaluator.finishBundle();
    // Collect the integer payloads of everything that was pushed back.
    List<Integer> pushedBackInts = new ArrayList<>();
    for (WindowedValue<? extends KeyedWorkItem<String, KV<String, Integer>>> unprocessedElement : result.getUnprocessedElements()) {
        assertThat(Iterables.getOnlyElement(unprocessedElement.getWindows()), equalTo((BoundedWindow) firstWindow));
        assertThat(unprocessedElement.getValue().key(), equalTo("hello"));
        for (WindowedValue<KV<String, Integer>> windowedKv : unprocessedElement.getValue().elementsIterable()) {
            pushedBackInts.add(windowedKv.getValue().getValue());
        }
    }
    // All three fabricated values must be present; order is not significant.
    assertThat(pushedBackInts, containsInAnyOrder(1, 13, 15));
}
Also used : ArrayList(java.util.ArrayList) TupleTag(org.apache.beam.sdk.values.TupleTag) StateSpec(org.apache.beam.sdk.state.StateSpec) AppliedPTransform(org.apache.beam.sdk.runners.AppliedPTransform) PCollectionTuple(org.apache.beam.sdk.values.PCollectionTuple) BoundedWindow(org.apache.beam.sdk.transforms.windowing.BoundedWindow) List(java.util.List) ImmutableList(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableList) TupleTagList(org.apache.beam.sdk.values.TupleTagList) ArrayList(java.util.ArrayList) Matchers.anyList(org.mockito.Matchers.anyList) IntervalWindow(org.apache.beam.sdk.transforms.windowing.IntervalWindow) Instant(org.joda.time.Instant) KV(org.apache.beam.sdk.values.KV) KeyedWorkItem(org.apache.beam.runners.core.KeyedWorkItem) PCollection(org.apache.beam.sdk.values.PCollection) DoFn(org.apache.beam.sdk.transforms.DoFn) StatefulParDo(org.apache.beam.runners.direct.ParDoMultiOverrideFactory.StatefulParDo) Test(org.junit.Test)

Example 80 with PCollection

use of org.apache.beam.sdk.values.PCollection in project beam by apache.

the class UnboundedReadEvaluatorFactoryTest method processElement.

/**
 * Feeds one {@link UnboundedSourceShard} built from {@code source} through a freshly created
 * {@link UnboundedReadEvaluatorFactory} evaluator.
 *
 * <p>The helper builds an {@link EvaluationContext} over an empty {@code DirectGraph}, applies a
 * {@code PrimitiveUnboundedRead} of {@code source} to a new pipeline, wraps the resulting
 * {@link PCollection} in a hand-built {@link AppliedPTransform}, and asks the factory for an
 * evaluator. {@code TestUnboundedSource.readerClosedCount} is reset to zero immediately before
 * the element is processed.
 *
 * @param source the unbounded source to read from
 * @throws Exception propagated from reader creation or element processing
 */
private void processElement(final TestUnboundedSource<String> source) throws Exception {
    // NOTE(review): the cached thread pool created here is never shut down. It is left as-is
    // because it is unclear from this method alone whether the evaluator still needs it after
    // this helper returns - confirm before adding a shutdown.
    final EvaluationContext context = EvaluationContext.create(MockClock.fromInstant(Instant.now()), CloningBundleFactory.create(), DirectGraph.create(emptyMap(), emptyMap(), LinkedListMultimap.create(), emptySet(), emptyMap()), emptySet(), Executors.newCachedThreadPool());
    final UnboundedReadEvaluatorFactory factory = new UnboundedReadEvaluatorFactory(context, p.getOptions());

    // Parameterized with the diamond operator rather than a raw type, so the assignment needs
    // no unchecked conversion.
    final SplittableParDo.PrimitiveUnboundedRead<String> unbounded = new SplittableParDo.PrimitiveUnboundedRead<>(Read.from(source));
    final Pipeline pipeline = Pipeline.create(p.getOptions());
    final PCollection<String> pCollection = pipeline.apply(unbounded);

    // Hand-built application: no inputs, and a single output under a fresh tag.
    final AppliedPTransform<PBegin, PCollection<String>, SplittableParDo.PrimitiveUnboundedRead<String>> application = AppliedPTransform.of("test", new HashMap<>(), singletonMap(new TupleTag<String>(), pCollection), unbounded, ResourceHints.create(), pipeline);
    final TransformEvaluator<UnboundedSourceShard<String, TestCheckpointMark>> evaluator = factory.forApplication(application, null);

    // The shard carries an already-created reader and no checkpoint.
    final UnboundedSource.UnboundedReader<String> reader = source.createReader(p.getOptions(), null);
    final UnboundedSourceShard<String, TestCheckpointMark> shard = UnboundedSourceShard.of(source, new NeverDeduplicator(), reader, null);
    final WindowedValue<UnboundedSourceShard<String, TestCheckpointMark>> value = WindowedValue.of(shard, BoundedWindow.TIMESTAMP_MAX_VALUE, GlobalWindow.INSTANCE, PaneInfo.NO_FIRING);

    // Reset the counter so callers only observe closes triggered by this call.
    TestUnboundedSource.readerClosedCount = 0;
    evaluator.processElement(value);
}
Also used : TupleTag(org.apache.beam.sdk.values.TupleTag) PBegin(org.apache.beam.sdk.values.PBegin) TestPipeline(org.apache.beam.sdk.testing.TestPipeline) Pipeline(org.apache.beam.sdk.Pipeline) SplittableParDo(org.apache.beam.runners.core.construction.SplittableParDo) PCollection(org.apache.beam.sdk.values.PCollection) UnboundedSourceShard(org.apache.beam.runners.direct.UnboundedReadEvaluatorFactory.UnboundedSourceShard) NeverDeduplicator(org.apache.beam.runners.direct.UnboundedReadDeduplicator.NeverDeduplicator) UnboundedSource(org.apache.beam.sdk.io.UnboundedSource)

Aggregations

PCollection (org.apache.beam.sdk.values.PCollection)199 Test (org.junit.Test)133 KV (org.apache.beam.sdk.values.KV)62 TestPipeline (org.apache.beam.sdk.testing.TestPipeline)61 Map (java.util.Map)59 List (java.util.List)58 Rule (org.junit.Rule)57 RunWith (org.junit.runner.RunWith)54 PAssert (org.apache.beam.sdk.testing.PAssert)52 Instant (org.joda.time.Instant)46 Duration (org.joda.time.Duration)45 JUnit4 (org.junit.runners.JUnit4)45 ParDo (org.apache.beam.sdk.transforms.ParDo)44 TupleTag (org.apache.beam.sdk.values.TupleTag)42 Pipeline (org.apache.beam.sdk.Pipeline)41 Create (org.apache.beam.sdk.transforms.Create)41 ArrayList (java.util.ArrayList)40 Serializable (java.io.Serializable)39 PTransform (org.apache.beam.sdk.transforms.PTransform)37 Row (org.apache.beam.sdk.values.Row)37