use of org.apache.beam.sdk.values.PCollection in project beam by apache.
the class ParDoTranslation method translateParDo.
/**
 * Translates an applied {@link ParDo.MultiOutput} by locating its main input and delegating to
 * the five-argument {@code translateParDo} overload.
 *
 * <p>The main input is the one input of the application whose tag id is not also the tag id of
 * one of the transform's side inputs. {@code Iterables.getOnlyElement} is used for that lookup,
 * so the application is expected to have exactly one such input.
 *
 * @param appliedPTransform the application supplying the transform, its pipeline and its inputs
 * @param components forwarded unchanged to the delegate overload
 * @return whatever the delegate overload returns
 * @throws IOException declared on this method; presumably raised by the delegate overload
 */
public static ParDoPayload translateParDo(AppliedPTransform<?, ?, ParDo.MultiOutput<?, ?>> appliedPTransform, SdkComponents components) throws IOException {
  final ParDo.MultiOutput<?, ?> transform = appliedPTransform.getTransform();

  // Tag ids of every input of the application, side inputs included.
  Set<String> inputTagIds =
      appliedPTransform.getInputs().keySet().stream()
          .map(TupleTag::getId)
          .collect(Collectors.toSet());
  // Tag ids of the side inputs declared on the transform itself.
  Set<String> sideInputTagIds =
      transform.getSideInputs().values().stream()
          .map(view -> view.getTagInternal().getId())
          .collect(Collectors.toSet());

  // Whatever is left after removing the side inputs must be the single main input.
  Set<String> nonSideInputTagIds = Sets.difference(inputTagIds, sideInputTagIds);
  String mainInputTagId = Iterables.getOnlyElement(nonSideInputTagIds);
  PCollection<?> mainInput =
      (PCollection<?>) appliedPTransform.getInputs().get(new TupleTag<>(mainInputTagId));

  final DoFn<?, ?> fn = transform.getFn();
  final DoFnSchemaInformation schemaInformation = ParDo.getDoFnSchemaInformation(fn, mainInput);
  final Pipeline owningPipeline = appliedPTransform.getPipeline();
  // The raw cast is kept as in the original so the same overload is selected.
  return translateParDo(
      (ParDo.MultiOutput) transform, mainInput, schemaInformation, owningPipeline, components);
}
use of org.apache.beam.sdk.values.PCollection in project beam by apache.
the class ReplacementOutputs method tagged.
/**
 * Pairs every output of {@code replacement} with the original output registered under the same
 * tag.
 *
 * <p>The match has to be exact in both directions: a replacement output whose tag has no
 * original, or an original that is left without a replacement, fails a {@code checkArgument}.
 *
 * @param original the original outputs, keyed by tag
 * @param replacement the replacement output; it is expanded with {@code PValues.expandOutput}
 * @return a map from each replacement {@link PCollection} to the {@code ReplacementOutput}
 *     pairing the original tagged value with the replacement tagged value
 */
public static Map<PCollection<?>, ReplacementOutput> tagged(Map<TupleTag<?>, PCollection<?>> original, POutput replacement) {
  // Wrap each original output together with its tag so it can be looked up by tag below.
  Map<TupleTag<?>, TaggedPValue> originalsByTag = new HashMap<>();
  original.forEach((tag, value) -> originalsByTag.put(tag, TaggedPValue.of(tag, value)));

  ImmutableMap.Builder<PCollection<?>, ReplacementOutput> pairings = ImmutableMap.builder();
  // Originals still waiting for a replacement; entries are removed as they get matched.
  Map<TupleTag<?>, PCollection<?>> unmatchedOriginals = new HashMap<>(original);
  Map<TupleTag<?>, PCollection<?>> replacementsByTag = PValues.expandOutput(replacement);

  for (Map.Entry<TupleTag<?>, PCollection<?>> entry : replacementsByTag.entrySet()) {
    TupleTag<?> tag = entry.getKey();
    PCollection<?> replacementValue = entry.getValue();
    TaggedPValue originalValue = originalsByTag.get(tag);
    checkArgument(
        originalValue != null,
        "Missing original output for Tag %s and Value %s Between original %s and replacement %s",
        tag,
        replacementValue,
        original,
        replacement.expand());
    pairings.put(
        replacementValue,
        ReplacementOutput.of(originalValue, TaggedPValue.of(tag, replacementValue)));
    unmatchedOriginals.remove(tag);
  }

  checkArgument(
      unmatchedOriginals.isEmpty(),
      "Missing replacement for tagged values %s. Replacement was: %s",
      unmatchedOriginals,
      replacementsByTag);
  return pairings.build();
}
use of org.apache.beam.sdk.values.PCollection in project beam by apache.
the class ParDoEvaluatorTest method createEvaluator.
/**
 * Builds a {@link ParDoEvaluator} for {@code fn}, wired to mocked execution and step contexts.
 *
 * <p>The shared {@code evaluationContext} is stubbed to hand out a
 * {@code ReadyInGlobalWindowReader} for {@code singletonView} and the mocked execution context
 * for any transform and key. Direct-runner overrides are applied to {@code p} before the
 * producer of {@code output} is looked up.
 *
 * @param singletonView the only side input given to the evaluator
 * @param fn the function the evaluator will run
 * @param input supplies the coder and windowing strategy passed to the evaluator
 * @param output the collection registered under {@code mainOutputTag}
 * @return the evaluator returned by {@code ParDoEvaluator.create}
 */
private ParDoEvaluator<Integer> createEvaluator(PCollectionView<Integer> singletonView, RecorderFn fn, PCollection<Integer> input, PCollection<Integer> output) {
  // Step context: reports an empty timer update.
  DirectStepContext stepCtx = mock(DirectStepContext.class);
  when(stepCtx.getTimerUpdate()).thenReturn(TimerUpdate.empty());

  // Execution context: returns the step context above for any step name.
  DirectExecutionContext executionCtx = mock(DirectExecutionContext.class);
  when(executionCtx.getStepContext(Mockito.any(String.class))).thenReturn(stepCtx);

  // Stub the shared evaluation context for the side input reader and the execution context.
  when(evaluationContext.createSideInputReader(ImmutableList.of(singletonView)))
      .thenReturn(new ReadyInGlobalWindowReader());
  when(evaluationContext.getExecutionContext(
          Mockito.any(AppliedPTransform.class), Mockito.any(StructuralKey.class)))
      .thenReturn(executionCtx);

  DirectGraphs.performDirectOverrides(p);
  @SuppressWarnings("unchecked")
  AppliedPTransform<PCollection<Integer>, ?, ?> producer =
      (AppliedPTransform<PCollection<Integer>, ?, ?>) DirectGraphs.getProducer(output);

  return ParDoEvaluator.create(
      evaluationContext,
      PipelineOptionsFactory.create(),
      stepCtx,
      producer,
      input.getCoder(),
      input.getWindowingStrategy(),
      fn,
      null, // key
      ImmutableList.of(singletonView),
      mainOutputTag,
      additionalOutputTags,
      ImmutableMap.of(mainOutputTag, output),
      DoFnSchemaInformation.create(),
      Collections.emptyMap(),
      ParDoEvaluator.defaultRunnerFactory());
}
use of org.apache.beam.sdk.values.PCollection in project beam by apache.
the class StatefulParDoEvaluatorFactoryTest method testUnprocessedElements.
/**
 * A test that explicitly delays a side input so that the main input will have to be reprocessed,
 * testing that {@code finishBundle()} re-assembles the GBK outputs correctly.
 *
 * <p>The side input reader is mocked so that {@code isReady} returns {@code false} for any
 * arguments. Three values for one key are fed to the evaluator as a single
 * {@code KeyedWorkItem}, and the test asserts that all three come back through
 * {@code getUnprocessedElements()} under the same key and in the same window.
 */
@Test
public void testUnprocessedElements() throws Exception {
// To test the factory, first we set up a pipeline and then we use the constructed
// pipeline to create the right parameters to pass to the factory
final String stateId = "my-state-id";
// For consistency, window it into FixedWindows. The elements created here are not what the
// evaluator sees: the input bundle is fabricated by hand further down.
PCollection<KV<String, Integer>> mainInput = pipeline.apply(Create.of(KV.of("hello", 1), KV.of("hello", 2))).apply(Window.into(FixedWindows.of(Duration.millis(10))));
// Side input view, windowed with the same FixedWindows size as the main input.
final PCollectionView<List<Integer>> sideInput = pipeline.apply("Create side input", Create.of(42)).apply("Window side input", Window.into(FixedWindows.of(Duration.millis(10)))).apply("View side input", View.asList());
TupleTag<Integer> mainOutput = new TupleTag<>();
// Apply the GBK-then-stateful-ParDo expansion with a DoFn that declares one state cell and has
// an empty process method; the side input is attached so the evaluator has something to wait on.
PCollection<Integer> produced = mainInput.apply(new ParDoMultiOverrideFactory.GbkThenStatefulParDo<>(new DoFn<KV<String, Integer>, Integer>() {
@StateId(stateId)
private final StateSpec<ValueState<String>> spec = StateSpecs.value(StringUtf8Coder.of());
@ProcessElement
public void process(ProcessContext c) {
}
}, mainOutput, TupleTagList.empty(), Collections.singletonList(sideInput), DoFnSchemaInformation.create(), Collections.emptyMap())).get(mainOutput).setCoder(VarIntCoder.of());
StatefulParDoEvaluatorFactory<String, Integer, Integer> factory = new StatefulParDoEvaluatorFactory<>(mockEvaluationContext, options);
// This will be the stateful ParDo from the expansion. The raw cast is unchecked: it narrows the
// producer to the parameterized type declared on the left.
AppliedPTransform<PCollection<KeyedWorkItem<String, KV<String, Integer>>>, PCollectionTuple, StatefulParDo<String, Integer, Integer>> producingTransform = (AppliedPTransform) DirectGraphs.getProducer(produced);
// Then there will be a digging down to the step context to get the state internals
when(mockEvaluationContext.getExecutionContext(eq(producingTransform), Mockito.<StructuralKey>any())).thenReturn(mockExecutionContext);
when(mockExecutionContext.getStepContext(any())).thenReturn(mockStepContext);
when(mockEvaluationContext.createBundle(Matchers.<PCollection<Integer>>any())).thenReturn(mockUncommittedBundle);
when(mockStepContext.getTimerUpdate()).thenReturn(TimerUpdate.empty());
// And digging to check whether the window is ready: the reader always answers "not ready",
// which is what delays the side input for this test.
when(mockEvaluationContext.createSideInputReader(anyList())).thenReturn(mockSideInputReader);
when(mockSideInputReader.isReady(Matchers.any(), Matchers.any())).thenReturn(false);
// Window built directly from instants 0 and 9 rather than produced by the FixedWindows above.
IntervalWindow firstWindow = new IntervalWindow(new Instant(0), new Instant(9));
// A single work item for one key whose elements all sit in firstWindow at timestamp 3.
// NOTE(review): this comment used to describe elements "in the global window" and cleanup
// registration; the code below uses firstWindow and asserts nothing about cleanup.
String key = "hello";
WindowedValue<KV<String, Integer>> firstKv = WindowedValue.of(KV.of(key, 1), new Instant(3), firstWindow, PaneInfo.NO_FIRING);
// Three elements (values 1, 13 and 15) bundled into one KeyedWorkItem.
WindowedValue<KeyedWorkItem<String, KV<String, Integer>>> gbkOutputElement = firstKv.withValue(KeyedWorkItems.elementsWorkItem("hello", ImmutableList.of(firstKv, firstKv.withValue(KV.of(key, 13)), firstKv.withValue(KV.of(key, 15)))));
// Fabricated input bundle on the producing transform's only non-additional input.
CommittedBundle<KeyedWorkItem<String, KV<String, Integer>>> inputBundle = BUNDLE_FACTORY.createBundle((PCollection<KeyedWorkItem<String, KV<String, Integer>>>) Iterables.getOnlyElement(TransformInputs.nonAdditionalInputs(producingTransform))).add(gbkOutputElement).commit(Instant.now());
TransformEvaluator<KeyedWorkItem<String, KV<String, Integer>>> evaluator = factory.forApplication(producingTransform, inputBundle);
evaluator.processElement(gbkOutputElement);
// Every element should be pushed back as unprocessed, wrapped in a KeyedWorkItem for the same
// key and in the same window.
TransformResult<KeyedWorkItem<String, KV<String, Integer>>> result = evaluator.finishBundle();
// Flatten the values of all pushed-back work items so they can be compared irrespective of order.
List<Integer> pushedBackInts = new ArrayList<>();
for (WindowedValue<? extends KeyedWorkItem<String, KV<String, Integer>>> unprocessedElement : result.getUnprocessedElements()) {
assertThat(Iterables.getOnlyElement(unprocessedElement.getWindows()), equalTo((BoundedWindow) firstWindow));
assertThat(unprocessedElement.getValue().key(), equalTo("hello"));
for (WindowedValue<KV<String, Integer>> windowedKv : unprocessedElement.getValue().elementsIterable()) {
pushedBackInts.add(windowedKv.getValue().getValue());
}
}
assertThat(pushedBackInts, containsInAnyOrder(1, 13, 15));
}
use of org.apache.beam.sdk.values.PCollection in project beam by apache.
the class UnboundedReadEvaluatorFactoryTest method processElement.
/**
 * Creates an evaluator for an unbounded read of {@code source} and has it process one shard.
 *
 * <p>The evaluation context is built over an empty {@code DirectGraph}. The shard wraps a reader
 * freshly created from {@code source}, paired with a {@code NeverDeduplicator} and no checkpoint.
 * {@code TestUnboundedSource.readerClosedCount} is reset to zero right before the element is
 * processed.
 *
 * @param source the source to read from
 * @throws Exception declared on this method; presumably raised by reader creation or processing
 */
private void processElement(final TestUnboundedSource<String> source) throws Exception {
  // Evaluation context over a graph with no transforms, values or views.
  final DirectGraph emptyGraph =
      DirectGraph.create(emptyMap(), emptyMap(), LinkedListMultimap.create(), emptySet(), emptyMap());
  final EvaluationContext evaluationCtx =
      EvaluationContext.create(
          MockClock.fromInstant(Instant.now()),
          CloningBundleFactory.create(),
          emptyGraph,
          emptySet(),
          Executors.newCachedThreadPool());
  final UnboundedReadEvaluatorFactory evaluatorFactory =
      new UnboundedReadEvaluatorFactory(evaluationCtx, p.getOptions());

  // Apply the primitive read in a separate pipeline and describe that application by hand.
  final SplittableParDo.PrimitiveUnboundedRead<String> primitiveRead =
      new SplittableParDo.PrimitiveUnboundedRead(Read.from(source));
  final Pipeline readPipeline = Pipeline.create(p.getOptions());
  final PCollection<String> readOutput = readPipeline.apply(primitiveRead);
  final AppliedPTransform<PBegin, PCollection<String>, SplittableParDo.PrimitiveUnboundedRead<String>> appliedRead =
      AppliedPTransform.of(
          "test",
          new HashMap<>(),
          singletonMap(new TupleTag(), readOutput),
          primitiveRead,
          ResourceHints.create(),
          readPipeline);
  final TransformEvaluator<UnboundedSourceShard<String, TestCheckpointMark>> readEvaluator =
      evaluatorFactory.forApplication(appliedRead, null);

  // One shard in the global window, carrying its own reader and no checkpoint.
  final UnboundedSource.UnboundedReader<String> shardReader =
      source.createReader(p.getOptions(), null);
  final UnboundedSourceShard<String, TestCheckpointMark> sourceShard =
      UnboundedSourceShard.of(source, new NeverDeduplicator(), shardReader, null);
  final WindowedValue<UnboundedSourceShard<String, TestCheckpointMark>> shardElement =
      WindowedValue.of(
          sourceShard, BoundedWindow.TIMESTAMP_MAX_VALUE, GlobalWindow.INSTANCE, PaneInfo.NO_FIRING);

  // Start counting reader closes from zero for this element.
  TestUnboundedSource.readerClosedCount = 0;
  readEvaluator.processElement(shardElement);
}
Aggregations