use of org.apache.beam.sdk.values.TupleTag in project beam by apache.
In the class LazilyInitializedSideInputReaderTest, the method testLazyInitialization:
@Test
public void testLazyInitialization() {
  final AtomicInteger wasCalled = new AtomicInteger();
  SideInputReader lazilyInitializedSideInputReader =
      new LazilyInitializedSideInputReader(
          ImmutableList.of(new SideInputInfo().setTag(TEST_TAG)),
          () -> {
            wasCalled.incrementAndGet();
            return mockSideInputReader;
          });
  // Ensure that after construction we have not been initialized yet.
  assertEquals(0, wasCalled.get());
  // Ensure that checking basic tag information does not trigger initialization either.
  assertFalse(lazilyInitializedSideInputReader.isEmpty());
  assertEquals(0, wasCalled.get());
  when(mockPCollectionView.getTagInternal()).thenReturn(new TupleTag(TEST_TAG));
  assertTrue(lazilyInitializedSideInputReader.contains(mockPCollectionView));
  assertEquals(0, wasCalled.get());
  // Ensure that the underlying reader is initialized exactly once, is passed the expected
  // parameters, and returns the expected results.
  when(mockSideInputReader.get(any(PCollectionView.class), any(BoundedWindow.class)))
      .thenReturn(42)
      .thenReturn(43);
  assertEquals(
      42, lazilyInitializedSideInputReader.get(mockPCollectionView, GlobalWindow.INSTANCE));
  assertEquals(1, wasCalled.get());
  assertEquals(
      43, lazilyInitializedSideInputReader.get(mockPCollectionView, GlobalWindow.INSTANCE));
  assertEquals(1, wasCalled.get());
}
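The behavior this test pins down is plain supplier memoization: the expensive delegate runs at most once, on first real use. Below is a minimal sketch of that pattern, assuming nothing beyond java.util.function.Supplier; the LazySupplier name is illustrative and not part of the Beam API.

import java.util.function.Supplier;

// Illustrative memoizing wrapper: the delegate Supplier is invoked at most
// once, on the first call to get(); later calls return the cached value.
// Assumes the delegate returns a non-null value.
final class LazySupplier<T> implements Supplier<T> {
  private final Supplier<T> delegate;
  private T value;

  LazySupplier(Supplier<T> delegate) {
    this.delegate = delegate;
  }

  @Override
  public synchronized T get() {
    if (value == null) {
      value = delegate.get();
    }
    return value;
  }
}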
use of org.apache.beam.sdk.values.TupleTag in project beam by apache.
In the class BatchGroupAlsoByWindowReshuffleDoFnTest, the method makeRunner:
private static <K, InputT, OutputT, W extends BoundedWindow>
    DoFnRunner<KV<K, Iterable<WindowedValue<InputT>>>, KV<K, OutputT>> makeRunner(
        GroupAlsoByWindowDoFnFactory<K, InputT, OutputT> fnFactory,
        WindowingStrategy<?, W> windowingStrategy,
        TupleTag<KV<K, OutputT>> outputTag,
        DoFnRunners.OutputManager outputManager) {
  final StepContext stepContext = new TestStepContext(STEP_NAME);
  StateInternalsFactory<K> stateInternalsFactory = key -> stepContext.stateInternals();
  BatchGroupAlsoByWindowFn<K, InputT, OutputT> fn =
      fnFactory.forStrategy(windowingStrategy, stateInternalsFactory);
  return new GroupAlsoByWindowFnRunner<>(
      PipelineOptionsFactory.create(), fn, NullSideInputReader.empty(),
      outputManager, outputTag, stepContext);
}
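In tests, the outputManager parameter above is typically a small collector keyed by TupleTag. A hedged sketch, assuming the runners-core DoFnRunners.OutputManager interface used in the snippet; the CollectingOutputManager name is illustrative:

import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.beam.runners.core.DoFnRunners;
import org.apache.beam.sdk.util.WindowedValue;
import org.apache.beam.sdk.values.TupleTag;

// Buffers every output per TupleTag so a test can assert on what the runner emitted.
final class CollectingOutputManager implements DoFnRunners.OutputManager {
  final Map<TupleTag<?>, List<WindowedValue<?>>> outputs = new HashMap<>();

  @Override
  public <T> void output(TupleTag<T> tag, WindowedValue<T> value) {
    outputs.computeIfAbsent(tag, t -> new ArrayList<>()).add(value);
  }
}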
use of org.apache.beam.sdk.values.TupleTag in project beam by apache.
In the class SdkHarnessClientTest, the method setup:
@Before
public void setup() throws Exception {
  MockitoAnnotations.initMocks(this);
  sdkHarnessClient = SdkHarnessClient.usingFnApiClient(fnApiControlClient, dataService);
  Pipeline userPipeline = Pipeline.create();
  TupleTag<String> outputTag = new TupleTag<>();
  userPipeline
      .apply("create", Create.of("foo"))
      .apply("proc", ParDo.of(new TestFn()).withOutputTags(outputTag, TupleTagList.empty()));
  RunnerApi.Pipeline userProto = PipelineTranslation.toProto(userPipeline);
  ProcessBundleDescriptor.Builder pbdBuilder =
      ProcessBundleDescriptor.newBuilder()
          .setId("my_id")
          .putAllEnvironments(userProto.getComponents().getEnvironmentsMap())
          .putAllWindowingStrategies(userProto.getComponents().getWindowingStrategiesMap())
          .putAllCoders(userProto.getComponents().getCodersMap());
  RunnerApi.Coder fullValueCoder =
      CoderTranslation.toProto(
              WindowedValue.getFullCoder(StringUtf8Coder.of(), GlobalWindow.Coder.INSTANCE))
          .getCoder();
  pbdBuilder.putCoders("wire_coder", fullValueCoder);
  PTransform targetProcessor = userProto.getComponents().getTransformsOrThrow("proc");
  RemoteGrpcPort port =
      RemoteGrpcPort.newBuilder()
          .setApiServiceDescriptor(harness.dataEndpoint())
          .setCoderId("wire_coder")
          .build();
  RemoteGrpcPortRead readNode =
      RemoteGrpcPortRead.readFromPort(
          port, getOnlyElement(targetProcessor.getInputsMap().values()));
  RemoteGrpcPortWrite writeNode =
      RemoteGrpcPortWrite.writeToPort(
          getOnlyElement(targetProcessor.getOutputsMap().values()), port);
  // TODO: Ensure cross-env (Runner <-> SDK GRPC Read/Write Node) coders are length-prefixed.
  for (String pc : targetProcessor.getInputsMap().values()) {
    pbdBuilder.putPcollections(pc, userProto.getComponents().getPcollectionsOrThrow(pc));
  }
  for (String pc : targetProcessor.getOutputsMap().values()) {
    pbdBuilder.putPcollections(pc, userProto.getComponents().getPcollectionsOrThrow(pc));
  }
  pbdBuilder
      .putTransforms("proc", targetProcessor)
      .putTransforms(SDK_GRPC_READ_TRANSFORM, readNode.toPTransform())
      .putTransforms(SDK_GRPC_WRITE_TRANSFORM, writeNode.toPTransform());
  descriptor = pbdBuilder.build();
}
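As background for the withOutputTags call above: a TupleTag is usually created as an empty anonymous subclass so its type argument survives erasure, and additional outputs are declared alongside the main tag. A minimal hedged sketch; `input`, the tag names, and the DoFn body are illustrative, not taken from the test:

// Hypothetical: `input` is some PCollection<String> already in scope.
TupleTag<String> mainTag = new TupleTag<String>() {}; // anonymous subclass keeps the type argument
TupleTag<String> upperTag = new TupleTag<String>() {};
PCollectionTuple results =
    input.apply(
        ParDo.of(
                new DoFn<String, String>() {
                  @ProcessElement
                  public void processElement(ProcessContext c) {
                    c.output(c.element()); // goes to mainTag
                    c.output(upperTag, c.element().toUpperCase()); // tagged additional output
                  }
                })
            .withOutputTags(mainTag, TupleTagList.of(upperTag)));
PCollection<String> mainOutput = results.get(mainTag);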
use of org.apache.beam.sdk.values.TupleTag in project beam by apache.
In the class SplittableParDoProcessKeyedElementsOp, the method open:
@Override
public void open(
    Config config, Context context,
    Scheduler<KeyedTimerData<byte[]>> timerRegistry, OpEmitter<RawUnionValue> emitter) {
  this.pipelineOptions =
      Base64Serializer.deserializeUnchecked(
              config.get("beamPipelineOptions"), SerializablePipelineOptions.class)
          .get().as(SamzaPipelineOptions.class);
  final SamzaStoreStateInternals.Factory<?> nonKeyedStateInternalsFactory =
      SamzaStoreStateInternals.createNonKeyedStateInternalsFactory(
          transformId, context.getTaskContext(), pipelineOptions);
  final DoFnRunners.OutputManager outputManager = outputManagerFactory.create(emitter);
  this.stateInternalsFactory =
      new SamzaStoreStateInternals.Factory<>(
          transformId,
          Collections.singletonMap(
              SamzaStoreStateInternals.BEAM_STORE,
              SamzaStoreStateInternals.getBeamStore(context.getTaskContext())),
          ByteArrayCoder.of(), pipelineOptions.getStoreBatchGetSize());
  this.timerInternalsFactory =
      SamzaTimerInternalsFactory.createTimerInternalFactory(
          ByteArrayCoder.of(), timerRegistry, TIMER_STATE_ID, nonKeyedStateInternalsFactory,
          windowingStrategy, isBounded, pipelineOptions);
  final KeyedInternals<byte[]> keyedInternals =
      new KeyedInternals<>(stateInternalsFactory, timerInternalsFactory);
  SplittableParDoViaKeyedWorkItems.ProcessFn<
          InputT, OutputT, RestrictionT, PositionT, WatermarkEstimatorStateT>
      processFn = processElements.newProcessFn(processElements.getFn());
  DoFnInvokers.tryInvokeSetupFor(processFn, pipelineOptions);
  processFn.setStateInternalsFactory(stateInternalsFactory);
  processFn.setTimerInternalsFactory(timerInternalsFactory);
  processFn.setSideInputReader(NullSideInputReader.empty());
  processFn.setProcessElementInvoker(
      new OutputAndTimeBoundedSplittableProcessElementInvoker<>(
          processElements.getFn(), pipelineOptions,
          new OutputWindowedValue<OutputT>() {
            @Override
            public void outputWindowedValue(
                OutputT output, Instant timestamp,
                Collection<? extends BoundedWindow> windows, PaneInfo pane) {
              outputWindowedValue(mainOutputTag, output, timestamp, windows, pane);
            }

            @Override
            public <AdditionalOutputT> void outputWindowedValue(
                TupleTag<AdditionalOutputT> tag, AdditionalOutputT output, Instant timestamp,
                Collection<? extends BoundedWindow> windows, PaneInfo pane) {
              outputManager.output(tag, WindowedValue.of(output, timestamp, windows, pane));
            }
          },
          NullSideInputReader.empty(),
          Executors.newSingleThreadScheduledExecutor(Executors.defaultThreadFactory()),
          10000, Duration.standardSeconds(10),
          () -> {
            throw new UnsupportedOperationException("BundleFinalizer unsupported in Samza");
          }));
  final StepContext stepContext =
      new StepContext() {
        @Override
        public StateInternals stateInternals() {
          return keyedInternals.stateInternals();
        }

        @Override
        public TimerInternals timerInternals() {
          return keyedInternals.timerInternals();
        }
      };
  this.fnRunner =
      DoFnRunners.simpleRunner(
          pipelineOptions, processFn, NullSideInputReader.of(Collections.emptyList()),
          outputManager, mainOutputTag, Collections.emptyList(), stepContext, null,
          Collections.emptyMap(), windowingStrategy, DoFnSchemaInformation.create(),
          Collections.emptyMap());
}
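Since the emitter above carries RawUnionValue, the OutputManager produced by outputManagerFactory conceptually maps each TupleTag to a union index. What follows is only a hedged sketch of that idea, not the actual Samza implementation; UnionOutputManager and tagToIndex are illustrative names:

import java.util.Map;
import org.apache.beam.runners.core.DoFnRunners;
import org.apache.beam.sdk.transforms.join.RawUnionValue;
import org.apache.beam.sdk.util.WindowedValue;
import org.apache.beam.sdk.values.TupleTag;

final class UnionOutputManager implements DoFnRunners.OutputManager {
  // Assumed to be built at translation time, one index per declared output tag.
  private final Map<TupleTag<?>, Integer> tagToIndex;

  UnionOutputManager(Map<TupleTag<?>, Integer> tagToIndex) {
    this.tagToIndex = tagToIndex;
  }

  @Override
  public <T> void output(TupleTag<T> tag, WindowedValue<T> value) {
    RawUnionValue union = new RawUnionValue(tagToIndex.get(tag), value);
    // Handing `union` to the framework emitter is elided in this sketch.
  }
}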
use of org.apache.beam.sdk.values.TupleTag in project beam by apache.
In the class StreamingTransformTranslator, the method flattenPColl:
private static <T> TransformEvaluator<Flatten.PCollections<T>> flattenPColl() {
  return new TransformEvaluator<Flatten.PCollections<T>>() {
    @SuppressWarnings("unchecked")
    @Override
    public void evaluate(Flatten.PCollections<T> transform, EvaluationContext context) {
      Map<TupleTag<?>, PCollection<?>> pcs = context.getInputs(transform);
      // Since this is a streaming pipeline, at least one of the PCollections to "flatten" is
      // unbounded, meaning it represents a DStream, so we could end up with an unbounded
      // unified DStream.
      final List<JavaDStream<WindowedValue<T>>> dStreams = new ArrayList<>();
      final List<Integer> streamingSources = new ArrayList<>();
      for (PValue pv : pcs.values()) {
        checkArgument(
            pv instanceof PCollection,
            "Flatten had non-PCollection value in input: %s of type %s",
            pv,
            pv.getClass().getSimpleName());
        PCollection<T> pcol = (PCollection<T>) pv;
        Dataset dataset = context.borrowDataset(pcol);
        if (dataset instanceof UnboundedDataset) {
          UnboundedDataset<T> unboundedDataset = (UnboundedDataset<T>) dataset;
          streamingSources.addAll(unboundedDataset.getStreamSources());
          dStreams.add(unboundedDataset.getDStream());
        } else {
          // Create a single-RDD stream.
          Queue<JavaRDD<WindowedValue<T>>> q = new LinkedBlockingQueue<>();
          q.offer(((BoundedDataset) dataset).getRDD());
          // TODO (BEAM-10789): this is not recoverable from checkpoint!
          JavaDStream<WindowedValue<T>> dStream = context.getStreamingContext().queueStream(q);
          dStreams.add(dStream);
        }
      }
      // Start by unifying the streams into a single stream.
      JavaDStream<WindowedValue<T>> unifiedStreams =
          SparkCompat.joinStreams(context.getStreamingContext(), dStreams);
      context.putDataset(transform, new UnboundedDataset<>(unifiedStreams, streamingSources));
    }

    @Override
    public String toNativeString() {
      return "streamingContext.union(...)";
    }
  };
}
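The fallback branch above can be reproduced in isolation: a bounded dataset's RDD is wrapped in a queue-backed stream so it can be unioned with genuine DStreams. A hedged sketch; `jssc` (a JavaStreamingContext) and `rdd` are assumed to be in scope:

Queue<JavaRDD<WindowedValue<String>>> queue = new LinkedBlockingQueue<>();
queue.offer(rdd); // single-element queue: the stream emits this RDD once
JavaDStream<WindowedValue<String>> oneShot = jssc.queueStream(queue);
// As the TODO above notes, queue-backed streams are not recoverable from checkpoint.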