Use of org.apache.beam.sdk.coders.Coder in project beam by apache.
From the class IsmSideInputReaderTest, the method testIsmReaderReferenceCaching:
@Test
public void testIsmReaderReferenceCaching() throws Exception {
  Coder<WindowedValue<Long>> valueCoder =
      WindowedValue.getFullCoder(VarLongCoder.of(), GLOBAL_WINDOW_CODER);
  final WindowedValue<Long> element = valueInGlobalWindow(42L);
  final PCollectionView<Long> view =
      Pipeline.create().apply(Create.empty(VarLongCoder.of())).apply(View.asSingleton());

  final Source source =
      initInputFile(
          fromValues(Arrays.asList(element)),
          IsmRecordCoder.of(1, 0, ImmutableList.<Coder<?>>of(GLOBAL_WINDOW_CODER), valueCoder));
  final Source emptySource =
      initInputFile(
          fromValues(Arrays.asList()),
          IsmRecordCoder.of(1, 0, ImmutableList.<Coder<?>>of(GLOBAL_WINDOW_CODER), valueCoder));

  final IsmSideInputReader reader =
      sideInputReader(view.getTagInternal().getId(), source, emptySource);

  // The reader caches exactly one IsmReader per source, keyed by the view's tag.
  assertTrue(reader.tagToIsmReaderMap.containsKey(view.getTagInternal()));
  assertEquals(1, reader.tagToIsmReaderMap.get(view.getTagInternal()).size());
  assertEquals(
      FileSystems.matchSingleFileSpec(getString(source.getSpec(), WorkerPropertyNames.FILENAME))
          .resourceId(),
      reader.tagToIsmReaderMap.get(view.getTagInternal()).get(0).getResourceId());

  // Empty sources are tracked in a separate map but cached the same way.
  assertTrue(reader.tagToEmptyIsmReaderMap.containsKey(view.getTagInternal()));
  assertEquals(1, reader.tagToEmptyIsmReaderMap.get(view.getTagInternal()).size());
  assertEquals(
      FileSystems.matchSingleFileSpec(
              getString(emptySource.getSpec(), WorkerPropertyNames.FILENAME))
          .resourceId(),
      reader.tagToEmptyIsmReaderMap.get(view.getTagInternal()).get(0).getResourceId());
}
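For reference, the IsmRecordCoder.of(...) calls above take the number of shard key coders, the number of metadata shard key coders, the list of key component coders, and the value coder (parameter roles as I read the IsmFormat javadoc). A minimal sketch of building the same coder in isolation; the IsmRecordCoder import path is assumed from the Dataflow runner internals, and GlobalWindow.Coder.INSTANCE stands in for the test's GLOBAL_WINDOW_CODER constant:

import java.util.Collections;
// Assumed location of IsmRecordCoder in the Dataflow runner internals:
import org.apache.beam.runners.dataflow.internal.IsmFormat.IsmRecordCoder;
import org.apache.beam.sdk.coders.Coder;
import org.apache.beam.sdk.coders.VarLongCoder;
import org.apache.beam.sdk.transforms.windowing.GlobalWindow;
import org.apache.beam.sdk.util.WindowedValue;

Coder<WindowedValue<Long>> valueCoder =
    WindowedValue.getFullCoder(VarLongCoder.of(), GlobalWindow.Coder.INSTANCE);

// One key component (the window coder), which is also the single shard key;
// no metadata shard keys; each value is a windowed long.
IsmRecordCoder<WindowedValue<Long>> ismCoder =
    IsmRecordCoder.of(
        1, 0, Collections.<Coder<?>>singletonList(GlobalWindow.Coder.INSTANCE), valueCoder);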
Use of org.apache.beam.sdk.coders.Coder in project beam by apache.
From the class IsmReaderFactoryTest, the method testFactory:
@Test
public void testFactory() throws Exception {
  WindowedValueCoder<?> coder =
      WindowedValue.getFullCoder(
          IsmRecordCoder.of(
              1, 0, ImmutableList.<Coder<?>>of(StringUtf8Coder.of()), VarLongCoder.of()),
          GlobalWindow.Coder.INSTANCE);
  String tmpFile = tmpFolder.newFile().getPath();
  ResourceId tmpResourceId = FileSystems.matchSingleFileSpec(tmpFile).resourceId();

  @SuppressWarnings("rawtypes")
  IsmReader<?> ismReader =
      (IsmReader)
          new IsmReaderFactory()
              .create(
                  createSpecForFilename(tmpFile), coder, options, executionContext, operationContext);

  // The factory unwraps the WindowedValueCoder and hands the inner IsmRecordCoder to the reader.
  assertEquals(coder.getValueCoder(), ismReader.getCoder());
  assertEquals(tmpResourceId, ismReader.getResourceId());
}
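The first assertion holds because the factory hands the reader the element coder unwrapped from the WindowedValueCoder. A minimal sketch of that unwrapping, using only the Beam SDK classes already seen in the test above (variable names are illustrative):

WindowedValueCoder<Long> windowed =
    WindowedValue.getFullCoder(VarLongCoder.of(), GlobalWindow.Coder.INSTANCE);

// getValueCoder() returns the element coder inside the windowed wrapper, which is
// what ismReader.getCoder() is compared against in the test above.
Coder<Long> element = windowed.getValueCoder(); // equals VarLongCoder.of()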
Use of org.apache.beam.sdk.coders.Coder in project beam by apache.
From the class UserParDoFnFactory, the method create:
@Override
public ParDoFn create(
    PipelineOptions options,
    CloudObject cloudUserFn,
    @Nullable List<SideInputInfo> sideInputInfos,
    TupleTag<?> mainOutputTag,
    Map<TupleTag<?>, Integer> outputTupleTagsToReceiverIndices,
    DataflowExecutionContext<?> executionContext,
    DataflowOperationContext operationContext)
    throws Exception {
  DoFnInstanceManager instanceManager =
      fnCache.get(
          operationContext.nameContext().systemName(),
          () -> DoFnInstanceManagers.cloningPool(doFnExtractor.getDoFnInfo(cloudUserFn), options));
  DoFnInfo<?, ?> doFnInfo = instanceManager.peek();
  DataflowExecutionContext.DataflowStepContext stepContext =
      executionContext.getStepContext(operationContext);
  Iterable<PCollectionView<?>> sideInputViews = doFnInfo.getSideInputViews();
  SideInputReader sideInputReader =
      executionContext.getSideInputReader(sideInputInfos, sideInputViews, operationContext);
  if (doFnInfo.getDoFn() instanceof BatchStatefulParDoOverrides.BatchStatefulDoFn) {
    // HACK: BatchStatefulDoFn is a class from DataflowRunner's overrides
    // that just instructs the worker to execute it differently. This will
    // be replaced by metadata in the Runner API payload.
    BatchStatefulParDoOverrides.BatchStatefulDoFn fn =
        (BatchStatefulParDoOverrides.BatchStatefulDoFn) doFnInfo.getDoFn();
    DoFn underlyingFn = fn.getUnderlyingDoFn();
    return new BatchModeUngroupingParDoFn(
        (BatchModeExecutionContext.StepContext) stepContext,
        new SimpleParDoFn(
            options,
            DoFnInstanceManagers.singleInstance(doFnInfo.withFn(underlyingFn)),
            sideInputReader,
            doFnInfo.getMainOutput(),
            outputTupleTagsToReceiverIndices,
            stepContext,
            operationContext,
            doFnInfo.getDoFnSchemaInformation(),
            doFnInfo.getSideInputMapping(),
            runnerFactory));
  } else if (doFnInfo.getDoFn() instanceof StreamingPCollectionViewWriterFn) {
    // HACK: StreamingPCollectionViewWriterFn is a class from
    // DataflowPipelineTranslator. Using the class as an indicator is a migration path
    // to simply having an indicator string.
    checkArgument(
        stepContext instanceof StreamingModeExecutionContext.StreamingModeStepContext,
        "stepContext must be a StreamingModeStepContext to use StreamingPCollectionViewWriterFn");
    DataflowRunner.StreamingPCollectionViewWriterFn<Object> writerFn =
        (StreamingPCollectionViewWriterFn<Object>) doFnInfo.getDoFn();
    return new StreamingPCollectionViewWriterParDoFn(
        (StreamingModeExecutionContext.StreamingModeStepContext) stepContext,
        writerFn.getView().getTagInternal(),
        writerFn.getDataCoder(),
        (Coder<BoundedWindow>) doFnInfo.getWindowingStrategy().getWindowFn().windowCoder());
  } else {
    return new SimpleParDoFn(
        options,
        instanceManager,
        sideInputReader,
        doFnInfo.getMainOutput(),
        outputTupleTagsToReceiverIndices,
        stepContext,
        operationContext,
        doFnInfo.getDoFnSchemaInformation(),
        doFnInfo.getSideInputMapping(),
        runnerFactory);
  }
}
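The fnCache.get(key, loader) call at the top of create follows the Guava Cache.get(K, Callable) idiom: the DoFnInstanceManager is built at most once per step systemName and reused across work items, so the user DoFn is deserialized only once per step. A minimal sketch of that idiom, assuming fnCache is a Guava Cache; the cache configuration shown is illustrative, not the worker's actual setup:

import com.google.common.cache.Cache;
import com.google.common.cache.CacheBuilder;

// Illustrative construction; the worker's real fnCache setup is not shown in this snippet.
Cache<String, DoFnInstanceManager> fnCache = CacheBuilder.newBuilder().weakValues().build();

// The first call for a given systemName runs the loader; later calls return the cached
// instance. Cache.get(K, Callable) declares ExecutionException, which the surrounding
// create method's `throws Exception` covers.
DoFnInstanceManager manager =
    fnCache.get(
        operationContext.nameContext().systemName(),
        () -> DoFnInstanceManagers.cloningPool(doFnExtractor.getDoFnInfo(cloudUserFn), options));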
Use of org.apache.beam.sdk.coders.Coder in project beam by apache.
From the class IsmSinkFactory, the method create:
@Override
public Sink<?> create(
    CloudObject spec,
    @Nullable Coder<?> coder,
    @Nullable PipelineOptions options,
    @Nullable DataflowExecutionContext executionContext,
    DataflowOperationContext operationContext)
    throws Exception {
  options = checkArgumentNotNull(options);
  coder = checkArgumentNotNull(coder);

  // The validity of this coder is checked in detail by the typed create, below.
  @SuppressWarnings("unchecked")
  Coder<WindowedValue<IsmRecord<Object>>> typedCoder =
      (Coder<WindowedValue<IsmRecord<Object>>>) coder;

  String filename = getString(spec, WorkerPropertyNames.FILENAME);
  checkArgument(
      typedCoder instanceof WindowedValueCoder,
      "%s only supports using %s but got %s.",
      IsmSink.class,
      WindowedValueCoder.class,
      typedCoder);
  WindowedValueCoder<IsmRecord<Object>> windowedCoder =
      (WindowedValueCoder<IsmRecord<Object>>) typedCoder;
  checkArgument(
      windowedCoder.getValueCoder() instanceof IsmRecordCoder,
      "%s only supports using %s but got %s.",
      IsmSink.class,
      IsmRecordCoder.class,
      windowedCoder.getValueCoder());
  @SuppressWarnings("unchecked")
  IsmRecordCoder<Object> ismCoder = (IsmRecordCoder<Object>) windowedCoder.getValueCoder();

  long bloomFilterSizeLimitBytes =
      Math.max(
          MIN_BLOOM_FILTER_SIZE_BYTES,
          DoubleMath.roundToLong(
              BLOOM_FILTER_SIZE_LIMIT_MULTIPLIER
                  * options.as(DataflowWorkerHarnessOptions.class).getWorkerCacheMb()
                  // Note the conversion from MiB to bytes.
                  * 1024
                  * 1024,
              RoundingMode.DOWN));
  return new IsmSink<>(
      FileSystems.matchNewResource(filename, false), ismCoder, bloomFilterSizeLimitBytes);
}
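The sizing expression packs the MiB-to-bytes conversion, the multiplier, and the floor into one statement. A standalone sketch of the same arithmetic follows; the two constant values below are placeholders, not the real values, which are private to IsmSinkFactory:

import java.math.RoundingMode;
import com.google.common.math.DoubleMath;

// Placeholder values; the real constants are defined in IsmSinkFactory and may differ.
static final long MIN_BLOOM_FILTER_SIZE_BYTES = 128;
static final double BLOOM_FILTER_SIZE_LIMIT_MULTIPLIER = 0.6;

static long bloomFilterSizeLimitBytes(int workerCacheMb) {
  // Scale the worker cache (converted from MiB to bytes), round down to a whole
  // number of bytes, and never drop below the configured minimum.
  return Math.max(
      MIN_BLOOM_FILTER_SIZE_BYTES,
      DoubleMath.roundToLong(
          BLOOM_FILTER_SIZE_LIMIT_MULTIPLIER * workerCacheMb * 1024 * 1024,
          RoundingMode.DOWN));
}

With the placeholder multiplier 0.6 and a 100 MiB worker cache, this would yield 62,914,560 bytes.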
Use of org.apache.beam.sdk.coders.Coder in project beam by apache.
From the class FlinkStreamingPortablePipelineTranslator, the method translateFlatten:
private <T> void translateFlatten(
    String id, RunnerApi.Pipeline pipeline, StreamingTranslationContext context) {
  RunnerApi.PTransform transform = pipeline.getComponents().getTransformsOrThrow(id);
  Map<String, String> allInputs = transform.getInputsMap();

  if (allInputs.isEmpty()) {
    // Create an empty dummy source to satisfy downstream operations. We cannot
    // create an empty source in Flink, therefore we have to add a flatMap that
    // simply never forwards the single element.
    long shutdownAfterIdleSourcesMs =
        context.getPipelineOptions().getShutdownSourcesAfterIdleMs();
    DataStreamSource<WindowedValue<byte[]>> dummySource =
        context
            .getExecutionEnvironment()
            .addSource(new ImpulseSourceFunction(shutdownAfterIdleSourcesMs));
    DataStream<WindowedValue<T>> result =
        dummySource
            .<WindowedValue<T>>flatMap(
                (s, collector) -> {
                  // Never forward anything.
                })
            .returns(
                new CoderTypeInformation<>(
                    WindowedValue.getFullCoder(
                        (Coder<T>) VoidCoder.of(), GlobalWindow.Coder.INSTANCE),
                    context.getPipelineOptions()));
    context.addDataStream(Iterables.getOnlyElement(transform.getOutputsMap().values()), result);
  } else {
    DataStream<T> result = null;
    // Determine DataStreams that we use as input several times. For those, we need
    // to uniquify the input streams, because Flink seems to swallow watermarks when
    // we union one and the same stream with itself.
    HashMultiset<DataStream<T>> inputCounts = HashMultiset.create();
    for (String input : allInputs.values()) {
      DataStream<T> current = context.getDataStreamOrThrow(input);
      inputCounts.add(current, 1);
    }
    for (String input : allInputs.values()) {
      DataStream<T> current = context.getDataStreamOrThrow(input);
      final int timesRequired = inputCounts.count(current);
      if (timesRequired > 1) {
        // Insert an identity flatMap so this copy of the stream is a distinct operator.
        current =
            current.flatMap(
                new FlatMapFunction<T, T>() {
                  private static final long serialVersionUID = 1L;

                  @Override
                  public void flatMap(T t, Collector<T> collector) {
                    collector.collect(t);
                  }
                });
      }
      result = (result == null) ? current : result.union(current);
    }
    context.addDataStream(Iterables.getOnlyElement(transform.getOutputsMap().values()), result);
  }
}
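The uniquification step exists because, as the comment notes, unioning a Flink stream with itself can stall watermark propagation. The identity flatMap gives each extra use of the stream its own operator, so every union input is distinct. A minimal standalone sketch of the same idiom, assuming a DataStream<String> named events is in scope (illustrative, not taken from the translator):

import org.apache.flink.api.common.functions.FlatMapFunction;
import org.apache.flink.streaming.api.datastream.DataStream;
import org.apache.flink.util.Collector;

DataStream<String> distinctCopy =
    events.flatMap(
        new FlatMapFunction<String, String>() {
          private static final long serialVersionUID = 1L;

          @Override
          public void flatMap(String value, Collector<String> out) {
            out.collect(value); // identity: forward every element unchanged
          }
        });

// The union now has two distinct operator inputs instead of the same stream twice,
// so Flink advances watermarks normally.
DataStream<String> doubled = events.union(distinctCopy);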