use of org.apache.beam.sdk.values.PCollection in project beam by apache.
In the class ProjectionProducerVisitorTest, the method testPushdownProducersWithMultipleOutputs_returnsMultiplePushdowns.
@Test
public void testPushdownProducersWithMultipleOutputs_returnsMultiplePushdowns() {
  Pipeline p = Pipeline.create();
  PTransform<PBegin, PCollectionTuple> source = new MultipleOutputSourceWithPushdown();
  PCollectionTuple outputs = p.apply(source);
  Map<PCollection<?>, FieldAccessDescriptor> pCollectionFieldAccess =
      ImmutableMap.of(
          outputs.get("output1"), FieldAccessDescriptor.withFieldNames("field1", "field2"),
          outputs.get("output2"), FieldAccessDescriptor.withFieldNames("field3", "field4"));
  ProjectionProducerVisitor visitor = new ProjectionProducerVisitor(pCollectionFieldAccess);
  p.traverseTopologically(visitor);
  Map<ProjectionProducer<PTransform<?, ?>>, Map<PCollection<?>, FieldAccessDescriptor>>
      pushdownOpportunities = visitor.getPushdownOpportunities();
  Assert.assertEquals(1, pushdownOpportunities.size());
  Map<PCollection<?>, FieldAccessDescriptor> opportunitiesForSource =
      pushdownOpportunities.get(source);
  Assert.assertNotNull(opportunitiesForSource);
  Assert.assertEquals(2, opportunitiesForSource.size());
  FieldAccessDescriptor fieldAccessDescriptor1 =
      opportunitiesForSource.get(outputs.get("output1"));
  Assert.assertNotNull(fieldAccessDescriptor1);
  Assert.assertFalse(fieldAccessDescriptor1.getAllFields());
  assertThat(fieldAccessDescriptor1.fieldNamesAccessed(), containsInAnyOrder("field1", "field2"));
  FieldAccessDescriptor fieldAccessDescriptor2 =
      opportunitiesForSource.get(outputs.get("output2"));
  Assert.assertNotNull(fieldAccessDescriptor2);
  Assert.assertFalse(fieldAccessDescriptor2.getAllFields());
  assertThat(fieldAccessDescriptor2.fieldNamesAccessed(), containsInAnyOrder("field3", "field4"));
}
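The visitor only reports pushdown opportunities for transforms that implement ProjectionProducer and advertise support through supportsProjectionPushdown(). The MultipleOutputSourceWithPushdown class used by the test is not reproduced on this page; the following is a minimal sketch of what such a multi-output source can look like, assuming the ProjectionProducer contract (supportsProjectionPushdown / actuateProjectionPushdown) and inventing the schema and the trivial expansion purely for illustration.

// Sketch only: the real MultipleOutputSourceWithPushdown from the test is not shown here.
class MultipleOutputSourceWithPushdown extends PTransform<PBegin, PCollectionTuple>
    implements ProjectionProducer<PTransform<?, ?>> {

  private static final Schema SCHEMA =
      Schema.builder()
          .addStringField("field1")
          .addStringField("field2")
          .addStringField("field3")
          .addStringField("field4")
          .build();

  @Override
  public boolean supportsProjectionPushdown() {
    // Advertise that this source can drop unused fields.
    return true;
  }

  @Override
  public PTransform<?, ?> actuateProjectionPushdown(
      Map<PCollection<?>, FieldAccessDescriptor> outputFields) {
    // A real source would return a replacement transform that reads only the
    // fields requested for each of its outputs; returning "this" is a stub.
    return this;
  }

  @Override
  public PCollectionTuple expand(PBegin input) {
    // Produce two tagged outputs; the visitor keys its pushdown map by these.
    PCollection<Row> output1 = input.apply("source1", Create.empty(RowCoder.of(SCHEMA)));
    PCollection<Row> output2 = input.apply("source2", Create.empty(RowCoder.of(SCHEMA)));
    return PCollectionTuple.of(new TupleTag<>("output1"), output1)
        .and(new TupleTag<>("output2"), output2);
  }
}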
use of org.apache.beam.sdk.values.PCollection in project beam by apache.
In the class RemoteExecutionTest, the method testExecutionWithMultipleStages.
@Test
public void testExecutionWithMultipleStages() throws Exception {
  launchSdkHarness(PipelineOptionsFactory.create());
  Pipeline p = Pipeline.create();
  Function<String, PCollection<String>> pCollectionGenerator =
      suffix ->
          p.apply("impulse" + suffix, Impulse.create())
              .apply(
                  "create" + suffix,
                  ParDo.of(
                      new DoFn<byte[], String>() {
                        @ProcessElement
                        public void process(ProcessContext c) {
                          try {
                            c.output(
                                CoderUtils.decodeFromByteArray(StringUtf8Coder.of(), c.element()));
                          } catch (CoderException e) {
                            throw new RuntimeException(e);
                          }
                        }
                      }))
              .setCoder(StringUtf8Coder.of())
              .apply(
                  ParDo.of(
                      new DoFn<String, String>() {
                        @ProcessElement
                        public void processElement(ProcessContext c) {
                          c.output("stream" + suffix + c.element());
                        }
                      }));
  PCollection<String> input1 = pCollectionGenerator.apply("1");
  PCollection<String> input2 = pCollectionGenerator.apply("2");
  PCollection<String> outputMerged =
      PCollectionList.of(input1).and(input2).apply(Flatten.pCollections());
  outputMerged
      .apply(
          "createKV",
          ParDo.of(
              new DoFn<String, KV<String, String>>() {
                @ProcessElement
                public void process(ProcessContext c) {
                  c.output(KV.of(c.element(), ""));
                }
              }))
      .setCoder(KvCoder.of(StringUtf8Coder.of(), StringUtf8Coder.of()))
      .apply("gbk", GroupByKey.create());
  RunnerApi.Pipeline pipelineProto = PipelineTranslation.toProto(p);
  FusedPipeline fused = GreedyPipelineFuser.fuse(pipelineProto);
  Set<ExecutableStage> stages = fused.getFusedStages();
  assertThat(stages.size(), equalTo(2));
  List<WindowedValue<?>> outputValues = Collections.synchronizedList(new ArrayList<>());
  for (ExecutableStage stage : stages) {
    ExecutableProcessBundleDescriptor descriptor =
        ProcessBundleDescriptors.fromExecutableStage(
            stage.toString(),
            stage,
            dataServer.getApiServiceDescriptor(),
            stateServer.getApiServiceDescriptor());
    BundleProcessor processor =
        controlClient.getProcessor(
            descriptor.getProcessBundleDescriptor(),
            descriptor.getRemoteInputDestinations(),
            stateDelegator);
    Map<String, Coder> remoteOutputCoders = descriptor.getRemoteOutputCoders();
    Map<String, RemoteOutputReceiver<?>> outputReceivers = new HashMap<>();
    for (Entry<String, Coder> remoteOutputCoder : remoteOutputCoders.entrySet()) {
      outputReceivers.putIfAbsent(
          remoteOutputCoder.getKey(),
          RemoteOutputReceiver.of(
              (Coder<WindowedValue<?>>) remoteOutputCoder.getValue(), outputValues::add));
    }
    try (RemoteBundle bundle =
        processor.newBundle(
            outputReceivers, StateRequestHandler.unsupported(), BundleProgressHandler.ignored())) {
      Iterables.getOnlyElement(bundle.getInputReceivers().values())
          .accept(valueInGlobalWindow(CoderUtils.encodeToByteArray(StringUtf8Coder.of(), "X")));
    }
  }
  assertThat(
      outputValues,
      containsInAnyOrder(
          valueInGlobalWindow(KV.of("stream1X", "")), valueInGlobalWindow(KV.of("stream2X", ""))));
}
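The expected keys follow from the element injected into each bundle rather than from the Impulse output: every stage is fed one synthetic element, the UTF-8 encoding of "X", which the first DoFn decodes back to "X" and the second prefixes with "stream" + suffix. A minimal round-trip sketch of that encoding step (CoderException is checked, hence the throws clause):

static void roundTrip() throws CoderException {
  // Encode "X" the same way the test does before handing it to the bundle's input receiver...
  byte[] encoded = CoderUtils.encodeToByteArray(StringUtf8Coder.of(), "X");
  // ...and decode it the way the "create<suffix>" DoFn does inside the stage.
  String decoded = CoderUtils.decodeFromByteArray(StringUtf8Coder.of(), encoded);
  // decoded is "X", so the downstream DoFn emits "stream1X" / "stream2X".
}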
use of org.apache.beam.sdk.values.PCollection in project beam by apache.
In the class StreamingTransformTranslator, the method flattenPColl.
private static <T> TransformEvaluator<Flatten.PCollections<T>> flattenPColl() {
  return new TransformEvaluator<Flatten.PCollections<T>>() {
    @SuppressWarnings("unchecked")
    @Override
    public void evaluate(Flatten.PCollections<T> transform, EvaluationContext context) {
      Map<TupleTag<?>, PCollection<?>> pcs = context.getInputs(transform);
      // Since this is a streaming pipeline, at least one of the PCollections to "flatten" is
      // unbounded, meaning it represents a DStream.
      // So we could end up with an unbounded unified DStream.
      final List<JavaDStream<WindowedValue<T>>> dStreams = new ArrayList<>();
      final List<Integer> streamingSources = new ArrayList<>();
      for (PValue pv : pcs.values()) {
        checkArgument(
            pv instanceof PCollection,
            "Flatten had non-PCollection value in input: %s of type %s",
            pv,
            pv.getClass().getSimpleName());
        PCollection<T> pcol = (PCollection<T>) pv;
        Dataset dataset = context.borrowDataset(pcol);
        if (dataset instanceof UnboundedDataset) {
          UnboundedDataset<T> unboundedDataset = (UnboundedDataset<T>) dataset;
          streamingSources.addAll(unboundedDataset.getStreamSources());
          dStreams.add(unboundedDataset.getDStream());
        } else {
          // Create a single-RDD stream.
          Queue<JavaRDD<WindowedValue<T>>> q = new LinkedBlockingQueue<>();
          q.offer(((BoundedDataset) dataset).getRDD());
          // TODO (BEAM-10789): this is not recoverable from checkpoint!
          JavaDStream<WindowedValue<T>> dStream = context.getStreamingContext().queueStream(q);
          dStreams.add(dStream);
        }
      }
      // Start by unifying streams into a single stream.
      JavaDStream<WindowedValue<T>> unifiedStreams =
          SparkCompat.joinStreams(context.getStreamingContext(), dStreams);
      context.putDataset(transform, new UnboundedDataset<>(unifiedStreams, streamingSources));
    }

    @Override
    public String toNativeString() {
      return "streamingContext.union(...)";
    }
  };
}
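For context, a hypothetical user pipeline that reaches this evaluator mixes bounded and unbounded inputs in a streaming job: the unbounded branch arrives as an UnboundedDataset (a DStream), while the bounded branch is wrapped into a single-RDD queue stream by the else branch above before the union. The sources and element types below are invented for illustration, and `p` is assumed to be an existing streaming Pipeline.

// Unbounded branch: an endless sequence mapped to strings.
PCollection<String> unbounded =
    p.apply(GenerateSequence.from(0))
        .apply(MapElements.into(TypeDescriptors.strings()).via((Long x) -> Long.toString(x)));
// Bounded branch: a small in-memory collection.
PCollection<String> bounded = p.apply(Create.of("a", "b", "c"));
// Flattening them is what the evaluator above translates into a unified DStream.
PCollection<String> merged =
    PCollectionList.of(unbounded).and(bounded).apply(Flatten.pCollections());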
use of org.apache.beam.sdk.values.PCollection in project beam by apache.
In the class StreamingTransformTranslator, the method parDo.
private static <InputT, OutputT> TransformEvaluator<ParDo.MultiOutput<InputT, OutputT>> parDo() {
  return new TransformEvaluator<ParDo.MultiOutput<InputT, OutputT>>() {
    @Override
    public void evaluate(
        final ParDo.MultiOutput<InputT, OutputT> transform, final EvaluationContext context) {
      final DoFn<InputT, OutputT> doFn = transform.getFn();
      checkArgument(
          !DoFnSignatures.signatureForDoFn(doFn).processElement().isSplittable(),
          "Splittable DoFn not yet supported in streaming mode: %s",
          doFn);
      rejectStateAndTimers(doFn);
      final SerializablePipelineOptions options = context.getSerializableOptions();
      final SparkPCollectionView pviews = context.getPViews();
      final WindowingStrategy<?, ?> windowingStrategy =
          context.getInput(transform).getWindowingStrategy();
      Coder<InputT> inputCoder = (Coder<InputT>) context.getInput(transform).getCoder();
      Map<TupleTag<?>, Coder<?>> outputCoders = context.getOutputCoders();
      @SuppressWarnings("unchecked")
      UnboundedDataset<InputT> unboundedDataset =
          (UnboundedDataset<InputT>) context.borrowDataset(transform);
      JavaDStream<WindowedValue<InputT>> dStream = unboundedDataset.getDStream();
      final DoFnSchemaInformation doFnSchemaInformation =
          ParDoTranslation.getSchemaInformation(context.getCurrentTransform());
      final Map<String, PCollectionView<?>> sideInputMapping =
          ParDoTranslation.getSideInputMapping(context.getCurrentTransform());
      final String stepName = context.getCurrentTransform().getFullName();
      JavaPairDStream<TupleTag<?>, WindowedValue<?>> all =
          dStream.transformToPair(
              rdd -> {
                final MetricsContainerStepMapAccumulator metricsAccum =
                    MetricsAccumulator.getInstance();
                final Map<TupleTag<?>, KV<WindowingStrategy<?, ?>, SideInputBroadcast<?>>>
                    sideInputs =
                        TranslationUtils.getSideInputs(
                            transform.getSideInputs().values(),
                            JavaSparkContext.fromSparkContext(rdd.context()),
                            pviews);
                return rdd.mapPartitionsToPair(
                    new MultiDoFnFunction<>(
                        metricsAccum,
                        stepName,
                        doFn,
                        options,
                        transform.getMainOutputTag(),
                        transform.getAdditionalOutputTags().getAll(),
                        inputCoder,
                        outputCoders,
                        sideInputs,
                        windowingStrategy,
                        false,
                        doFnSchemaInformation,
                        sideInputMapping));
              });
      Map<TupleTag<?>, PCollection<?>> outputs = context.getOutputs(transform);
      if (outputs.size() > 1) {
        // Caching can trigger serialization, so we need to encode the values to bytes first;
        // more details in https://issues.apache.org/jira/browse/BEAM-2669
        Map<TupleTag<?>, Coder<WindowedValue<?>>> coderMap =
            TranslationUtils.getTupleTagCoders(outputs);
        all =
            all.mapToPair(TranslationUtils.getTupleTagEncodeFunction(coderMap))
                .cache()
                .mapToPair(TranslationUtils.getTupleTagDecodeFunction(coderMap));
      }
      for (Map.Entry<TupleTag<?>, PCollection<?>> output : outputs.entrySet()) {
        @SuppressWarnings("unchecked")
        JavaPairDStream<TupleTag<?>, WindowedValue<?>> filtered =
            all.filter(new TranslationUtils.TupleTagFilter(output.getKey()));
        @SuppressWarnings("unchecked")
        // Object is the best we can do since different outputs can have different tags
        JavaDStream<WindowedValue<Object>> values =
            (JavaDStream<WindowedValue<Object>>)
                (JavaDStream<?>) TranslationUtils.dStreamValues(filtered);
        context.putDataset(
            output.getValue(),
            new UnboundedDataset<>(values, unboundedDataset.getStreamSources()));
      }
    }

    @Override
    public String toNativeString() {
      return "mapPartitions(new <fn>())";
    }
  };
}
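The translator above handles the user-facing multi-output ParDo pattern: a main TupleTag plus additional tags declared with withOutputTags, which is what produces the (tag, value) pairs that are later filtered per output. A sketch of that user-side shape, with invented tag names and routing logic; `words` is assumed to be an existing PCollection<String>.

final TupleTag<String> shortWords = new TupleTag<String>() {};
final TupleTag<String> longWords = new TupleTag<String>() {};

PCollectionTuple results =
    words.apply(
        ParDo.of(
                new DoFn<String, String>() {
                  @ProcessElement
                  public void process(ProcessContext c) {
                    if (c.element().length() <= 4) {
                      c.output(c.element()); // main output tag
                    } else {
                      c.output(longWords, c.element()); // additional tagged output
                    }
                  }
                })
            .withOutputTags(shortWords, TupleTagList.of(longWords)));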
use of org.apache.beam.sdk.values.PCollection in project beam by apache.
In the class SqlTransform, the method toTableMap.
@SuppressWarnings("unchecked")
private Map<String, BeamSqlTable> toTableMap(PInput inputs) {
  /**
   * A single PCollection is transformed to a table named PCOLLECTION; other input types are
   * expanded and converted to tables using the tags as names.
   */
  if (inputs instanceof PCollection) {
    PCollection<?> pCollection = (PCollection<?>) inputs;
    return ImmutableMap.of(PCOLLECTION_NAME, new BeamPCollectionTable(pCollection));
  }
  ImmutableMap.Builder<String, BeamSqlTable> tables = ImmutableMap.builder();
  for (Map.Entry<TupleTag<?>, PValue> input : inputs.expand().entrySet()) {
    PCollection<?> pCollection = (PCollection<?>) input.getValue();
    tables.put(input.getKey().getId(), new BeamPCollectionTable(pCollection));
  }
  return tables.build();
}
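A usage sketch of the two naming behaviors described in the comment: a lone PCollection is queried under the fixed name PCOLLECTION, while a PCollectionTuple exposes each tagged PCollection as a table named after its tag. The row schemas, tag names, and SQL below are invented for illustration; `rows`, `users`, and `orders` are assumed to be existing schema-aware PCollection<Row> inputs.

// 1) A single PCollection is registered under the fixed table name PCOLLECTION.
PCollection<Row> filtered =
    rows.apply(SqlTransform.query("SELECT name FROM PCOLLECTION WHERE age > 21"));

// 2) A PCollectionTuple registers one table per tag, named after the tag id.
PCollectionTuple tables =
    PCollectionTuple.of(new TupleTag<>("users"), users)
        .and(new TupleTag<>("orders"), orders);
PCollection<Row> joined =
    tables.apply(
        SqlTransform.query(
            "SELECT u.name, o.amount FROM users u JOIN orders o ON u.id = o.user_id"));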