use of org.apache.beam.sdk.values.PCollection in project beam by apache.
The class PCollectionOutputTagVisitor, method visitValue.
@Override
public void visitValue(PValue value, Node producer) {
  for (Entry<ProjectionProducer<PTransform<?, ?>>, Map<PCollection<?>, FieldAccessDescriptor>>
      entry : pCollFieldAccess.entrySet()) {
    FieldAccessDescriptor fieldAccess = entry.getValue().get(value);
    if (fieldAccess == null) {
      continue;
    }
    BiMap<PCollection<?>, TupleTag<?>> outputs =
        ImmutableBiMap.copyOf(producer.getOutputs()).inverse();
    TupleTag<?> tag = outputs.get(value);
    Preconditions.checkArgumentNotNull(
        tag, "PCollection %s not found in outputs of producer %s", value, producer);
    ImmutableMap.Builder<TupleTag<?>, FieldAccessDescriptor> tagEntryBuilder =
        tagFieldAccess.build().get(entry.getKey());
    if (tagEntryBuilder == null) {
      tagEntryBuilder = ImmutableMap.builder();
      tagFieldAccess.put(entry.getKey(), tagEntryBuilder);
    }
    tagEntryBuilder.put(tag, fieldAccess);
  }
}
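The tag lookup above works by inverting the producer node's output map so the PCollection itself becomes the key. A minimal sketch of that idiom, using plain Guava and string stand-ins for the tags and collections (Beam's own code uses a vendored Guava package; all names here are illustrative):
import com.google.common.collect.BiMap;
import com.google.common.collect.ImmutableBiMap;
import java.util.Map;

public class BiMapInversionSketch {
  public static void main(String[] args) {
    // Producer outputs keyed by tag, as Node#getOutputs is a Map<TupleTag<?>, PCollection<?>>.
    Map<String, String> outputsByTag = Map.of("mainTag", "wordsPColl", "sideTag", "countsPColl");
    // Invert the map so an output value can be used to look up the tag that produced it.
    BiMap<String, String> tagsByOutput = ImmutableBiMap.copyOf(outputsByTag).inverse();
    System.out.println(tagsByOutput.get("wordsPColl")); // prints "mainTag"
  }
}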
use of org.apache.beam.sdk.values.PCollection in project beam by apache.
The class ProjectionPushdownOptimizer, method optimize.
/**
* Performs all known projection pushdown optimizations in-place on a Pipeline.
*
* <p>A pushdown optimization is possible wherever there is a {@link ProjectionProducer} that
* produces a {@link PCollection} that is consumed by one or more PTransforms with an annotated
* {@link FieldAccessDescriptor}, where the number of fields consumed is less than the number of
* fields produced. The optimizer replaces the {@link ProjectionProducer} with the result of
* calling {@link ProjectionProducer#actuateProjectionPushdown(Map)} on that producer with those
* PCollections/fields.
*
* <p>Currently only supports pushdown on {@link ProjectionProducer} instances that are applied
* directly to {@link PBegin} (https://issues.apache.org/jira/browse/BEAM-13658).
*/
public static void optimize(Pipeline pipeline) {
  // Compute which Schema fields are (or conversely, are not) accessed in a pipeline.
  FieldAccessVisitor fieldAccessVisitor = new FieldAccessVisitor();
  pipeline.traverseTopologically(fieldAccessVisitor);
  // Find transforms in this pipeline which both: 1. support projection pushdown and 2. output
  // unused fields.
  ProjectionProducerVisitor pushdownProjectorVisitor =
      new ProjectionProducerVisitor(fieldAccessVisitor.getPCollectionFieldAccess());
  pipeline.traverseTopologically(pushdownProjectorVisitor);
  Map<ProjectionProducer<PTransform<?, ?>>, Map<PCollection<?>, FieldAccessDescriptor>>
      pushdownOpportunities = pushdownProjectorVisitor.getPushdownOpportunities();
  // Translate target PCollections to their output TupleTags.
  PCollectionOutputTagVisitor outputTagVisitor =
      new PCollectionOutputTagVisitor(pushdownOpportunities);
  pipeline.traverseTopologically(outputTagVisitor);
  Map<ProjectionProducer<PTransform<?, ?>>, Map<TupleTag<?>, FieldAccessDescriptor>>
      taggedFieldAccess = outputTagVisitor.getTaggedFieldAccess();
  // For each eligible transform, replace it with an equivalent transform that produces only the
  // accessed fields.
  for (Entry<ProjectionProducer<PTransform<?, ?>>, Map<TupleTag<?>, FieldAccessDescriptor>>
      entry : taggedFieldAccess.entrySet()) {
    for (Entry<TupleTag<?>, FieldAccessDescriptor> outputFields : entry.getValue().entrySet()) {
      LOG.info(
          "Optimizing transform {}: output {} will contain reduced field set {}",
          entry.getKey(),
          outputFields.getKey(),
          outputFields.getValue().fieldNamesAccessed());
    }
    PTransformMatcher matcher = application -> application.getTransform() == entry.getKey();
    PushdownOverrideFactory<?, ?> overrideFactory =
        new PushdownOverrideFactory<>(entry.getValue());
    pipeline.replaceAll(ImmutableList.of(PTransformOverride.of(matcher, overrideFactory)));
  }
}
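The javadoc above says the optimizer keys off PTransforms with an annotated FieldAccessDescriptor. A minimal sketch of such a consumer, a DoFn that declares it only reads two fields of its input Rows; the class and field names here are illustrative, not taken from the project:
import org.apache.beam.sdk.schemas.FieldAccessDescriptor;
import org.apache.beam.sdk.transforms.DoFn;
import org.apache.beam.sdk.values.Row;

// Declares, via @FieldAccess, that only "userId" and "country" are read from each input Row.
// FieldAccessVisitor gathers descriptors like this one when computing pushdown opportunities.
class SelectUserCountryFn extends DoFn<Row, Row> {
  @FieldAccess("selected")
  final FieldAccessDescriptor fieldAccess =
      FieldAccessDescriptor.withFieldNames("userId", "country");

  @ProcessElement
  public void process(@FieldAccess("selected") Row row, OutputReceiver<Row> out) {
    // Only the declared fields need to be read from the row here.
    out.output(row);
  }
}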
use of org.apache.beam.sdk.values.PCollection in project beam by apache.
The class PTransformMatchersTest, method emptyFlattenWithNonFlatten.
@Test
public void emptyFlattenWithNonFlatten() {
AppliedPTransform application =
    AppliedPTransform
        .<PCollection<Iterable<Integer>>, PCollection<Integer>, Flatten.Iterables<Integer>>of(
            "EmptyFlatten",
            Collections.emptyMap(),
            Collections.singletonMap(
                new TupleTag<Integer>(),
                PCollection.createPrimitiveOutputInternal(
                    p, WindowingStrategy.globalDefault(), IsBounded.BOUNDED, VarIntCoder.of())),
            /* This isn't actually possible to construct, but for the sake of example */
            Flatten.iterables(),
            ResourceHints.create(),
            p);
assertThat(PTransformMatchers.emptyFlatten().matches(application), is(false));
}
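For contrast, a sketch of the matching case: the same construction, but the transform really is a Flatten.PCollections with no inputs, so emptyFlatten() should match. This is illustrative and not the project's own test; the name "ReallyEmptyFlatten" is hypothetical.
// Hypothetical positive case: no inputs and an actual Flatten.PCollections transform.
AppliedPTransform emptyApplication =
    AppliedPTransform
        .<PCollectionList<Integer>, PCollection<Integer>, Flatten.PCollections<Integer>>of(
            "ReallyEmptyFlatten",
            Collections.emptyMap(),
            Collections.singletonMap(
                new TupleTag<Integer>(),
                PCollection.createPrimitiveOutputInternal(
                    p, WindowingStrategy.globalDefault(), IsBounded.BOUNDED, VarIntCoder.of())),
            Flatten.pCollections(),
            ResourceHints.create(),
            p);
assertThat(PTransformMatchers.emptyFlatten().matches(emptyApplication), is(true));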
use of org.apache.beam.sdk.values.PCollection in project beam by apache.
The class PTransformMatchersTest, method flattenWithDuplicateInputsNonFlatten.
@Test
public void flattenWithDuplicateInputsNonFlatten() {
AppliedPTransform application =
    AppliedPTransform
        .<PCollection<Iterable<Integer>>, PCollection<Integer>, Flatten.Iterables<Integer>>of(
            "EmptyFlatten",
            Collections.emptyMap(),
            Collections.singletonMap(
                new TupleTag<Integer>(),
                PCollection.createPrimitiveOutputInternal(
                    p, WindowingStrategy.globalDefault(), IsBounded.BOUNDED, VarIntCoder.of())),
            /* This isn't actually possible to construct, but for the sake of example */
            Flatten.iterables(),
            ResourceHints.create(),
            p);
assertThat(PTransformMatchers.flattenWithDuplicateInputs().matches(application), is(false));
}
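The shape this matcher is actually after is a Flatten whose input list contains the same PCollection more than once. A minimal sketch of how that arises in a pipeline, assuming a pipeline p as in the test class (variable names are illustrative):
// A Flatten.PCollections fed the same PCollection twice; PTransformMatchers
// .flattenWithDuplicateInputs() exists so runners can find and rewrite this shape.
PCollection<Integer> nums = p.apply(Create.of(1, 2, 3));
PCollection<Integer> doubledUp =
    PCollectionList.of(nums).and(nums).apply(Flatten.pCollections());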
use of org.apache.beam.sdk.values.PCollection in project beam by apache.
The class PTransformTranslationTest, method multiMultiParDo.
private static AppliedPTransform<?, ?, ?> multiMultiParDo(Pipeline pipeline) {
  PCollectionView<String> view = pipeline.apply(Create.of("foo")).apply(View.asSingleton());
  PCollection<Long> input = pipeline.apply(GenerateSequence.from(0));
  ParDo.MultiOutput<Long, KV<Long, String>> parDo =
      ParDo.of(new TestDoFn())
          .withSideInputs(view)
          .withOutputTags(
              new TupleTag<KV<Long, String>>() {},
              TupleTagList.of(new TupleTag<KV<String, Long>>() {}));
  PCollectionTuple output = input.apply(parDo);
  Map<TupleTag<?>, PCollection<?>> inputs = new HashMap<>();
  inputs.putAll(PValues.fullyExpand(parDo.getAdditionalInputs()));
  inputs.putAll(PValues.expandInput(input));
  return AppliedPTransform
      .<PCollection<Long>, PCollectionTuple, ParDo.MultiOutput<Long, KV<Long, String>>>of(
          "MultiParDoInAndOut",
          inputs,
          PValues.expandOutput(output),
          parDo,
          ResourceHints.create(),
          pipeline);
}
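TestDoFn is referenced but not shown in this snippet. A hypothetical stand-in consistent with the main output tag above would be a DoFn<Long, KV<Long, String>>; the body below is illustrative only, and handling of the side input and the additional output tag is omitted:
import org.apache.beam.sdk.transforms.DoFn;
import org.apache.beam.sdk.values.KV;

// Hypothetical sketch of a DoFn matching the signature implied by the main output tag above.
class TestDoFnSketch extends DoFn<Long, KV<Long, String>> {
  @ProcessElement
  public void process(@Element Long element, OutputReceiver<KV<Long, String>> out) {
    out.output(KV.of(element, "value"));
  }
}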