use of org.apache.beam.sdk.schemas.ProjectionProducer in project beam by apache.
the class ProjectionProducerVisitorTest method testPushdownProducersWithMultipleOutputs_returnsMultiplePushdowns.
/**
 * Applies a {@code MultipleOutputSourceWithPushdown}, registers a partial field access for its
 * "output1" and "output2" outputs, and checks that the visitor reports exactly one pushdown
 * opportunity (keyed by the source) that carries both outputs with their respective field names.
 */
@Test
public void testPushdownProducersWithMultipleOutputs_returnsMultiplePushdowns() {
  Pipeline pipeline = Pipeline.create();
  PTransform<PBegin, PCollectionTuple> source = new MultipleOutputSourceWithPushdown();
  PCollectionTuple outputs = pipeline.apply(source);
  PCollection<?> firstOutput = outputs.get("output1");
  PCollection<?> secondOutput = outputs.get("output2");

  // Each output only has two named fields accessed downstream.
  Map<PCollection<?>, FieldAccessDescriptor> accessByOutput =
      ImmutableMap.of(
          firstOutput, FieldAccessDescriptor.withFieldNames("field1", "field2"),
          secondOutput, FieldAccessDescriptor.withFieldNames("field3", "field4"));

  ProjectionProducerVisitor visitor = new ProjectionProducerVisitor(accessByOutput);
  pipeline.traverseTopologically(visitor);
  Map<ProjectionProducer<PTransform<?, ?>>, Map<PCollection<?>, FieldAccessDescriptor>>
      opportunities = visitor.getPushdownOpportunities();

  // One producer, two outputs eligible for pushdown.
  Assert.assertEquals(1, opportunities.size());
  Map<PCollection<?>, FieldAccessDescriptor> sourceOpportunities = opportunities.get(source);
  Assert.assertNotNull(sourceOpportunities);
  Assert.assertEquals(2, sourceOpportunities.size());

  FieldAccessDescriptor firstAccess = sourceOpportunities.get(firstOutput);
  Assert.assertNotNull(firstAccess);
  Assert.assertFalse(firstAccess.getAllFields());
  assertThat(firstAccess.fieldNamesAccessed(), containsInAnyOrder("field1", "field2"));

  FieldAccessDescriptor secondAccess = sourceOpportunities.get(secondOutput);
  Assert.assertNotNull(secondAccess);
  Assert.assertFalse(secondAccess.getAllFields());
  assertThat(secondAccess.fieldNamesAccessed(), containsInAnyOrder("field3", "field4"));
}
use of org.apache.beam.sdk.schemas.ProjectionProducer in project beam by apache.
the class PCollectionOutputTagVisitor method visitValue.
/**
 * For every projection producer whose field-access map contains {@code value}, looks up the
 * {@link TupleTag} under which {@code producer} outputs {@code value} and records the
 * (tag, field access) pair for that projection producer.
 *
 * @param value the value being visited
 * @param producer the node whose outputs are searched for {@code value}
 */
@Override
public void visitValue(PValue value, Node producer) {
  for (Entry<ProjectionProducer<PTransform<?, ?>>, Map<PCollection<?>, FieldAccessDescriptor>>
      candidate : pCollFieldAccess.entrySet()) {
    FieldAccessDescriptor requestedFields = candidate.getValue().get(value);
    if (requestedFields != null) {
      // Invert the producer's tag -> PCollection mapping so the tag can be found by value.
      BiMap<PCollection<?>, TupleTag<?>> tagsByOutput =
          ImmutableBiMap.copyOf(producer.getOutputs()).inverse();
      TupleTag<?> outputTag = tagsByOutput.get(value);
      Preconditions.checkArgumentNotNull(
          outputTag, "PCollection %s not found in outputs of producer %s", value, producer);

      // NOTE(review): build() is invoked on every match just to look up an existing nested
      // builder; tagFieldAccess is presumably a builder of builders -- confirm against the
      // field declaration before changing this.
      ImmutableMap.Builder<TupleTag<?>, FieldAccessDescriptor> perProducerBuilder =
          tagFieldAccess.build().get(candidate.getKey());
      if (perProducerBuilder == null) {
        perProducerBuilder = ImmutableMap.builder();
        tagFieldAccess.put(candidate.getKey(), perProducerBuilder);
      }
      perProducerBuilder.put(outputTag, requestedFields);
    }
  }
}
use of org.apache.beam.sdk.schemas.ProjectionProducer in project beam by apache.
the class ProjectionPushdownOptimizer method optimize.
/**
 * Applies every known projection pushdown optimization to a Pipeline, modifying it in place.
 *
 * <p>An optimization is possible wherever a {@link ProjectionProducer} produces a {@link
 * PCollection} that is consumed by one or more PTransforms with an annotated {@link
 * FieldAccessDescriptor}, and the number of fields consumed is less than the number of fields
 * produced. In that case the {@link ProjectionProducer} is replaced with the result of calling
 * {@link ProjectionProducer#actuateProjectionPushdown(Map)} on that producer with those
 * PCollections/fields.
 *
 * <p>Currently only supports pushdown on {@link ProjectionProducer} instances that are applied
 * directly to {@link PBegin} (https://issues.apache.org/jira/browse/BEAM-13658).
 *
 * @param pipeline the pipeline to optimize; it is traversed three times and then has matching
 *     transforms replaced
 */
public static void optimize(Pipeline pipeline) {
  // Step 1: compute which Schema fields are (or conversely, are not) accessed in the pipeline.
  FieldAccessVisitor accessVisitor = new FieldAccessVisitor();
  pipeline.traverseTopologically(accessVisitor);

  // Step 2: find transforms in this pipeline which both support projection pushdown and output
  // unused fields.
  ProjectionProducerVisitor producerVisitor =
      new ProjectionProducerVisitor(accessVisitor.getPCollectionFieldAccess());
  pipeline.traverseTopologically(producerVisitor);
  Map<ProjectionProducer<PTransform<?, ?>>, Map<PCollection<?>, FieldAccessDescriptor>>
      opportunitiesByProducer = producerVisitor.getPushdownOpportunities();

  // Step 3: translate target PCollections to their output TupleTags.
  PCollectionOutputTagVisitor tagVisitor =
      new PCollectionOutputTagVisitor(opportunitiesByProducer);
  pipeline.traverseTopologically(tagVisitor);
  Map<ProjectionProducer<PTransform<?, ?>>, Map<TupleTag<?>, FieldAccessDescriptor>>
      fieldsByProducer = tagVisitor.getTaggedFieldAccess();

  // Step 4: for each eligible transform, replace it with a modified transform that omits the
  // unused fields.
  for (Entry<ProjectionProducer<PTransform<?, ?>>, Map<TupleTag<?>, FieldAccessDescriptor>>
      producerEntry : fieldsByProducer.entrySet()) {
    ProjectionProducer<PTransform<?, ?>> targetProducer = producerEntry.getKey();
    Map<TupleTag<?>, FieldAccessDescriptor> fieldsByTag = producerEntry.getValue();

    for (Entry<TupleTag<?>, FieldAccessDescriptor> tagEntry : fieldsByTag.entrySet()) {
      LOG.info(
          "Optimizing transform {}: output {} will contain reduced field set {}",
          targetProducer,
          tagEntry.getKey(),
          tagEntry.getValue().fieldNamesAccessed());
    }

    // Match by reference so that only this exact transform instance is replaced.
    PTransformMatcher sameInstance =
        application -> application.getTransform() == targetProducer;
    PushdownOverrideFactory<?, ?> replacementFactory =
        new PushdownOverrideFactory<>(fieldsByTag);
    PTransformOverride override = PTransformOverride.of(sameInstance, replacementFactory);
    pipeline.replaceAll(ImmutableList.of(override));
  }
}
use of org.apache.beam.sdk.schemas.ProjectionProducer in project beam by apache.
the class ProjectionProducerVisitor method enterCompositeTransform.
/**
 * Decides whether traversal should descend into {@code node}, and records a pushdown
 * opportunity when the node's transform is a {@link ProjectionProducer} that supports
 * projection pushdown and has at least one output whose recorded field access does not cover
 * all fields.
 *
 * @param node the composite node being entered
 * @return {@code DO_NOT_ENTER_TRANSFORM} if the node has any inputs or an opportunity was
 *     recorded for it; {@code ENTER_TRANSFORM} in every other case
 */
@Override
public CompositeBehavior enterCompositeTransform(Node node) {
  PTransform<?, ?> candidate = node.getTransform();

  // TODO(BEAM-13658) Support inputs other than PBegin.
  boolean hasInputs = !node.getInputs().isEmpty();
  if (hasInputs) {
    return CompositeBehavior.DO_NOT_ENTER_TRANSFORM;
  }

  boolean isProducer = candidate instanceof ProjectionProducer;
  if (!isProducer) {
    return CompositeBehavior.ENTER_TRANSFORM;
  }

  ProjectionProducer<PTransform<?, ?>> projectionProducer =
      (ProjectionProducer<PTransform<?, ?>>) candidate;
  if (!projectionProducer.supportsProjectionPushdown()) {
    return CompositeBehavior.ENTER_TRANSFORM;
  }

  // Keep only the outputs with known field access that leaves at least one field unused.
  ImmutableMap.Builder<PCollection<?>, FieldAccessDescriptor> eligibleOutputs =
      ImmutableMap.builder();
  for (PCollection<?> producedCollection : node.getOutputs().values()) {
    FieldAccessDescriptor access = pCollectionFieldAccess.get(producedCollection);
    if (access == null || access.getAllFields()) {
      continue;
    }
    eligibleOutputs.put(producedCollection, access);
  }

  Map<PCollection<?>, FieldAccessDescriptor> opportunitiesHere = eligibleOutputs.build();
  if (opportunitiesHere.isEmpty()) {
    return CompositeBehavior.ENTER_TRANSFORM;
  }

  pushdownOpportunities.put(projectionProducer, opportunitiesHere);
  // If there are nested PushdownProjector implementations, apply only the outermost one.
  return CompositeBehavior.DO_NOT_ENTER_TRANSFORM;
}
use of org.apache.beam.sdk.schemas.ProjectionProducer in project beam by apache.
the class ProjectionProducerVisitorTest method testMissingFieldAccessInformation_returnsNoPushdown.
/**
 * Applies a {@code SimpleSource}, supplies the visitor with an empty field-access map, and
 * checks that no pushdown opportunities are reported.
 */
@Test
public void testMissingFieldAccessInformation_returnsNoPushdown() {
  Pipeline pipeline = Pipeline.create();
  pipeline.apply(new SimpleSource());

  // No field access information is available for any PCollection.
  Map<PCollection<?>, FieldAccessDescriptor> noFieldAccess = ImmutableMap.of();
  ProjectionProducerVisitor visitor = new ProjectionProducerVisitor(noFieldAccess);
  pipeline.traverseTopologically(visitor);

  Map<ProjectionProducer<PTransform<?, ?>>, Map<PCollection<?>, FieldAccessDescriptor>>
      opportunities = visitor.getPushdownOpportunities();
  Assert.assertTrue(opportunities.isEmpty());
}
Aggregations