
Example 81 with PCollection

use of org.apache.beam.sdk.values.PCollection in project beam by apache.

the class PCollectionOutputTagVisitor method visitValue.

@Override
public void visitValue(PValue value, Node producer) {
    for (Entry<ProjectionProducer<PTransform<?, ?>>, Map<PCollection<?>, FieldAccessDescriptor>> entry : pCollFieldAccess.entrySet()) {
        FieldAccessDescriptor fieldAccess = entry.getValue().get(value);
        if (fieldAccess == null) {
            continue;
        }
        BiMap<PCollection<?>, TupleTag<?>> outputs = ImmutableBiMap.copyOf(producer.getOutputs()).inverse();
        TupleTag<?> tag = outputs.get(value);
        Preconditions.checkArgumentNotNull(tag, "PCollection %s not found in outputs of producer %s", value, producer);
        ImmutableMap.Builder<TupleTag<?>, FieldAccessDescriptor> tagEntryBuilder = tagFieldAccess.build().get(entry.getKey());
        if (tagEntryBuilder == null) {
            tagEntryBuilder = ImmutableMap.builder();
            tagFieldAccess.put(entry.getKey(), tagEntryBuilder);
        }
        tagEntryBuilder.put(tag, fieldAccess);
    }
}
Also used : FieldAccessDescriptor(org.apache.beam.sdk.schemas.FieldAccessDescriptor) PCollection(org.apache.beam.sdk.values.PCollection) ProjectionProducer(org.apache.beam.sdk.schemas.ProjectionProducer) TupleTag(org.apache.beam.sdk.values.TupleTag) ImmutableMap(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableMap) ImmutableBiMap(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableBiMap) Map(java.util.Map) BiMap(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.BiMap)
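
The lookup in visitValue relies on Guava's BiMap: copying the producer's tag-to-PCollection output map into an ImmutableBiMap and inverting it lets the visitor ask which TupleTag produced a given PCollection. Below is a minimal standalone sketch of that idiom, using plain (non-vendored) Guava and placeholder values rather than Beam types; the class name and data are illustrative only.

import com.google.common.collect.BiMap;
import com.google.common.collect.ImmutableBiMap;
import java.util.Map;

public class InverseLookupSketch {
    public static void main(String[] args) {
        // Forward map: tag name -> produced value (stand-ins for TupleTag -> PCollection).
        Map<String, Integer> outputsByTag = Map.of("main", 1, "errors", 2);
        // Invert it so we can ask: which tag produced this value?
        BiMap<Integer, String> tagByOutput = ImmutableBiMap.copyOf(outputsByTag).inverse();
        System.out.println(tagByOutput.get(2)); // prints "errors"
    }
}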

Example 82 with PCollection

use of org.apache.beam.sdk.values.PCollection in project beam by apache.

the class ProjectionPushdownOptimizer method optimize.

/**
 * Performs all known projection pushdown optimizations in-place on a Pipeline.
 *
 * <p>A pushdown optimization is possible wherever there is a {@link ProjectionProducer} that
 * produces a {@link PCollection} that is consumed by one or more PTransforms with an annotated
 * {@link FieldAccessDescriptor}, where the number of fields consumed is less than the number of
 * fields produced. The optimizer replaces the {@link ProjectionProducer} with the result of
 * calling {@link ProjectionProducer#actuateProjectionPushdown(Map)} on that producer with those
 * PCollections/fields.
 *
 * <p>Currently only supports pushdown on {@link ProjectionProducer} instances that are applied
 * directly to {@link PBegin} (https://issues.apache.org/jira/browse/BEAM-13658).
 */
public static void optimize(Pipeline pipeline) {
    // Compute which Schema fields are (or conversely, are not) accessed in a pipeline.
    FieldAccessVisitor fieldAccessVisitor = new FieldAccessVisitor();
    pipeline.traverseTopologically(fieldAccessVisitor);
    // Find transforms in this pipeline which both: 1. support projection pushdown and 2. output
    // unused fields.
    ProjectionProducerVisitor pushdownProjectorVisitor = new ProjectionProducerVisitor(fieldAccessVisitor.getPCollectionFieldAccess());
    pipeline.traverseTopologically(pushdownProjectorVisitor);
    Map<ProjectionProducer<PTransform<?, ?>>, Map<PCollection<?>, FieldAccessDescriptor>> pushdownOpportunities = pushdownProjectorVisitor.getPushdownOpportunities();
    // Translate target PCollections to their output TupleTags.
    PCollectionOutputTagVisitor outputTagVisitor = new PCollectionOutputTagVisitor(pushdownOpportunities);
    pipeline.traverseTopologically(outputTagVisitor);
    Map<ProjectionProducer<PTransform<?, ?>>, Map<TupleTag<?>, FieldAccessDescriptor>> taggedFieldAccess = outputTagVisitor.getTaggedFieldAccess();
    // For each eligible transform, replace it with a modified transform that omits the unused
    // fields.
    for (Entry<ProjectionProducer<PTransform<?, ?>>, Map<TupleTag<?>, FieldAccessDescriptor>> entry : taggedFieldAccess.entrySet()) {
        for (Entry<TupleTag<?>, FieldAccessDescriptor> outputFields : entry.getValue().entrySet()) {
            LOG.info("Optimizing transform {}: output {} will contain reduced field set {}", entry.getKey(), outputFields.getKey(), outputFields.getValue().fieldNamesAccessed());
        }
        PTransformMatcher matcher = application -> application.getTransform() == entry.getKey();
        PushdownOverrideFactory<?, ?> overrideFactory = new PushdownOverrideFactory<>(entry.getValue());
        pipeline.replaceAll(ImmutableList.of(PTransformOverride.of(matcher, overrideFactory)));
    }
}
Also used : Preconditions(org.apache.beam.sdk.util.Preconditions) PBegin(org.apache.beam.sdk.values.PBegin) Logger(org.slf4j.Logger) ProjectionProducer(org.apache.beam.sdk.schemas.ProjectionProducer) LoggerFactory(org.slf4j.LoggerFactory) PTransformOverride(org.apache.beam.sdk.runners.PTransformOverride) PCollection(org.apache.beam.sdk.values.PCollection) Collectors(java.util.stream.Collectors) PTransform(org.apache.beam.sdk.transforms.PTransform) POutput(org.apache.beam.sdk.values.POutput) PTransformOverrideFactory(org.apache.beam.sdk.runners.PTransformOverrideFactory) TupleTag(org.apache.beam.sdk.values.TupleTag) Map(java.util.Map) FieldAccessDescriptor(org.apache.beam.sdk.schemas.FieldAccessDescriptor) Iterables(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.Iterables) Entry(java.util.Map.Entry) TaggedPValue(org.apache.beam.sdk.values.TaggedPValue) ImmutableList(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableList) Pipeline(org.apache.beam.sdk.Pipeline) PTransformMatcher(org.apache.beam.sdk.runners.PTransformMatcher) SimpleEntry(java.util.AbstractMap.SimpleEntry) AppliedPTransform(org.apache.beam.sdk.runners.AppliedPTransform)
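
Because the optimizer rewrites the pipeline graph in place, the only integration point is a single static call on a fully constructed Pipeline; it is typically invoked by a runner before translation rather than by user code. The following is a minimal sketch of that call site, with the pipeline construction itself left as an assumption.

// Sketch only: assumes a pipeline whose source implements ProjectionProducer and
// whose downstream transforms declare FieldAccessDescriptors for a subset of fields.
static void runWithPushdown(org.apache.beam.sdk.Pipeline pipeline) {
    // No-op when the pipeline contains no pushdown opportunities; otherwise replaces
    // eligible producers with their projected equivalents.
    ProjectionPushdownOptimizer.optimize(pipeline);
    pipeline.run().waitUntilFinish();
}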

Example 83 with PCollection

use of org.apache.beam.sdk.values.PCollection in project beam by apache.

the class PTransformMatchersTest method emptyFlattenWithNonFlatten.

@Test
public void emptyFlattenWithNonFlatten() {
    AppliedPTransform application =
        AppliedPTransform.<PCollection<Iterable<Integer>>, PCollection<Integer>, Flatten.Iterables<Integer>>of(
            "EmptyFlatten",
            Collections.emptyMap(),
            Collections.singletonMap(
                new TupleTag<Integer>(),
                PCollection.createPrimitiveOutputInternal(
                    p, WindowingStrategy.globalDefault(), IsBounded.BOUNDED, VarIntCoder.of())),
            /* This isn't actually possible to construct, but for the sake of example */
            Flatten.iterables(),
            ResourceHints.create(),
            p);
    assertThat(PTransformMatchers.emptyFlatten().matches(application), is(false));
}
Also used : PCollection(org.apache.beam.sdk.values.PCollection) AppliedPTransform(org.apache.beam.sdk.runners.AppliedPTransform) TupleTag(org.apache.beam.sdk.values.TupleTag) Test(org.junit.Test)
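
For contrast with the negative case above, the matcher returns true when the applied transform really is a Flatten.PCollections with no inputs. A sketch of that positive case, mirroring the construction style of the test (the method name, coder, and raw AppliedPTransform type are illustrative):

@Test
public void emptyFlattenWithEmptyFlatten() {
    AppliedPTransform application =
        AppliedPTransform.of(
            "EmptyFlatten",
            Collections.emptyMap(),
            Collections.singletonMap(
                new TupleTag<Integer>(),
                PCollection.createPrimitiveOutputInternal(
                    p, WindowingStrategy.globalDefault(), IsBounded.BOUNDED, VarIntCoder.of())),
            Flatten.pCollections(),
            ResourceHints.create(),
            p);
    assertThat(PTransformMatchers.emptyFlatten().matches(application), is(true));
}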

Example 84 with PCollection

use of org.apache.beam.sdk.values.PCollection in project beam by apache.

the class PTransformMatchersTest method flattenWithDuplicateInputsNonFlatten.

@Test
public void flattenWithDuplicateInputsNonFlatten() {
    AppliedPTransform application =
        AppliedPTransform.<PCollection<Iterable<Integer>>, PCollection<Integer>, Flatten.Iterables<Integer>>of(
            "EmptyFlatten",
            Collections.emptyMap(),
            Collections.singletonMap(
                new TupleTag<Integer>(),
                PCollection.createPrimitiveOutputInternal(
                    p, WindowingStrategy.globalDefault(), IsBounded.BOUNDED, VarIntCoder.of())),
            /* This isn't actually possible to construct, but for the sake of example */
            Flatten.iterables(),
            ResourceHints.create(),
            p);
    assertThat(PTransformMatchers.flattenWithDuplicateInputs().matches(application), is(false));
}
Also used : PCollection(org.apache.beam.sdk.values.PCollection) AppliedPTransform(org.apache.beam.sdk.runners.AppliedPTransform) TupleTag(org.apache.beam.sdk.values.TupleTag) Test(org.junit.Test)
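
Again for contrast, the duplicate-inputs matcher fires only when the same PCollection feeds a Flatten.PCollections under more than one input tag. A sketch of that positive case follows; the transform name and coder are illustrative, and ImmutableMap here refers to the Guava class vendored by Beam.

@Test
public void flattenWithDuplicateInputsWithDuplicates() {
    PCollection<Integer> duplicate =
        PCollection.createPrimitiveOutputInternal(
            p, WindowingStrategy.globalDefault(), IsBounded.BOUNDED, VarIntCoder.of());
    AppliedPTransform application =
        AppliedPTransform.of(
            "DuplicateFlatten",
            // Two distinct TupleTag instances pointing at the same PCollection.
            ImmutableMap.<TupleTag<?>, PCollection<?>>builder()
                .put(new TupleTag<Integer>(), duplicate)
                .put(new TupleTag<Integer>(), duplicate)
                .build(),
            Collections.singletonMap(
                new TupleTag<Integer>(),
                PCollection.createPrimitiveOutputInternal(
                    p, WindowingStrategy.globalDefault(), IsBounded.BOUNDED, VarIntCoder.of())),
            Flatten.pCollections(),
            ResourceHints.create(),
            p);
    assertThat(PTransformMatchers.flattenWithDuplicateInputs().matches(application), is(true));
}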

Example 85 with PCollection

use of org.apache.beam.sdk.values.PCollection in project beam by apache.

the class PTransformTranslationTest method multiMultiParDo.

private static AppliedPTransform<?, ?, ?> multiMultiParDo(Pipeline pipeline) {
    PCollectionView<String> view = pipeline.apply(Create.of("foo")).apply(View.asSingleton());
    PCollection<Long> input = pipeline.apply(GenerateSequence.from(0));
    ParDo.MultiOutput<Long, KV<Long, String>> parDo = ParDo.of(new TestDoFn()).withSideInputs(view).withOutputTags(new TupleTag<KV<Long, String>>() {
    }, TupleTagList.of(new TupleTag<KV<String, Long>>() {
    }));
    PCollectionTuple output = input.apply(parDo);
    Map<TupleTag<?>, PCollection<?>> inputs = new HashMap<>();
    inputs.putAll(PValues.fullyExpand(parDo.getAdditionalInputs()));
    inputs.putAll(PValues.expandInput(input));
    return AppliedPTransform.<PCollection<Long>, PCollectionTuple, ParDo.MultiOutput<Long, KV<Long, String>>>of("MultiParDoInAndOut", inputs, PValues.expandOutput(output), parDo, ResourceHints.create(), pipeline);
}
Also used : HashMap(java.util.HashMap) TupleTag(org.apache.beam.sdk.values.TupleTag) KV(org.apache.beam.sdk.values.KV) PCollection(org.apache.beam.sdk.values.PCollection) ParDo(org.apache.beam.sdk.transforms.ParDo) PCollectionTuple(org.apache.beam.sdk.values.PCollectionTuple)
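
TestDoFn is referenced but not shown in this snippet. A minimal shape consistent with the types used above would look like the following; the real class in PTransformTranslationTest may differ, so treat this as a hypothetical stand-in.

// Hypothetical stand-in: just enough to satisfy ParDo.of(new TestDoFn()) with the
// Long input and KV<Long, String> main-output types used above.
private static class TestDoFn extends DoFn<Long, KV<Long, String>> {
    @ProcessElement
    public void process(ProcessContext context) {
        // The additional KV<String, Long> output declared via withOutputTags would be
        // emitted with context.output(additionalTag, ...) in a fuller implementation.
        context.output(KV.of(context.element(), "value"));
    }
}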

Aggregations

PCollection (org.apache.beam.sdk.values.PCollection): 199 usages
Test (org.junit.Test): 133
KV (org.apache.beam.sdk.values.KV): 62
TestPipeline (org.apache.beam.sdk.testing.TestPipeline): 61
Map (java.util.Map): 59
List (java.util.List): 58
Rule (org.junit.Rule): 57
RunWith (org.junit.runner.RunWith): 54
PAssert (org.apache.beam.sdk.testing.PAssert): 52
Instant (org.joda.time.Instant): 46
Duration (org.joda.time.Duration): 45
JUnit4 (org.junit.runners.JUnit4): 45
ParDo (org.apache.beam.sdk.transforms.ParDo): 44
TupleTag (org.apache.beam.sdk.values.TupleTag): 42
Pipeline (org.apache.beam.sdk.Pipeline): 41
Create (org.apache.beam.sdk.transforms.Create): 41
ArrayList (java.util.ArrayList): 40
Serializable (java.io.Serializable): 39
PTransform (org.apache.beam.sdk.transforms.PTransform): 37
Row (org.apache.beam.sdk.values.Row): 37