Search in sources :

Example 21 with FieldAccessDescriptor

use of org.apache.beam.sdk.schemas.FieldAccessDescriptor in project beam by apache.

The method buildIOReader of the class MongoDbTable.

/**
 * Builds the read for this table: a {@link MongoDbIO.Read} configured from {@code dbUri},
 * {@code dbName} and {@code dbCollection}, whose documents are converted to rows.
 *
 * <p>The output schema is this table's schema narrowed to {@code fieldNames}. Any filter that is
 * not a {@link DefaultTableFilter} is cast to {@code MongoDbFilter}; if it reports supported
 * expressions, they are turned into a predicate and attached to the find query. A non-empty
 * {@code fieldNames} list is attached to the find query as a projection.
 *
 * @param begin the pipeline start the read is expanded against
 * @param filters the filter to push down, or a {@link DefaultTableFilter} for none
 * @param fieldNames the fields to project; an empty list adds no projection to the query
 * @return the rows produced by converting the read documents with the narrowed schema
 */
@Override
public PCollection<Row> buildIOReader(PBegin begin, BeamSqlTableFilter filters, List<String> fieldNames) {
    MongoDbIO.Read reader = MongoDbIO.read().withUri(dbUri).withDatabase(dbName).withCollection(dbCollection);

    // Narrow the table schema down to the requested fields.
    final FieldAccessDescriptor projectedAccess = FieldAccessDescriptor.withFieldNames(fieldNames).resolve(getSchema());
    final Schema projectedSchema = SelectHelpers.getOutputSchema(getSchema(), projectedAccess);

    FindQuery query = FindQuery.create();

    boolean hasCustomFilter = !(filters instanceof DefaultTableFilter);
    if (hasCustomFilter) {
        MongoDbFilter pushdownFilter = (MongoDbFilter) filters;
        boolean nothingSupported = pushdownFilter.getSupported().isEmpty();
        if (!nothingSupported) {
            Bson predicate = constructPredicate(pushdownFilter.getSupported());
            LOG.info("Pushing down the following filter: " + predicate.toString());
            query = query.withFilters(predicate);
        }
    }

    boolean hasProjection = !fieldNames.isEmpty();
    if (hasProjection) {
        query = query.withProjection(fieldNames);
    }

    reader = reader.withQueryFn(query);
    return reader.expand(begin).apply(DocumentToRow.withSchema(projectedSchema));
}
Also used : FindQuery(org.apache.beam.sdk.io.mongodb.FindQuery) FieldAccessDescriptor(org.apache.beam.sdk.schemas.FieldAccessDescriptor) Schema(org.apache.beam.sdk.schemas.Schema) DefaultTableFilter(org.apache.beam.sdk.extensions.sql.meta.DefaultTableFilter) MongoDbIO(org.apache.beam.sdk.io.mongodb.MongoDbIO) Bson(org.bson.conversions.Bson)

Example 22 with FieldAccessDescriptor

use of org.apache.beam.sdk.schemas.FieldAccessDescriptor in project beam by apache.

The method visitValue of the class PCollectionOutputTagVisitor.

/**
 * Records, for every producer in {@code pCollFieldAccess} that has a field access entry for
 * {@code value}, that field access under the output tag {@code value} has on {@code producer}.
 *
 * <p>Producers with no entry for {@code value} are skipped.
 *
 * @param value the visited value, looked up as a key in each producer's field access map
 * @param producer the node whose outputs are searched for {@code value}
 */
@Override
public void visitValue(PValue value, Node producer) {
    for (Entry<ProjectionProducer<PTransform<?, ?>>, Map<PCollection<?>, FieldAccessDescriptor>> candidate : pCollFieldAccess.entrySet()) {
        FieldAccessDescriptor accessedFields = candidate.getValue().get(value);
        if (accessedFields != null) {
            // Invert the producer's outputs so the tag can be looked up by its PCollection.
            BiMap<PCollection<?>, TupleTag<?>> tagsByOutput = ImmutableBiMap.copyOf(producer.getOutputs()).inverse();
            TupleTag<?> outputTag = tagsByOutput.get(value);
            Preconditions.checkArgumentNotNull(outputTag, "PCollection %s not found in outputs of producer %s", value, producer);

            // Reuse the per-producer builder if one was already registered, else register a new one.
            ProjectionProducer<PTransform<?, ?>> projectionProducer = candidate.getKey();
            ImmutableMap.Builder<TupleTag<?>, FieldAccessDescriptor> perTagAccess = tagFieldAccess.build().get(projectionProducer);
            if (perTagAccess == null) {
                perTagAccess = ImmutableMap.builder();
                tagFieldAccess.put(projectionProducer, perTagAccess);
            }
            perTagAccess.put(outputTag, accessedFields);
        }
    }
}
Also used : FieldAccessDescriptor(org.apache.beam.sdk.schemas.FieldAccessDescriptor) PCollection(org.apache.beam.sdk.values.PCollection) ProjectionProducer(org.apache.beam.sdk.schemas.ProjectionProducer) TupleTag(org.apache.beam.sdk.values.TupleTag) ImmutableMap(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableMap) ImmutableBiMap(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableBiMap) Map(java.util.Map) BiMap(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.BiMap) ImmutableMap(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableMap)

Example 23 with FieldAccessDescriptor

use of org.apache.beam.sdk.schemas.FieldAccessDescriptor in project beam by apache.

The method optimize of the class ProjectionPushdownOptimizer.

/**
 * Applies every known projection pushdown optimization to a Pipeline, modifying it in place.
 *
 * <p>Pushdown is possible wherever a {@link ProjectionProducer} produces a {@link PCollection}
 * that is consumed by one or more PTransforms with an annotated {@link FieldAccessDescriptor},
 * and the consumers use fewer fields than the producer emits. In that case the optimizer swaps
 * the {@link ProjectionProducer} for the result of calling
 * {@link ProjectionProducer#actuateProjectionPushdown(Map)} on it with those
 * PCollections/fields.
 *
 * <p>Currently only {@link ProjectionProducer} instances applied directly to {@link PBegin} are
 * supported (https://issues.apache.org/jira/browse/BEAM-13658).
 *
 * @param pipeline the pipeline to traverse and rewrite
 */
public static void optimize(Pipeline pipeline) {
    // Pass 1: work out which Schema fields are (or conversely, are not) accessed in the pipeline.
    FieldAccessVisitor accessVisitor = new FieldAccessVisitor();
    pipeline.traverseTopologically(accessVisitor);

    // Pass 2: find transforms that both support projection pushdown and output unused fields.
    ProjectionProducerVisitor producerVisitor = new ProjectionProducerVisitor(accessVisitor.getPCollectionFieldAccess());
    pipeline.traverseTopologically(producerVisitor);
    Map<ProjectionProducer<PTransform<?, ?>>, Map<PCollection<?>, FieldAccessDescriptor>> opportunities = producerVisitor.getPushdownOpportunities();

    // Pass 3: translate the target PCollections to their output TupleTags.
    PCollectionOutputTagVisitor tagVisitor = new PCollectionOutputTagVisitor(opportunities);
    pipeline.traverseTopologically(tagVisitor);
    Map<ProjectionProducer<PTransform<?, ?>>, Map<TupleTag<?>, FieldAccessDescriptor>> accessByProducer = tagVisitor.getTaggedFieldAccess();

    // For each such producer, log the reduced field set of every output, then replace the
    // producer in the pipeline through an override built from its per-output field access.
    for (Entry<ProjectionProducer<PTransform<?, ?>>, Map<TupleTag<?>, FieldAccessDescriptor>> pushdown : accessByProducer.entrySet()) {
        ProjectionProducer<PTransform<?, ?>> producer = pushdown.getKey();
        Map<TupleTag<?>, FieldAccessDescriptor> accessByTag = pushdown.getValue();

        for (Entry<TupleTag<?>, FieldAccessDescriptor> tagged : accessByTag.entrySet()) {
            LOG.info("Optimizing transform {}: output {} will contain reduced field set {}", producer, tagged.getKey(), tagged.getValue().fieldNamesAccessed());
        }

        // Match by identity: only the application of this exact producer instance is replaced.
        PTransformMatcher matcher = application -> application.getTransform() == producer;
        PushdownOverrideFactory<?, ?> overrideFactory = new PushdownOverrideFactory<>(accessByTag);
        pipeline.replaceAll(ImmutableList.of(PTransformOverride.of(matcher, overrideFactory)));
    }
}
Also used : Preconditions(org.apache.beam.sdk.util.Preconditions) PBegin(org.apache.beam.sdk.values.PBegin) Logger(org.slf4j.Logger) ProjectionProducer(org.apache.beam.sdk.schemas.ProjectionProducer) LoggerFactory(org.slf4j.LoggerFactory) PTransformOverride(org.apache.beam.sdk.runners.PTransformOverride) PCollection(org.apache.beam.sdk.values.PCollection) Collectors(java.util.stream.Collectors) PTransform(org.apache.beam.sdk.transforms.PTransform) POutput(org.apache.beam.sdk.values.POutput) PTransformOverrideFactory(org.apache.beam.sdk.runners.PTransformOverrideFactory) TupleTag(org.apache.beam.sdk.values.TupleTag) Map(java.util.Map) FieldAccessDescriptor(org.apache.beam.sdk.schemas.FieldAccessDescriptor) Iterables(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.Iterables) Entry(java.util.Map.Entry) TaggedPValue(org.apache.beam.sdk.values.TaggedPValue) ImmutableList(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableList) Pipeline(org.apache.beam.sdk.Pipeline) PTransformMatcher(org.apache.beam.sdk.runners.PTransformMatcher) SimpleEntry(java.util.AbstractMap.SimpleEntry) AppliedPTransform(org.apache.beam.sdk.runners.AppliedPTransform) FieldAccessDescriptor(org.apache.beam.sdk.schemas.FieldAccessDescriptor) PTransformMatcher(org.apache.beam.sdk.runners.PTransformMatcher) TupleTag(org.apache.beam.sdk.values.TupleTag) ProjectionProducer(org.apache.beam.sdk.schemas.ProjectionProducer) Map(java.util.Map)

Example 24 with FieldAccessDescriptor

use of org.apache.beam.sdk.schemas.FieldAccessDescriptor in project beam by apache.

The method visitPrimitiveTransform of the class FieldAccessVisitor.

/**
 * Folds the field access of {@code node}'s inputs into {@code pCollectionFieldAccess}.
 *
 * <p>For each PCollection reported by {@code getFieldAccess(node)}: if no access is recorded yet,
 * the reported access is stored as-is; otherwise the stored value becomes the union of the
 * previously recorded access and the reported one.
 *
 * @param node the visited primitive transform node
 */
@Override
public void visitPrimitiveTransform(Node node) {
    for (Entry<PCollection<?>, FieldAccessDescriptor> observed : getFieldAccess(node).entrySet()) {
        PCollection<?> pCollection = observed.getKey();
        FieldAccessDescriptor merged = observed.getValue();

        FieldAccessDescriptor recorded = pCollectionFieldAccess.get(pCollection);
        if (recorded != null) {
            // Already seen via another consumer: widen to cover both sets of fields.
            merged = FieldAccessDescriptor.union(ImmutableList.of(recorded, merged));
        }
        pCollectionFieldAccess.put(pCollection, merged);
    }
}
Also used : PCollection(org.apache.beam.sdk.values.PCollection) FieldAccessDescriptor(org.apache.beam.sdk.schemas.FieldAccessDescriptor)

Example 25 with FieldAccessDescriptor

use of org.apache.beam.sdk.schemas.FieldAccessDescriptor in project beam by apache.

The method getFieldAccess of the class FieldAccessVisitor.

/**
 * Computes the field access {@code node}'s transform performs on each of its input PCollections.
 *
 * <p>When the transform is a {@link MultiOutput}, its single main input (the one input whose tag
 * is not among the transform's additional inputs) is mapped to the field access descriptor taken
 * from the DoFn's schema information. Every other input is mapped to
 * {@link FieldAccessDescriptor#withAllFields()}.
 *
 * @param node the node whose inputs are inspected
 * @return an immutable map from each input PCollection to its field access
 */
private static Map<PCollection<?>, FieldAccessDescriptor> getFieldAccess(Node node) {
    PTransform<?, ?> transform = node.getTransform();
    Map<PCollection<?>, FieldAccessDescriptor> accessByInput = new HashMap<>();

    if (transform instanceof MultiOutput) {
        // The main input is whichever input is not registered as an additional input.
        Set<PCollection<?>> nonAdditionalInputs =
            node.getInputs().entrySet().stream()
                .filter((tagged) -> !transform.getAdditionalInputs().containsKey(tagged.getKey()))
                .map(Entry::getValue)
                .collect(Collectors.toSet());
        // Throws unless exactly one such input exists.
        PCollection<?> mainInput = Iterables.getOnlyElement(nonAdditionalInputs);

        // Look up which fields the DoFn reads from the main input and record them.
        DoFn<?, ?> fn = ((MultiOutput<?, ?>) transform).getFn();
        FieldAccessDescriptor declaredAccess = ParDo.getDoFnSchemaInformation(fn, mainInput).getFieldAccessDescriptor();
        accessByInput.put(mainInput, declaredAccess);
    }

    // Inputs with no field access info must be assumed to need all of their fields.
    for (PCollection<?> input : node.getInputs().values()) {
        if (accessByInput.containsKey(input)) {
            continue;
        }
        accessByInput.put(input, FieldAccessDescriptor.withAllFields());
    }
    return ImmutableMap.copyOf(accessByInput);
}
Also used : PCollection(org.apache.beam.sdk.values.PCollection) FieldAccessDescriptor(org.apache.beam.sdk.schemas.FieldAccessDescriptor) HashMap(java.util.HashMap) MultiOutput(org.apache.beam.sdk.transforms.ParDo.MultiOutput)

Aggregations

FieldAccessDescriptor (org.apache.beam.sdk.schemas.FieldAccessDescriptor)65 Test (org.junit.Test)49 Row (org.apache.beam.sdk.values.Row)47 Schema (org.apache.beam.sdk.schemas.Schema)42 PCollection (org.apache.beam.sdk.values.PCollection)16 Map (java.util.Map)12 Pipeline (org.apache.beam.sdk.Pipeline)11 ProjectionProducer (org.apache.beam.sdk.schemas.ProjectionProducer)9 ImmutableMap (org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableMap)8 ParDo (org.apache.beam.sdk.transforms.ParDo)5 DoFnSchemaInformation (org.apache.beam.sdk.transforms.DoFnSchemaInformation)4 PBegin (org.apache.beam.sdk.values.PBegin)4 DefaultTableFilter (org.apache.beam.sdk.extensions.sql.meta.DefaultTableFilter)3 FieldType (org.apache.beam.sdk.schemas.Schema.FieldType)3 PTransform (org.apache.beam.sdk.transforms.PTransform)3 List (java.util.List)2 Collectors (java.util.stream.Collectors)2 AutoValueSchema (org.apache.beam.sdk.schemas.AutoValueSchema)2 FieldDescriptor (org.apache.beam.sdk.schemas.FieldAccessDescriptor.FieldDescriptor)2 Field (org.apache.beam.sdk.schemas.Schema.Field)2