use of org.apache.beam.sdk.schemas.FieldAccessDescriptor in project beam by apache.
the class MongoDbTable method buildIOReader.
/**
 * Builds a MongoDB read with filter and projection pushdown and converts the documents it
 * produces to {@link Row}s.
 *
 * @param begin the pipeline root the read is expanded against
 * @param filters the table filter; unless it is a {@code DefaultTableFilter} it is cast to
 *     {@code MongoDbFilter}, and its supported predicates (if any) are added to the find query
 * @param fieldNames the fields to project; when empty, no projection is set on the find query
 * @return the rows read, converted using the schema that {@code SelectHelpers.getOutputSchema}
 *     derives from the table schema and {@code fieldNames}
 */
@Override
public PCollection<Row> buildIOReader(
    PBegin begin, BeamSqlTableFilter filters, List<String> fieldNames) {
  MongoDbIO.Read readInstance =
      MongoDbIO.read().withUri(dbUri).withDatabase(dbName).withCollection(dbCollection);

  // Resolve the requested field names against the table schema to get the output schema.
  final FieldAccessDescriptor resolved =
      FieldAccessDescriptor.withFieldNames(fieldNames).resolve(getSchema());
  final Schema newSchema = SelectHelpers.getOutputSchema(getSchema(), resolved);

  FindQuery findQuery = FindQuery.create();

  if (!(filters instanceof DefaultTableFilter)) {
    MongoDbFilter mongoFilter = (MongoDbFilter) filters;
    if (!mongoFilter.getSupported().isEmpty()) {
      Bson filter = constructPredicate(mongoFilter.getSupported());
      // Parameterized logging: the message is only formatted when INFO is enabled.
      LOG.info("Pushing down the following filter: {}", filter);
      findQuery = findQuery.withFilters(filter);
    }
  }

  if (!fieldNames.isEmpty()) {
    findQuery = findQuery.withProjection(fieldNames);
  }

  readInstance = readInstance.withQueryFn(findQuery);

  return readInstance.expand(begin).apply(DocumentToRow.withSchema(newSchema));
}
use of org.apache.beam.sdk.schemas.FieldAccessDescriptor in project beam by apache.
the class PCollectionOutputTagVisitor method visitValue.
/**
 * Records, for every entry of {@code pCollFieldAccess} that has a field access descriptor for
 * {@code value}, that descriptor under the {@link TupleTag} with which {@code producer} outputs
 * {@code value}.
 *
 * @param value the pipeline value being visited
 * @param producer the node whose outputs are searched for {@code value}
 */
@Override
public void visitValue(PValue value, Node producer) {
  for (Entry<ProjectionProducer<PTransform<?, ?>>, Map<PCollection<?>, FieldAccessDescriptor>>
      candidate : pCollFieldAccess.entrySet()) {
    FieldAccessDescriptor accessedFields = candidate.getValue().get(value);
    if (accessedFields != null) {
      // Invert the producer's tag -> PCollection outputs to look up the tag for this value.
      BiMap<PCollection<?>, TupleTag<?>> tagsByOutput =
          ImmutableBiMap.copyOf(producer.getOutputs()).inverse();
      TupleTag<?> outputTag = tagsByOutput.get(value);
      Preconditions.checkArgumentNotNull(
          outputTag, "PCollection %s not found in outputs of producer %s", value, producer);

      // Reuse the per-tag builder already registered for this key, or register a new one.
      ImmutableMap.Builder<TupleTag<?>, FieldAccessDescriptor> perTagBuilder =
          tagFieldAccess.build().get(candidate.getKey());
      if (perTagBuilder == null) {
        perTagBuilder = ImmutableMap.builder();
        tagFieldAccess.put(candidate.getKey(), perTagBuilder);
      }
      perTagBuilder.put(outputTag, accessedFields);
    }
  }
}
use of org.apache.beam.sdk.schemas.FieldAccessDescriptor in project beam by apache.
the class ProjectionPushdownOptimizer method optimize.
/**
 * Applies every known projection pushdown optimization to a Pipeline, modifying it in-place.
 *
 * <p>Pushdown is possible wherever a {@link ProjectionProducer} produces a {@link PCollection}
 * that is consumed by one or more PTransforms with an annotated {@link FieldAccessDescriptor},
 * and those consumers read fewer fields than the producer emits. Each such
 * {@link ProjectionProducer} is replaced with the result of calling
 * {@link ProjectionProducer#actuateProjectionPushdown(Map)} on it with those
 * PCollections/fields.
 *
 * <p>Currently only supports pushdown on {@link ProjectionProducer} instances that are applied
 * directly to {@link PBegin} (https://issues.apache.org/jira/browse/BEAM-13658).
 *
 * @param pipeline the pipeline to optimize; its matching transforms are replaced
 */
public static void optimize(Pipeline pipeline) {
  // Step 1: collect which Schema fields are accessed on each PCollection in the pipeline.
  FieldAccessVisitor accessCollector = new FieldAccessVisitor();
  pipeline.traverseTopologically(accessCollector);

  // Step 2: find the transforms that support projection pushdown and output unused fields.
  ProjectionProducerVisitor producerFinder =
      new ProjectionProducerVisitor(accessCollector.getPCollectionFieldAccess());
  pipeline.traverseTopologically(producerFinder);
  Map<ProjectionProducer<PTransform<?, ?>>, Map<PCollection<?>, FieldAccessDescriptor>>
      opportunities = producerFinder.getPushdownOpportunities();

  // Step 3: translate the target PCollections to the TupleTags they are output under.
  PCollectionOutputTagVisitor tagResolver = new PCollectionOutputTagVisitor(opportunities);
  pipeline.traverseTopologically(tagResolver);
  Map<ProjectionProducer<PTransform<?, ?>>, Map<TupleTag<?>, FieldAccessDescriptor>>
      fieldsByProducer = tagResolver.getTaggedFieldAccess();

  // Step 4: replace each eligible transform with a modified transform that omits the unused
  // fields.
  fieldsByProducer.forEach(
      (producer, fieldsByTag) -> {
        fieldsByTag.forEach(
            (tag, fields) ->
                LOG.info(
                    "Optimizing transform {}: output {} will contain reduced field set {}",
                    producer,
                    tag,
                    fields.fieldNamesAccessed()));
        // Match by identity so that only this exact transform instance is overridden.
        PTransformMatcher matcher = application -> application.getTransform() == producer;
        PushdownOverrideFactory<?, ?> overrideFactory =
            new PushdownOverrideFactory<>(fieldsByTag);
        pipeline.replaceAll(ImmutableList.of(PTransformOverride.of(matcher, overrideFactory)));
      });
}
use of org.apache.beam.sdk.schemas.FieldAccessDescriptor in project beam by apache.
the class FieldAccessVisitor method visitPrimitiveTransform.
/**
 * Folds the field access of {@code node}'s inputs into {@code pCollectionFieldAccess}.
 *
 * <p>A PCollection with no previous record gets the new descriptor as-is; otherwise the stored
 * descriptor becomes the union of the previous and the new one.
 *
 * @param node the primitive transform node being visited
 */
@Override
public void visitPrimitiveTransform(Node node) {
  for (Entry<PCollection<?>, FieldAccessDescriptor> inputAccess :
      getFieldAccess(node).entrySet()) {
    pCollectionFieldAccess.merge(
        inputAccess.getKey(),
        inputAccess.getValue(),
        (recorded, incoming) ->
            FieldAccessDescriptor.union(ImmutableList.of(recorded, incoming)));
  }
}
use of org.apache.beam.sdk.schemas.FieldAccessDescriptor in project beam by apache.
the class FieldAccessVisitor method getFieldAccess.
/**
 * Determines which fields of each input PCollection are accessed by {@code node}'s transform.
 *
 * <p>For a {@code MultiOutput} transform, the main input is mapped to the field access
 * descriptor taken from the DoFn's schema information. Every other input is mapped to
 * {@link FieldAccessDescriptor#withAllFields()}.
 *
 * @param node the transform node to inspect
 * @return an immutable map from each input PCollection of {@code node} to its field access
 */
private static Map<PCollection<?>, FieldAccessDescriptor> getFieldAccess(Node node) {
  PTransform<?, ?> transform = node.getTransform();
  Map<PCollection<?>, FieldAccessDescriptor> accessByInput = new HashMap<>();

  if (transform instanceof MultiOutput) {
    // The main input is the input whose tag is not one of the transform's additional inputs;
    // getOnlyElement requires that there is exactly one such PCollection.
    Set<PCollection<?>> nonAdditionalInputs =
        node.getInputs().entrySet().stream()
            .filter(tagged -> !transform.getAdditionalInputs().containsKey(tagged.getKey()))
            .map(Entry::getValue)
            .collect(Collectors.toSet());
    PCollection<?> mainInput = Iterables.getOnlyElement(nonAdditionalInputs);

    // Record the fields the DoFn declares it accesses on the main input.
    DoFn<?, ?> doFn = ((MultiOutput<?, ?>) transform).getFn();
    accessByInput.put(
        mainInput, ParDo.getDoFnSchemaInformation(doFn, mainInput).getFieldAccessDescriptor());
  }

  // Without field access info for an input, assume that all of its fields are accessed.
  for (PCollection<?> input : node.getInputs().values()) {
    if (accessByInput.containsKey(input)) {
      continue;
    }
    accessByInput.put(input, FieldAccessDescriptor.withAllFields());
  }
  return ImmutableMap.copyOf(accessByInput);
}
Aggregations