
Example 6 with PTransform

Use of org.apache.beam.sdk.transforms.PTransform in project beam by apache.

The class TransformHierarchyTest, method visitAfterReplace.

/**
 * Tests that visiting the {@link TransformHierarchy} after replacing nodes does not visit any of
 * the original nodes or inaccessible values but does visit all of the replacement nodes, new
 * inaccessible replacement values, and the original output values.
 */
@Test
public void visitAfterReplace() {
    Node root = hierarchy.getCurrent();
    final SingleOutput<Long, Long> originalParDo = ParDo.of(new DoFn<Long, Long>() {

        @ProcessElement
        public void processElement(ProcessContext ctxt) {
            ctxt.output(ctxt.element() + 1L);
        }
    });
    GenerateSequence genUpstream = GenerateSequence.from(0);
    PCollection<Long> upstream = pipeline.apply(genUpstream);
    PCollection<Long> output = upstream.apply("Original", originalParDo);
    Node upstreamNode = hierarchy.pushNode("Upstream", pipeline.begin(), genUpstream);
    hierarchy.finishSpecifyingInput();
    hierarchy.setOutput(upstream);
    hierarchy.popNode();
    Node original = hierarchy.pushNode("Original", upstream, originalParDo);
    hierarchy.finishSpecifyingInput();
    hierarchy.setOutput(output);
    hierarchy.popNode();
    final TupleTag<Long> longs = new TupleTag<>();
    final MultiOutput<Long, Long> replacementParDo = ParDo.of(new DoFn<Long, Long>() {

        @ProcessElement
        public void processElement(ProcessContext ctxt) {
            ctxt.output(ctxt.element() + 1L);
        }
    }).withOutputTags(longs, TupleTagList.empty());
    PTransform<PCollection<Long>, PCollection<Long>> replacementComposite = new PTransform<PCollection<Long>, PCollection<Long>>() {

        @Override
        public PCollection<Long> expand(PCollection<Long> input) {
            return input.apply("Contained", replacementParDo).get(longs);
        }
    };
    PCollectionTuple replacementOutput = upstream.apply("Contained", replacementParDo);
    Node compositeNode = hierarchy.replaceNode(original, upstream, replacementComposite);
    Node replacementParNode = hierarchy.pushNode("Original/Contained", upstream, replacementParDo);
    hierarchy.finishSpecifyingInput();
    hierarchy.setOutput(replacementOutput);
    hierarchy.popNode();
    hierarchy.setOutput(replacementOutput.get(longs));
    Map<TupleTag<?>, PCollection<?>> expandedReplacementOutput = (Map) replacementOutput.expand();
    Entry<TupleTag<?>, PCollection<?>> replacementLongs = Iterables.getOnlyElement(expandedReplacementOutput.entrySet());
    hierarchy.replaceOutputs(Collections.singletonMap(replacementOutput.get(longs), ReplacementOutput.of(TaggedPValue.ofExpandedValue(output), TaggedPValue.of(replacementLongs.getKey(), replacementLongs.getValue()))));
    hierarchy.popNode();
    final Set<Node> visitedCompositeNodes = new HashSet<>();
    final Set<Node> visitedPrimitiveNodes = new HashSet<>();
    Set<PValue> visitedValues = hierarchy.visit(new Defaults() {

        @Override
        public CompositeBehavior enterCompositeTransform(Node node) {
            visitedCompositeNodes.add(node);
            return CompositeBehavior.ENTER_TRANSFORM;
        }

        @Override
        public void visitPrimitiveTransform(Node node) {
            visitedPrimitiveNodes.add(node);
        }
    });
    /*
    Final Graph:
    Upstream -> Upstream.out -> Composite -> (ReplacementParDo -> OriginalParDo.out)
    */
    assertThat(visitedCompositeNodes, containsInAnyOrder(root, compositeNode));
    assertThat(visitedPrimitiveNodes, containsInAnyOrder(upstreamNode, replacementParNode));
    assertThat(visitedValues, containsInAnyOrder(upstream, output));
}
Also used : Node(org.apache.beam.sdk.runners.TransformHierarchy.Node) TupleTag(org.apache.beam.sdk.values.TupleTag) PCollectionTuple(org.apache.beam.sdk.values.PCollectionTuple) PTransform(org.apache.beam.sdk.transforms.PTransform) HashSet(java.util.HashSet) TaggedPValue(org.apache.beam.sdk.values.TaggedPValue) PValue(org.apache.beam.sdk.values.PValue) GenerateSequence(org.apache.beam.sdk.io.GenerateSequence) PCollection(org.apache.beam.sdk.values.PCollection) DoFn(org.apache.beam.sdk.transforms.DoFn) Defaults(org.apache.beam.sdk.Pipeline.PipelineVisitor.Defaults) Map(java.util.Map) Test(org.junit.Test)
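
The visitation contract exercised by this test is the same one runners use through Pipeline.traverseTopologically with a PipelineVisitor.Defaults subclass. A minimal, hedged sketch (the pipeline contents are illustrative, not taken from the test above; assumes the usual Pipeline, GenerateSequence, MapElements, TypeDescriptors, and TransformHierarchy imports):

Pipeline p = Pipeline.create();
p.apply(GenerateSequence.from(0).to(10))
    .apply(MapElements.into(TypeDescriptors.longs()).via((Long x) -> x + 1));

p.traverseTopologically(new Pipeline.PipelineVisitor.Defaults() {

    @Override
    public CompositeBehavior enterCompositeTransform(TransformHierarchy.Node node) {
        // Composite nodes are entered recursively, mirroring enterCompositeTransform above.
        System.out.println("composite: " + node.getFullName());
        return CompositeBehavior.ENTER_TRANSFORM;
    }

    @Override
    public void visitPrimitiveTransform(TransformHierarchy.Node node) {
        // Leaf transforms, the nodes collected in visitedPrimitiveNodes in the test.
        System.out.println("primitive: " + node.getFullName());
    }
});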

Example 7 with PTransform

Use of org.apache.beam.sdk.transforms.PTransform in project beam by apache.

The class JavaClassLookupTransformProvider, method findMappingConstructor.

private Constructor<PTransform<InputT, OutputT>> findMappingConstructor(Constructor<?>[] constructors, JavaClassLookupPayload payload) {
    Row constructorRow = decodeRow(payload.getConstructorSchema(), payload.getConstructorPayload());
    List<Constructor<?>> mappingConstructors = Arrays.stream(constructors).filter(c -> c.getParameterCount() == payload.getConstructorSchema().getFieldsCount()).filter(c -> parametersCompatible(c.getParameters(), constructorRow)).collect(Collectors.toList());
    if (mappingConstructors.size() == 0) {
        throw new RuntimeException("Could not find a matching constructor. When using field names, make sure they are " + "available in the compiled Java class.");
    } else if (mappingConstructors.size() != 1) {
        throw new RuntimeException("Expected to find a single mapping constructor but found " + mappingConstructors.size());
    }
    return (Constructor<PTransform<InputT, OutputT>>) mappingConstructors.get(0);
}
Also used : JsonProperty(com.fasterxml.jackson.annotation.JsonProperty) BuilderMethod(org.apache.beam.model.pipeline.v1.ExternalTransforms.BuilderMethod) Arrays(java.util.Arrays) Array(java.lang.reflect.Array) NoSuchSchemaException(org.apache.beam.sdk.schemas.NoSuchSchemaException) SchemaApi(org.apache.beam.model.pipeline.v1.SchemaApi) RowCoder(org.apache.beam.sdk.coders.RowCoder) SerializableFunction(org.apache.beam.sdk.transforms.SerializableFunction) Constructor(java.lang.reflect.Constructor) ArrayList(java.util.ArrayList) PTransform(org.apache.beam.sdk.transforms.PTransform) FunctionSpec(org.apache.beam.model.pipeline.v1.RunnerApi.FunctionSpec) SchemaRegistry(org.apache.beam.sdk.schemas.SchemaRegistry) ByteString(org.apache.beam.vendor.grpc.v1p43p2.com.google.protobuf.ByteString) JavaFieldSchema(org.apache.beam.sdk.schemas.JavaFieldSchema) PInput(org.apache.beam.sdk.values.PInput) Row(org.apache.beam.sdk.values.Row) Method(java.lang.reflect.Method) Nullable(org.checkerframework.checker.nullness.qual.Nullable) InvalidProtocolBufferException(org.apache.beam.vendor.grpc.v1p43p2.com.google.protobuf.InvalidProtocolBufferException) Field(org.apache.beam.sdk.schemas.Schema.Field) TransformProvider(org.apache.beam.sdk.expansion.service.ExpansionService.TransformProvider) Collection(java.util.Collection) IOException(java.io.IOException) Collectors(java.util.stream.Collectors) Schema(org.apache.beam.sdk.schemas.Schema) TypeName(org.apache.beam.sdk.schemas.Schema.TypeName) JavaClassLookupPayload(org.apache.beam.model.pipeline.v1.ExternalTransforms.JavaClassLookupPayload) InvocationTargetException(java.lang.reflect.InvocationTargetException) ExpansionMethods(org.apache.beam.model.pipeline.v1.ExternalTransforms.ExpansionMethods) ClassUtils(org.apache.beam.repackaged.core.org.apache.commons.lang3.ClassUtils) POutput(org.apache.beam.sdk.values.POutput) List(java.util.List) ParameterizedType(java.lang.reflect.ParameterizedType) Type(java.lang.reflect.Type) ReflectHelpers(org.apache.beam.sdk.util.common.ReflectHelpers) JsonCreator(com.fasterxml.jackson.annotation.JsonCreator) AutoValue(com.google.auto.value.AutoValue) Annotation(java.lang.annotation.Annotation) Pattern(java.util.regex.Pattern) SchemaTranslation(org.apache.beam.sdk.schemas.SchemaTranslation) Collections(java.util.Collections) BeamUrns.getUrn(org.apache.beam.runners.core.construction.BeamUrns.getUrn) SuppressFBWarnings(edu.umd.cs.findbugs.annotations.SuppressFBWarnings) Constructor(java.lang.reflect.Constructor) Row(org.apache.beam.sdk.values.Row)
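
The matching logic above can be illustrated with a smaller, self-contained reflection sketch that applies the same two filters (arity first, then parameter compatibility), but against plain Java arguments instead of a Beam Row decoded from the expansion payload; the class and method names here are hypothetical:

import java.lang.reflect.Constructor;
import java.util.Arrays;
import java.util.List;
import java.util.stream.Collectors;

class ConstructorLookup {

    // Finds the single public constructor of clazz that can accept the given (boxed) arguments.
    static Constructor<?> findMatching(Class<?> clazz, Object... args) {
        List<Constructor<?>> matches = Arrays.stream(clazz.getConstructors())
            // 1. Same arity as the supplied arguments.
            .filter(c -> c.getParameterCount() == args.length)
            // 2. Every supplied argument is assignable to the corresponding parameter type
            //    (primitive parameters are ignored here for brevity).
            .filter(c -> {
                Class<?>[] params = c.getParameterTypes();
                for (int i = 0; i < params.length; i++) {
                    if (args[i] != null && !params[i].isInstance(args[i])) {
                        return false;
                    }
                }
                return true;
            })
            .collect(Collectors.toList());
        if (matches.isEmpty()) {
            throw new RuntimeException("Could not find a matching constructor.");
        } else if (matches.size() != 1) {
            throw new RuntimeException("Expected a single matching constructor but found " + matches.size());
        }
        return matches.get(0);
    }
}

For example, findMatching(java.io.File.class, "/tmp") resolves uniquely to the File(String) constructor, while an ambiguous or missing match throws, just as the provider method does.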

Example 8 with PTransform

Use of org.apache.beam.sdk.transforms.PTransform in project beam by apache.

The class NexmarkUtils, method prepareSideInput.

/**
 * Write data to be read as a side input.
 *
 * <p>Contains pairs of a number and its string representation to model lookups of some enrichment
 * data by id.
 *
 * <p>Generated data covers the range {@code [0, sideInputRowCount)} so lookup joins on any
 * desired id field can be modeled by looking up {@code id % sideInputRowCount}.
 */
public static PCollection<KV<Long, String>> prepareSideInput(Pipeline queryPipeline, NexmarkConfiguration config) {
    checkArgument(config.sideInputRowCount > 0, "Side input required but sideInputRowCount is not >0");
    PTransform<PBegin, PCollection<KV<Long, String>>> generateSideInputData = new GenerateSideInputData(config);
    switch(config.sideInputType) {
        case DIRECT:
            return queryPipeline.apply(generateSideInputData);
        case CSV:
            checkArgument(config.sideInputUrl != null, "Side input type %s requires a URL but sideInputUrl not specified", SideInputType.CSV.toString());
            checkArgument(config.sideInputNumShards > 0, "Side input type %s requires explicit numShards but sideInputNumShards not specified", SideInputType.CSV.toString());
            Pipeline tempPipeline = Pipeline.create();
            tempPipeline.apply(generateSideInputData).apply(MapElements.via(new SimpleFunction<KV<Long, String>, String>(kv -> String.format("%s,%s", kv.getKey(), kv.getValue())) {
            })).apply(TextIO.write().withNumShards(config.sideInputNumShards).to(config.sideInputUrl));
            tempPipeline.run().waitUntilFinish();
            return queryPipeline.apply(TextIO.read().from(config.sideInputUrl + "*")).apply(MapElements.via(new SimpleFunction<String, KV<Long, String>>(line -> {
                List<String> cols = ImmutableList.copyOf(Splitter.on(",").split(line));
                return KV.of(Long.valueOf(cols.get(0)), cols.get(1));
            }) {
            }));
        default:
            throw new IllegalArgumentException(String.format("Unknown type of side input requested: %s", config.sideInputType));
    }
}
Also used : StateSpec(org.apache.beam.sdk.state.StateSpec) SerializableCoder(org.apache.beam.sdk.coders.SerializableCoder) CoderRegistry(org.apache.beam.sdk.coders.CoderRegistry) PBegin(org.apache.beam.sdk.values.PBegin) BidsPerSession(org.apache.beam.sdk.nexmark.model.BidsPerSession) Bid(org.apache.beam.sdk.nexmark.model.Bid) LoggerFactory(org.slf4j.LoggerFactory) ValueState(org.apache.beam.sdk.state.ValueState) AuctionBid(org.apache.beam.sdk.nexmark.model.AuctionBid) Auction(org.apache.beam.sdk.nexmark.model.Auction) SimpleFunction(org.apache.beam.sdk.transforms.SimpleFunction) Metrics(org.apache.beam.sdk.metrics.Metrics) Generator(org.apache.beam.sdk.nexmark.sources.generator.Generator) Strings(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.base.Strings) Create(org.apache.beam.sdk.transforms.Create) Window(org.apache.beam.sdk.transforms.windowing.Window) CategoryPrice(org.apache.beam.sdk.nexmark.model.CategoryPrice) TimestampedValue(org.apache.beam.sdk.values.TimestampedValue) CustomCoder(org.apache.beam.sdk.coders.CustomCoder) MapElements(org.apache.beam.sdk.transforms.MapElements) GenerateSequence(org.apache.beam.sdk.io.GenerateSequence) Collectors(java.util.stream.Collectors) StandardCharsets(java.nio.charset.StandardCharsets) CoderException(org.apache.beam.sdk.coders.CoderException) List(java.util.List) ParDo(org.apache.beam.sdk.transforms.ParDo) AuctionCount(org.apache.beam.sdk.nexmark.model.AuctionCount) ImmutableList(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableList) GlobalWindows(org.apache.beam.sdk.transforms.windowing.GlobalWindows) AuctionPrice(org.apache.beam.sdk.nexmark.model.AuctionPrice) AfterPane(org.apache.beam.sdk.transforms.windowing.AfterPane) JodaModule(com.fasterxml.jackson.datatype.joda.JodaModule) KV(org.apache.beam.sdk.values.KV) SellerPrice(org.apache.beam.sdk.nexmark.model.SellerPrice) GeneratorConfig(org.apache.beam.sdk.nexmark.sources.generator.GeneratorConfig) Combine(org.apache.beam.sdk.transforms.Combine) Duration(org.joda.time.Duration) Coder(org.apache.beam.sdk.coders.Coder) Splitter(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.base.Splitter) Done(org.apache.beam.sdk.nexmark.model.Done) PTransform(org.apache.beam.sdk.transforms.PTransform) Read(org.apache.beam.sdk.io.Read) Event(org.apache.beam.sdk.nexmark.model.Event) Preconditions.checkArgument(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.base.Preconditions.checkArgument) FormatMethod(com.google.errorprone.annotations.FormatMethod) FormatString(com.google.errorprone.annotations.FormatString) IdNameReserve(org.apache.beam.sdk.nexmark.model.IdNameReserve) Pipeline(org.apache.beam.sdk.Pipeline) Hashing(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.hash.Hashing) Person(org.apache.beam.sdk.nexmark.model.Person) OutputStream(java.io.OutputStream) DoFn(org.apache.beam.sdk.transforms.DoFn) UnboundedEventSource(org.apache.beam.sdk.nexmark.sources.UnboundedEventSource) Logger(org.slf4j.Logger) Iterator(java.util.Iterator) Counter(org.apache.beam.sdk.metrics.Counter) ObjectMapper(com.fasterxml.jackson.databind.ObjectMapper) IOException(java.io.IOException) PCollection(org.apache.beam.sdk.values.PCollection) AvroCoder(org.apache.beam.sdk.coders.AvroCoder) StateSpecs(org.apache.beam.sdk.state.StateSpecs) ByteArrayCoder(org.apache.beam.sdk.coders.ByteArrayCoder) BoundedWindow(org.apache.beam.sdk.transforms.windowing.BoundedWindow) Instant(org.joda.time.Instant) KnownSize(org.apache.beam.sdk.nexmark.model.KnownSize) 
BoundedEventSource(org.apache.beam.sdk.nexmark.sources.BoundedEventSource) FileSystems(org.apache.beam.sdk.io.FileSystems) TextIO(org.apache.beam.sdk.io.TextIO) NameCityStateId(org.apache.beam.sdk.nexmark.model.NameCityStateId) InputStream(java.io.InputStream) FormatString(com.google.errorprone.annotations.FormatString) KV(org.apache.beam.sdk.values.KV) PBegin(org.apache.beam.sdk.values.PBegin) Pipeline(org.apache.beam.sdk.Pipeline) PCollection(org.apache.beam.sdk.values.PCollection) SimpleFunction(org.apache.beam.sdk.transforms.SimpleFunction)
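
A hedged sketch of the consuming side (variable names are illustrative, assuming the usual View, PCollectionView, Map, and KV imports): the prepared KV<Long, String> collection is turned into a map-valued side input and joined against a stream of ids by looking up id % sideInputRowCount, matching the contract documented above:

PCollection<KV<Long, String>> sideRows = NexmarkUtils.prepareSideInput(queryPipeline, config);
final PCollectionView<Map<Long, String>> sideView = sideRows.apply(View.asMap());
final long rowCount = config.sideInputRowCount;

PCollection<KV<Long, String>> joined = queryPipeline
    .apply(GenerateSequence.from(0).to(100))
    .apply("LookupJoin", ParDo.of(new DoFn<Long, KV<Long, String>>() {

        @ProcessElement
        public void processElement(ProcessContext ctxt) {
            Map<Long, String> lookup = ctxt.sideInput(sideView);
            long id = ctxt.element();
            // Any id can be joined by reducing it into [0, sideInputRowCount).
            ctxt.output(KV.of(id, lookup.get(id % rowCount)));
        }
    }).withSideInputs(sideView));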

Example 9 with PTransform

Use of org.apache.beam.sdk.transforms.PTransform in project beam by apache.

The class ProjectionPushdownOptimizer, method optimize.

/**
 * Performs all known projection pushdown optimizations in-place on a Pipeline.
 *
 * <p>A pushdown optimization is possible wherever there is a {@link ProjectionProducer} that
 * produces a {@link PCollection} that is consumed by one or more PTransforms with an annotated
 * {@link FieldAccessDescriptor}, where the number of fields consumed is less than the number of
 * fields produced. The optimizer replaces the {@link ProjectionProducer} with the result of
 * calling {@link ProjectionProducer#actuateProjectionPushdown(Map)} on that producer with those
 * PCollections/fields.
 *
 * <p>Currently only supports pushdown on {@link ProjectionProducer} instances that are applied
 * directly to {@link PBegin} (https://issues.apache.org/jira/browse/BEAM-13658).
 */
public static void optimize(Pipeline pipeline) {
    // Compute which Schema fields are (or conversely, are not) accessed in a pipeline.
    FieldAccessVisitor fieldAccessVisitor = new FieldAccessVisitor();
    pipeline.traverseTopologically(fieldAccessVisitor);
    // Find transforms in this pipeline which both: 1. support projection pushdown and 2. output
    // unused fields.
    ProjectionProducerVisitor pushdownProjectorVisitor = new ProjectionProducerVisitor(fieldAccessVisitor.getPCollectionFieldAccess());
    pipeline.traverseTopologically(pushdownProjectorVisitor);
    Map<ProjectionProducer<PTransform<?, ?>>, Map<PCollection<?>, FieldAccessDescriptor>> pushdownOpportunities = pushdownProjectorVisitor.getPushdownOpportunities();
    // Translate target PCollections to their output TupleTags.
    PCollectionOutputTagVisitor outputTagVisitor = new PCollectionOutputTagVisitor(pushdownOpportunities);
    pipeline.traverseTopologically(outputTagVisitor);
    Map<ProjectionProducer<PTransform<?, ?>>, Map<TupleTag<?>, FieldAccessDescriptor>> taggedFieldAccess = outputTagVisitor.getTaggedFieldAccess();
    // Replace each pushdown-capable producer with a version that emits only the accessed fields.
    for (Entry<ProjectionProducer<PTransform<?, ?>>, Map<TupleTag<?>, FieldAccessDescriptor>> entry : taggedFieldAccess.entrySet()) {
        for (Entry<TupleTag<?>, FieldAccessDescriptor> outputFields : entry.getValue().entrySet()) {
            LOG.info("Optimizing transform {}: output {} will contain reduced field set {}", entry.getKey(), outputFields.getKey(), outputFields.getValue().fieldNamesAccessed());
        }
        PTransformMatcher matcher = application -> application.getTransform() == entry.getKey();
        PushdownOverrideFactory<?, ?> overrideFactory = new PushdownOverrideFactory<>(entry.getValue());
        pipeline.replaceAll(ImmutableList.of(PTransformOverride.of(matcher, overrideFactory)));
    }
}
Also used : Preconditions(org.apache.beam.sdk.util.Preconditions) PBegin(org.apache.beam.sdk.values.PBegin) Logger(org.slf4j.Logger) ProjectionProducer(org.apache.beam.sdk.schemas.ProjectionProducer) LoggerFactory(org.slf4j.LoggerFactory) PTransformOverride(org.apache.beam.sdk.runners.PTransformOverride) PCollection(org.apache.beam.sdk.values.PCollection) Collectors(java.util.stream.Collectors) PTransform(org.apache.beam.sdk.transforms.PTransform) POutput(org.apache.beam.sdk.values.POutput) PTransformOverrideFactory(org.apache.beam.sdk.runners.PTransformOverrideFactory) TupleTag(org.apache.beam.sdk.values.TupleTag) Map(java.util.Map) FieldAccessDescriptor(org.apache.beam.sdk.schemas.FieldAccessDescriptor) Iterables(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.Iterables) Entry(java.util.Map.Entry) TaggedPValue(org.apache.beam.sdk.values.TaggedPValue) ImmutableList(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableList) Pipeline(org.apache.beam.sdk.Pipeline) PTransformMatcher(org.apache.beam.sdk.runners.PTransformMatcher) SimpleEntry(java.util.AbstractMap.SimpleEntry) AppliedPTransform(org.apache.beam.sdk.runners.AppliedPTransform) FieldAccessDescriptor(org.apache.beam.sdk.schemas.FieldAccessDescriptor) PTransformMatcher(org.apache.beam.sdk.runners.PTransformMatcher) TupleTag(org.apache.beam.sdk.values.TupleTag) ProjectionProducer(org.apache.beam.sdk.schemas.ProjectionProducer) Map(java.util.Map)
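
For the optimizer to find unused fields, downstream transforms must declare what they read. A hedged sketch of that consumer side, based on Beam's schema @FieldAccess annotations (the schema and field names are hypothetical, Create is used only to provide a concrete schema'd input, and actual pushdown additionally requires the producer to be a ProjectionProducer applied directly to PBegin, as noted above):

Schema userSchema = Schema.builder()
    .addStringField("userId")
    .addStringField("country")
    .addInt64Field("visits")
    .build();

PCollection<Row> users = pipeline.apply(
    Create.of(
            Row.withSchema(userSchema).addValues("alice", "US", 3L).build(),
            Row.withSchema(userSchema).addValues("bob", "DE", 5L).build())
        .withRowSchema(userSchema));

users.apply(ParDo.of(new DoFn<Row, String>() {

    // Declares that only userId and country are read; FieldAccessVisitor aggregates these
    // descriptors per PCollection, so "visits" would be reported as unused.
    @FieldAccess("userFields")
    final FieldAccessDescriptor userFields = FieldAccessDescriptor.withFieldNames("userId", "country");

    @ProcessElement
    public void processElement(@FieldAccess("userFields") Row row, OutputReceiver<String> out) {
        out.output(row.getString("userId") + ":" + row.getString("country"));
    }
}));

// Run the in-place optimization before executing the pipeline.
ProjectionPushdownOptimizer.optimize(pipeline);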

Example 10 with PTransform

Use of org.apache.beam.sdk.transforms.PTransform in project beam by apache.

The class PTransformMatchersTest, method classEqualToDoesNotMatchSubclass.

@Test
public void classEqualToDoesNotMatchSubclass() {
    class MyPTransform extends PTransform<PCollection<KV<String, Integer>>, PCollection<Integer>> {

        @Override
        public PCollection<Integer> expand(PCollection<KV<String, Integer>> input) {
            return PCollection.createPrimitiveOutputInternal(input.getPipeline(), input.getWindowingStrategy(), input.isBounded(), VarIntCoder.of());
        }
    }
    PTransformMatcher matcher = PTransformMatchers.classEqualTo(MyPTransform.class);
    MyPTransform subclass = new MyPTransform() {
    };
    assertThat(subclass.getClass(), not(Matchers.<Class<?>>equalTo(MyPTransform.class)));
    assertThat(subclass, instanceOf(MyPTransform.class));
    AppliedPTransform<?, ?, ?> application = getAppliedTransform(subclass);
    assertThat(matcher.matches(application), is(false));
}
Also used : PCollection(org.apache.beam.sdk.values.PCollection) PTransformMatcher(org.apache.beam.sdk.runners.PTransformMatcher) KV(org.apache.beam.sdk.values.KV) PTransform(org.apache.beam.sdk.transforms.PTransform) AppliedPTransform(org.apache.beam.sdk.runners.AppliedPTransform) Test(org.junit.Test)
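
When subclasses should match, a PTransformMatcher can be written directly instead, since it is a single-method interface (the same lambda form used by ProjectionPushdownOptimizer above). A hedged sketch against the MyPTransform class and helpers from this test:

PTransformMatcher subclassMatcher =
    application -> application.getTransform() instanceof MyPTransform;

// Unlike PTransformMatchers.classEqualTo(MyPTransform.class), this also accepts the anonymous subclass.
assertThat(subclassMatcher.matches(getAppliedTransform(subclass)), is(true));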

Aggregations

PTransform (org.apache.beam.sdk.transforms.PTransform): 41
PCollection (org.apache.beam.sdk.values.PCollection): 29
Test (org.junit.Test): 18
AppliedPTransform (org.apache.beam.sdk.runners.AppliedPTransform): 11
PBegin (org.apache.beam.sdk.values.PBegin): 11
IOException (java.io.IOException): 10
ArrayList (java.util.ArrayList): 10
List (java.util.List): 10
Map (java.util.Map): 10
TupleTag (org.apache.beam.sdk.values.TupleTag): 10
DoFn (org.apache.beam.sdk.transforms.DoFn): 9
Coder (org.apache.beam.sdk.coders.Coder): 8
Create (org.apache.beam.sdk.transforms.Create): 8
ParDo (org.apache.beam.sdk.transforms.ParDo): 7
PDone (org.apache.beam.sdk.values.PDone): 7
PCollectionTuple (org.apache.beam.sdk.values.PCollectionTuple): 6
Collection (java.util.Collection): 5
HashMap (java.util.HashMap): 5
Collectors.toList (java.util.stream.Collectors.toList): 5
Schema (org.apache.beam.sdk.schemas.Schema): 5