use of org.apache.beam.sdk.transforms.PTransform in project beam by apache.
the class TransformHierarchyTest method visitAfterReplace.
/**
* Tests that visiting the {@link TransformHierarchy} after replacing nodes does not visit any of
* the original nodes or inaccessible values but does visit all of the replacement nodes, new
* inaccessible replacement values, and the original output values.
*
* <p>The test registers an "Upstream" node and an "Original" node by hand, replaces "Original"
* with a composite containing a multi-output ParDo, and then checks which nodes and values a
* visitor observes.
*/
@Test
public void visitAfterReplace() {
// Remember the node that is current before anything is pushed; it is expected among the visited
// composites at the end of the test.
Node root = hierarchy.getCurrent();
// The transform that will later be replaced.
final SingleOutput<Long, Long> originalParDo = ParDo.of(new DoFn<Long, Long>() {
@ProcessElement
public void processElement(ProcessContext ctxt) {
ctxt.output(ctxt.element() + 1L);
}
});
// Apply the transforms to the pipeline to obtain the PCollections that are registered with the
// hierarchy under test below.
GenerateSequence genUpstream = GenerateSequence.from(0);
PCollection<Long> upstream = pipeline.apply(genUpstream);
PCollection<Long> output = upstream.apply("Original", originalParDo);
// Register the upstream node: push, finish its input, record its output, pop.
Node upstreamNode = hierarchy.pushNode("Upstream", pipeline.begin(), genUpstream);
hierarchy.finishSpecifyingInput();
hierarchy.setOutput(upstream);
hierarchy.popNode();
// Register the node for the original ParDo in the same way; this is the node replaced below.
Node original = hierarchy.pushNode("Original", upstream, originalParDo);
hierarchy.finishSpecifyingInput();
hierarchy.setOutput(output);
hierarchy.popNode();
// Build the replacement: a multi-output ParDo whose main output is tagged with `longs`,
// wrapped in a composite that returns only that tagged output.
final TupleTag<Long> longs = new TupleTag<>();
final MultiOutput<Long, Long> replacementParDo = ParDo.of(new DoFn<Long, Long>() {
@ProcessElement
public void processElement(ProcessContext ctxt) {
ctxt.output(ctxt.element() + 1L);
}
}).withOutputTags(longs, TupleTagList.empty());
PTransform<PCollection<Long>, PCollection<Long>> replacementComposite = new PTransform<PCollection<Long>, PCollection<Long>>() {
@Override
public PCollection<Long> expand(PCollection<Long> input) {
return input.apply("Contained", replacementParDo).get(longs);
}
};
// Apply the replacement ParDo directly so its output tuple can be registered with the hierarchy.
PCollectionTuple replacementOutput = upstream.apply("Contained", replacementParDo);
// Swap the original node for the composite.
// NOTE(review): the setOutput/popNode calls further down appear to rely on replaceNode leaving
// the new composite as the current node - confirm against TransformHierarchy.
Node compositeNode = hierarchy.replaceNode(original, upstream, replacementComposite);
// Register the replacement ParDo as a child named "Original/Contained".
Node replacementParNode = hierarchy.pushNode("Original/Contained", upstream, replacementParDo);
hierarchy.finishSpecifyingInput();
hierarchy.setOutput(replacementOutput);
hierarchy.popNode();
// Record the tagged PCollection as an output after the child node has been popped.
hierarchy.setOutput(replacementOutput.get(longs));
// The replacement tuple is expected to expand to exactly one tagged PCollection (getOnlyElement).
Map<TupleTag<?>, PCollection<?>> expandedReplacementOutput = (Map) replacementOutput.expand();
Entry<TupleTag<?>, PCollection<?>> replacementLongs = Iterables.getOnlyElement(expandedReplacementOutput.entrySet());
// Pair the replacement's tagged output with the original `output` value via ReplacementOutput.
hierarchy.replaceOutputs(Collections.singletonMap(replacementOutput.get(longs), ReplacementOutput.of(TaggedPValue.ofExpandedValue(output), TaggedPValue.of(replacementLongs.getKey(), replacementLongs.getValue()))));
hierarchy.popNode();
// Walk the hierarchy, recording every composite that is entered and every primitive visited;
// visit() itself returns the set of values it encountered.
final Set<Node> visitedCompositeNodes = new HashSet<>();
final Set<Node> visitedPrimitiveNodes = new HashSet<>();
Set<PValue> visitedValues = hierarchy.visit(new Defaults() {
@Override
public CompositeBehavior enterCompositeTransform(Node node) {
visitedCompositeNodes.add(node);
return CompositeBehavior.ENTER_TRANSFORM;
}
@Override
public void visitPrimitiveTransform(Node node) {
visitedPrimitiveNodes.add(node);
}
});
/*
Final Graph:
Upstream -> Upstream.out -> Composite -> (ReplacementParDo -> OriginalParDo.out)
*/
// Only the root and the replacement composite are entered; the original node is not.
assertThat(visitedCompositeNodes, containsInAnyOrder(root, compositeNode));
// Only the upstream primitive and the replacement ParDo are visited.
assertThat(visitedPrimitiveNodes, containsInAnyOrder(upstreamNode, replacementParNode));
// The visited values are the upstream collection and the original output collection.
assertThat(visitedValues, containsInAnyOrder(upstream, output));
}
use of org.apache.beam.sdk.transforms.PTransform in project beam by apache.
the class JavaClassLookupTransformProvider method findMappingConstructor.
/**
 * Selects the one constructor among {@code constructors} that corresponds to the constructor
 * described by {@code payload}.
 *
 * <p>A candidate qualifies when its parameter count equals the number of fields in the payload's
 * constructor schema and {@code parametersCompatible} accepts its parameters against the decoded
 * constructor row.
 *
 * @param constructors the constructors to search
 * @param payload supplies the constructor schema and the encoded constructor arguments
 * @return the single qualifying constructor
 * @throws RuntimeException if no constructor qualifies, or if more than one does
 */
private Constructor<PTransform<InputT, OutputT>> findMappingConstructor(Constructor<?>[] constructors, JavaClassLookupPayload payload) {
  Row constructorRow = decodeRow(payload.getConstructorSchema(), payload.getConstructorPayload());

  // The arity check runs first and short-circuits, so the compatibility check only sees
  // constructors with the expected number of parameters.
  List<Constructor<?>> candidates =
      Arrays.stream(constructors)
          .filter(
              candidate ->
                  candidate.getParameterCount() == payload.getConstructorSchema().getFieldsCount()
                      && parametersCompatible(candidate.getParameters(), constructorRow))
          .collect(Collectors.toList());

  int matchCount = candidates.size();
  if (matchCount == 1) {
    return (Constructor<PTransform<InputT, OutputT>>) candidates.get(0);
  }
  if (matchCount == 0) {
    throw new RuntimeException("Could not find a matching constructor. When using field names, make sure they are " + "available in the compiled Java class.");
  }
  throw new RuntimeException("Expected to find a single mapping constructor but found " + matchCount);
}
use of org.apache.beam.sdk.transforms.PTransform in project beam by apache.
the class NexmarkUtils method prepareSideInput.
/**
 * Builds the collection that queries read as a side input, materializing it first when the
 * configured side input type requires it.
 *
 * <p>The data consists of pairs of a number and its string representation, modeling lookups of
 * enrichment data by id. It covers the range {@code [0, sideInputRowCount)}, so a lookup join on
 * any id field can be modeled by looking up {@code id % sideInputRowCount}.
 *
 * <p>For {@code DIRECT} the generator is applied straight to {@code queryPipeline}. For {@code
 * CSV} a separate pipeline is created and run to completion to write the data as
 * comma-separated text under {@code config.sideInputUrl}, and {@code queryPipeline} then reads
 * and parses those files.
 *
 * @param queryPipeline the pipeline the returned collection belongs to
 * @param config the configuration providing the side input type, row count, URL and shard count
 * @return the side input collection, attached to {@code queryPipeline}
 * @throws IllegalArgumentException if the row count is not positive, if {@code CSV} is requested
 *     without a URL or a positive shard count, or if the side input type is not handled
 */
public static PCollection<KV<Long, String>> prepareSideInput(Pipeline queryPipeline, NexmarkConfiguration config) {
  checkArgument(config.sideInputRowCount > 0, "Side input required but sideInputRowCount is not >0");

  PTransform<PBegin, PCollection<KV<Long, String>>> generateSideInputData = new GenerateSideInputData(config);

  switch (config.sideInputType) {
    case DIRECT:
      return queryPipeline.apply(generateSideInputData);

    case CSV:
      {
        String csvTypeName = SideInputType.CSV.toString();
        checkArgument(config.sideInputUrl != null, "Side input type %s requires a URL but sideInputUrl not specified", csvTypeName);
        checkArgument(config.sideInputNumShards > 0, "Side input type %s requires explicit numShards but sideInputNumShards not specified", csvTypeName);

        // Step 1: write the generated pairs as "key,value" lines using a throwaway pipeline,
        // and block until that pipeline has finished.
        Pipeline writerPipeline = Pipeline.create();
        PCollection<String> csvLines =
            writerPipeline
                .apply(generateSideInputData)
                .apply(
                    MapElements.via(
                        new SimpleFunction<KV<Long, String>, String>(
                            pair -> String.format("%s,%s", pair.getKey(), pair.getValue())) {}));
        csvLines.apply(TextIO.write().withNumShards(config.sideInputNumShards).to(config.sideInputUrl));
        writerPipeline.run().waitUntilFinish();

        // Step 2: read every file whose name starts with the URL back into the query pipeline
        // and turn each line back into a (Long, String) pair.
        PCollection<String> readLines = queryPipeline.apply(TextIO.read().from(config.sideInputUrl + "*"));
        return readLines.apply(
            MapElements.via(
                new SimpleFunction<String, KV<Long, String>>(
                    csvLine -> {
                      List<String> fields = ImmutableList.copyOf(Splitter.on(",").split(csvLine));
                      return KV.of(Long.valueOf(fields.get(0)), fields.get(1));
                    }) {}));
      }

    default:
      throw new IllegalArgumentException(String.format("Unknown type of side input requested: %s", config.sideInputType));
  }
}
use of org.apache.beam.sdk.transforms.PTransform in project beam by apache.
the class ProjectionPushdownOptimizer method optimize.
/**
 * Applies every known projection pushdown optimization to a Pipeline, modifying it in place.
 *
 * <p>Pushdown is possible wherever a {@link ProjectionProducer} produces a {@link PCollection}
 * that is consumed by one or more PTransforms with an annotated {@link FieldAccessDescriptor},
 * and the consumers use fewer fields than the producer emits. Each such {@link
 * ProjectionProducer} is replaced with the result of calling {@link
 * ProjectionProducer#actuateProjectionPushdown(Map)} on it with those PCollections/fields.
 *
 * <p>Currently only supports pushdown on {@link ProjectionProducer} instances that are applied
 * directly to {@link PBegin} (https://issues.apache.org/jira/browse/BEAM-13658).
 *
 * @param pipeline the pipeline to traverse and rewrite
 */
public static void optimize(Pipeline pipeline) {
  // Traversal 1: work out which Schema fields are (or are not) accessed in the pipeline.
  FieldAccessVisitor accessVisitor = new FieldAccessVisitor();
  pipeline.traverseTopologically(accessVisitor);

  // Traversal 2: find the transforms that both support projection pushdown and output unused
  // fields.
  ProjectionProducerVisitor producerVisitor = new ProjectionProducerVisitor(accessVisitor.getPCollectionFieldAccess());
  pipeline.traverseTopologically(producerVisitor);
  Map<ProjectionProducer<PTransform<?, ?>>, Map<PCollection<?>, FieldAccessDescriptor>> opportunities = producerVisitor.getPushdownOpportunities();

  // Traversal 3: translate the target PCollections to their output TupleTags.
  PCollectionOutputTagVisitor tagVisitor = new PCollectionOutputTagVisitor(opportunities);
  pipeline.traverseTopologically(tagVisitor);
  Map<ProjectionProducer<PTransform<?, ?>>, Map<TupleTag<?>, FieldAccessDescriptor>> fieldAccessByProducer = tagVisitor.getTaggedFieldAccess();

  // Install one override per producer, built from that producer's per-tag field access.
  for (Entry<ProjectionProducer<PTransform<?, ?>>, Map<TupleTag<?>, FieldAccessDescriptor>> opportunity : fieldAccessByProducer.entrySet()) {
    ProjectionProducer<PTransform<?, ?>> producer = opportunity.getKey();
    Map<TupleTag<?>, FieldAccessDescriptor> fieldsByTag = opportunity.getValue();

    fieldsByTag.forEach(
        (tag, fields) ->
            LOG.info("Optimizing transform {}: output {} will contain reduced field set {}", producer, tag, fields.fieldNamesAccessed()));

    // Match by identity so that only this exact producer instance is replaced.
    PTransformMatcher matchesProducer = applied -> applied.getTransform() == producer;
    PushdownOverrideFactory<?, ?> factory = new PushdownOverrideFactory<>(fieldsByTag);
    pipeline.replaceAll(ImmutableList.of(PTransformOverride.of(matchesProducer, factory)));
  }
}
use of org.apache.beam.sdk.transforms.PTransform in project beam by apache.
the class PTransformMatchersTest method classEqualToDoesNotMatchSubclass.
/**
 * Verifies that the matcher returned by {@code PTransformMatchers.classEqualTo} does not match
 * an application whose transform is an anonymous subclass of the requested class.
 */
@Test
public void classEqualToDoesNotMatchSubclass() {
  class MyPTransform extends PTransform<PCollection<KV<String, Integer>>, PCollection<Integer>> {
    @Override
    public PCollection<Integer> expand(PCollection<KV<String, Integer>> input) {
      return PCollection.createPrimitiveOutputInternal(
          input.getPipeline(), input.getWindowingStrategy(), input.isBounded(), VarIntCoder.of());
    }
  }

  // An anonymous subclass: an instance of MyPTransform whose runtime class is a different class.
  MyPTransform anonymousSubclass = new MyPTransform() {};
  assertThat(anonymousSubclass.getClass(), not(Matchers.<Class<?>>equalTo(MyPTransform.class)));
  assertThat(anonymousSubclass, instanceOf(MyPTransform.class));

  PTransformMatcher exactClassMatcher = PTransformMatchers.classEqualTo(MyPTransform.class);
  AppliedPTransform<?, ?, ?> applied = getAppliedTransform(anonymousSubclass);
  boolean matched = exactClassMatcher.matches(applied);

  assertThat(matched, is(false));
}
Aggregations