Usage of org.apache.beam.sdk.io.Read in the Apache Beam project.
Example 1: class DataflowRunner, method getOverrides.
/**
 * Returns the ordered list of {@link PTransformOverride}s to apply for this pipeline.
 *
 * <p>Order is significant: earlier overrides are matched before later ones. In particular,
 * Create is implemented in terms of a Read, so its override must precede the Read overrides
 * applied in streaming mode.
 *
 * @param streaming whether the pipeline runs in streaming mode, which selects a different
 *     set of overrides than batch mode
 */
private List<PTransformOverride> getOverrides(boolean streaming) {
  ImmutableList.Builder<PTransformOverride> overrides = ImmutableList.builder();
  // Flatten normalization applies in both batch and streaming modes.
  overrides.add(
      PTransformOverride.of(
          PTransformMatchers.flattenWithDuplicateInputs(),
          DeduplicatedFlattenFactory.create()));
  overrides.add(
      PTransformOverride.of(
          PTransformMatchers.emptyFlatten(), EmptyFlattenAsCreateFactory.instance()));
  if (streaming) {
    // Pubsub reads/writes are replaced with Dataflow-native implementations unless the
    // corresponding experiment opts into the custom (SDK-level) implementation.
    if (!hasExperiment(options, "enable_custom_pubsub_source")) {
      overrides.add(
          PTransformOverride.of(
              PTransformMatchers.classEqualTo(PubsubUnboundedSource.class),
              new ReflectiveRootOverrideFactory(StreamingPubsubIORead.class, this)));
    }
    if (!hasExperiment(options, "enable_custom_pubsub_sink")) {
      overrides.add(
          PTransformOverride.of(
              PTransformMatchers.classEqualTo(PubsubUnboundedSink.class),
              new StreamingPubsubIOWriteOverrideFactory(this)));
    }
    // Create is implemented in terms of a Read, so the Create override above must precede it.
    overrides.add(
        PTransformOverride.of(
            PTransformMatchers.classEqualTo(Read.Bounded.class),
            new ReflectiveRootOverrideFactory(StreamingBoundedRead.class, this)));
    overrides.add(
        PTransformOverride.of(
            PTransformMatchers.classEqualTo(Read.Unbounded.class),
            new ReflectiveRootOverrideFactory(StreamingUnboundedRead.class, this)));
    overrides.add(
        PTransformOverride.of(
            PTransformMatchers.classEqualTo(View.CreatePCollectionView.class),
            new StreamingCreatePCollectionViewFactory()));
  } else {
    // Batch mode: stateful ParDo and the View family get batch-specific expansions.
    overrides.add(
        PTransformOverride.of(
            PTransformMatchers.stateOrTimerParDoMulti(),
            BatchStatefulParDoOverrides.multiOutputOverrideFactory()));
    overrides.add(
        PTransformOverride.of(
            PTransformMatchers.stateOrTimerParDoSingle(),
            BatchStatefulParDoOverrides.singleOutputOverrideFactory()));
    overrides.add(
        PTransformOverride.of(
            PTransformMatchers.classEqualTo(Combine.GloballyAsSingletonView.class),
            new BatchCombineGloballyAsSingletonViewFactory(this)));
    overrides.add(
        PTransformOverride.of(
            PTransformMatchers.classEqualTo(View.AsMap.class),
            new ReflectiveOneToOneOverrideFactory(BatchViewOverrides.BatchViewAsMap.class, this)));
    overrides.add(
        PTransformOverride.of(
            PTransformMatchers.classEqualTo(View.AsMultimap.class),
            new ReflectiveOneToOneOverrideFactory(
                BatchViewOverrides.BatchViewAsMultimap.class, this)));
    overrides.add(
        PTransformOverride.of(
            PTransformMatchers.classEqualTo(View.AsSingleton.class),
            new ReflectiveOneToOneOverrideFactory(
                BatchViewOverrides.BatchViewAsSingleton.class, this)));
    overrides.add(
        PTransformOverride.of(
            PTransformMatchers.classEqualTo(View.AsList.class),
            new ReflectiveOneToOneOverrideFactory(BatchViewOverrides.BatchViewAsList.class, this)));
    overrides.add(
        PTransformOverride.of(
            PTransformMatchers.classEqualTo(View.AsIterable.class),
            new ReflectiveOneToOneOverrideFactory(
                BatchViewOverrides.BatchViewAsIterable.class, this)));
  }
  // Mode-independent overrides, applied after the mode-specific ones.
  overrides.add(
      PTransformOverride.of(
          PTransformMatchers.classEqualTo(Reshuffle.class), new ReshuffleOverrideFactory()));
  overrides.add(
      PTransformOverride.of(
          PTransformMatchers.classEqualTo(Combine.GroupedValues.class),
          new PrimitiveCombineGroupedValuesOverrideFactory()));
  overrides.add(
      PTransformOverride.of(
          PTransformMatchers.classEqualTo(ParDo.SingleOutput.class),
          new PrimitiveParDoSingleFactory()));
  return overrides.build();
}
Usage of org.apache.beam.sdk.io.Read in the Apache Beam project.
Example 2: class NexmarkUtils, method prepareSideInput.
/**
 * Write data to be read as a side input.
 *
 * <p>Contains pairs of a number and its string representation to model lookups of some enrichment
 * data by id.
 *
 * <p>Generated data covers the range {@code [0, sideInputRowCount)} so lookup joins on any
 * desired id field can be modeled by looking up {@code id % sideInputRowCount}.
 *
 * @param queryPipeline the pipeline into which the side-input PCollection is read
 * @param config configuration carrying sideInputType, sideInputRowCount, sideInputNumShards,
 *     and sideInputUrl
 * @return a PCollection of (id, string-representation) pairs usable as a side input
 * @throws IllegalArgumentException if required configuration for the chosen side-input type
 *     is missing, or the type is unknown
 */
public static PCollection<KV<Long, String>> prepareSideInput(Pipeline queryPipeline, NexmarkConfiguration config) {
  checkArgument(config.sideInputRowCount > 0, "Side input required but sideInputRowCount is not >0");
  PTransform<PBegin, PCollection<KV<Long, String>>> sideInputGenerator = new GenerateSideInputData(config);
  switch(config.sideInputType) {
    case DIRECT:
      // Generate the side-input data directly inside the query pipeline.
      return queryPipeline.apply(sideInputGenerator);
    case CSV:
      checkArgument(config.sideInputUrl != null, "Side input type %s requires a URL but sideInputUrl not specified", SideInputType.CSV.toString());
      checkArgument(config.sideInputNumShards > 0, "Side input type %s requires explicit numShards but sideInputNumShards not specified", SideInputType.CSV.toString());
      // Anonymous subclasses (the trailing {}) are retained so Beam can infer output types.
      SimpleFunction<KV<Long, String>, String> formatAsCsv =
          new SimpleFunction<KV<Long, String>, String>(kv -> String.format("%s,%s", kv.getKey(), kv.getValue())) {
          };
      SimpleFunction<String, KV<Long, String>> parseCsvLine =
          new SimpleFunction<String, KV<Long, String>>(line -> {
            List<String> cols = ImmutableList.copyOf(Splitter.on(",").split(line));
            return KV.of(Long.valueOf(cols.get(0)), cols.get(1));
          }) {
          };
      // First materialize the generated data as CSV shards using a temporary pipeline,
      // run to completion, then read those shards back into the query pipeline.
      Pipeline writerPipeline = Pipeline.create();
      writerPipeline
          .apply(sideInputGenerator)
          .apply(MapElements.via(formatAsCsv))
          .apply(TextIO.write().withNumShards(config.sideInputNumShards).to(config.sideInputUrl));
      writerPipeline.run().waitUntilFinish();
      return queryPipeline
          .apply(TextIO.read().from(config.sideInputUrl + "*"))
          .apply(MapElements.via(parseCsvLine));
    default:
      throw new IllegalArgumentException(String.format("Unknown type of side input requested: %s", config.sideInputType));
  }
}
Usage of org.apache.beam.sdk.io.Read in the Apache Beam project.
Example 3: class TransformHierarchyTest, method visitVisitsAllPushed.
@Test
// Verifies that a full visit of the hierarchy reaches every pushed node (composite and
// primitive) plus the root, and reports every produced PValue exactly once.
public void visitVisitsAllPushed() {
TransformHierarchy.Node root = hierarchy.getCurrent();
PBegin begin = PBegin.in(pipeline);
Create.Values<Long> create = Create.of(1L);
Read.Bounded<Long> read = Read.from(CountingSource.upTo(1L));
// Primitive outputs are created directly rather than via expansion, so the test controls
// exactly which PValues exist in the hierarchy.
PCollection<Long> created = PCollection.createPrimitiveOutputInternal(pipeline, WindowingStrategy.globalDefault(), IsBounded.BOUNDED, VarLongCoder.of());
SingleOutput<Long, Long> pardo = ParDo.of(new DoFn<Long, Long>() {
@ProcessElement
public void processElement(ProcessContext ctxt) {
ctxt.output(ctxt.element());
}
});
PCollection<Long> mapped = PCollection.createPrimitiveOutputInternal(pipeline, WindowingStrategy.globalDefault(), IsBounded.BOUNDED, VarLongCoder.of());
// Push a composite "Create" node; it stays current until popped.
TransformHierarchy.Node compositeNode = hierarchy.pushNode("Create", begin, create);
hierarchy.finishSpecifyingInput();
assertThat(hierarchy.getCurrent(), equalTo(compositeNode));
assertThat(compositeNode.getInputs().entrySet(), Matchers.empty());
assertThat(compositeNode.getTransform(), equalTo(create));
// Not yet set
assertThat(compositeNode.getOutputs().entrySet(), Matchers.emptyIterable());
assertThat(compositeNode.getEnclosingNode().isRootNode(), is(true));
// Push a primitive "Create/Read" node nested inside the composite, give it an output,
// and pop back to the composite.
TransformHierarchy.Node primitiveNode = hierarchy.pushNode("Create/Read", begin, read);
assertThat(hierarchy.getCurrent(), equalTo(primitiveNode));
hierarchy.finishSpecifyingInput();
hierarchy.setOutput(created);
hierarchy.popNode();
assertThat(primitiveNode.getOutputs().values(), containsInAnyOrder(created));
assertThat(primitiveNode.getInputs().entrySet(), Matchers.emptyIterable());
assertThat(primitiveNode.getTransform(), equalTo(read));
assertThat(primitiveNode.getEnclosingNode(), equalTo(compositeNode));
// Set the same PValue as the composite's output and pop it as well.
hierarchy.setOutput(created);
// The composite is listed as outputting a PValue created by the contained primitive
assertThat(compositeNode.getOutputs().values(), containsInAnyOrder(created));
// The producer of that PValue is still the primitive in which it is first output
assertThat(hierarchy.getProducer(created), equalTo(primitiveNode));
hierarchy.popNode();
// Push a sibling primitive "ParDo" consuming the first node's output.
TransformHierarchy.Node otherPrimitive = hierarchy.pushNode("ParDo", created, pardo);
hierarchy.finishSpecifyingInput();
hierarchy.setOutput(mapped);
hierarchy.popNode();
// Record everything the visitor is shown, then compare against what was pushed.
final Set<TransformHierarchy.Node> visitedCompositeNodes = new HashSet<>();
final Set<TransformHierarchy.Node> visitedPrimitiveNodes = new HashSet<>();
final Set<PValue> visitedValuesInVisitor = new HashSet<>();
Set<PValue> visitedValues = hierarchy.visit(new PipelineVisitor.Defaults() {
@Override
public CompositeBehavior enterCompositeTransform(TransformHierarchy.Node node) {
visitedCompositeNodes.add(node);
return CompositeBehavior.ENTER_TRANSFORM;
}
@Override
public void visitPrimitiveTransform(TransformHierarchy.Node node) {
visitedPrimitiveNodes.add(node);
}
@Override
public void visitValue(PValue value, TransformHierarchy.Node producer) {
visitedValuesInVisitor.add(value);
}
});
assertThat(visitedCompositeNodes, containsInAnyOrder(root, compositeNode));
assertThat(visitedPrimitiveNodes, containsInAnyOrder(primitiveNode, otherPrimitive));
assertThat(visitedValuesInVisitor, containsInAnyOrder(created, mapped));
// The visitor's own record must agree with the set returned by visit().
assertThat(visitedValuesInVisitor, equalTo(visitedValues));
}
Aggregations