Search in sources:

Example 1 with Read

Use of org.apache.beam.sdk.io.Read in project beam by apache.

From the class DataflowRunner, method getOverrides:

private List<PTransformOverride> getOverrides(boolean streaming) {
    ImmutableList.Builder<PTransformOverride> overridesBuilder = ImmutableList.builder();
    // Create is implemented in terms of a Read, so it must precede the override to Read in
    // streaming
    overridesBuilder
        .add(
            PTransformOverride.of(
                PTransformMatchers.flattenWithDuplicateInputs(),
                DeduplicatedFlattenFactory.create()))
        .add(
            PTransformOverride.of(
                PTransformMatchers.emptyFlatten(), EmptyFlattenAsCreateFactory.instance()));
    if (streaming) {
        if (!hasExperiment(options, "enable_custom_pubsub_source")) {
            overridesBuilder.add(
                PTransformOverride.of(
                    PTransformMatchers.classEqualTo(PubsubUnboundedSource.class),
                    new ReflectiveRootOverrideFactory(StreamingPubsubIORead.class, this)));
        }
        if (!hasExperiment(options, "enable_custom_pubsub_sink")) {
            overridesBuilder.add(
                PTransformOverride.of(
                    PTransformMatchers.classEqualTo(PubsubUnboundedSink.class),
                    new StreamingPubsubIOWriteOverrideFactory(this)));
        }
        overridesBuilder
            .add(
                // must precede it
                PTransformOverride.of(
                    PTransformMatchers.classEqualTo(Read.Bounded.class),
                    new ReflectiveRootOverrideFactory(StreamingBoundedRead.class, this)))
            .add(
                PTransformOverride.of(
                    PTransformMatchers.classEqualTo(Read.Unbounded.class),
                    new ReflectiveRootOverrideFactory(StreamingUnboundedRead.class, this)))
            .add(
                PTransformOverride.of(
                    PTransformMatchers.classEqualTo(View.CreatePCollectionView.class),
                    new StreamingCreatePCollectionViewFactory()));
    } else {
        overridesBuilder
            .add(
                PTransformOverride.of(
                    PTransformMatchers.stateOrTimerParDoMulti(),
                    BatchStatefulParDoOverrides.multiOutputOverrideFactory()))
            .add(
                PTransformOverride.of(
                    PTransformMatchers.stateOrTimerParDoSingle(),
                    BatchStatefulParDoOverrides.singleOutputOverrideFactory()))
            .add(
                PTransformOverride.of(
                    PTransformMatchers.classEqualTo(Combine.GloballyAsSingletonView.class),
                    new BatchCombineGloballyAsSingletonViewFactory(this)))
            .add(
                PTransformOverride.of(
                    PTransformMatchers.classEqualTo(View.AsMap.class),
                    new ReflectiveOneToOneOverrideFactory(BatchViewOverrides.BatchViewAsMap.class, this)))
            .add(
                PTransformOverride.of(
                    PTransformMatchers.classEqualTo(View.AsMultimap.class),
                    new ReflectiveOneToOneOverrideFactory(BatchViewOverrides.BatchViewAsMultimap.class, this)))
            .add(
                PTransformOverride.of(
                    PTransformMatchers.classEqualTo(View.AsSingleton.class),
                    new ReflectiveOneToOneOverrideFactory(BatchViewOverrides.BatchViewAsSingleton.class, this)))
            .add(
                PTransformOverride.of(
                    PTransformMatchers.classEqualTo(View.AsList.class),
                    new ReflectiveOneToOneOverrideFactory(BatchViewOverrides.BatchViewAsList.class, this)))
            .add(
                PTransformOverride.of(
                    PTransformMatchers.classEqualTo(View.AsIterable.class),
                    new ReflectiveOneToOneOverrideFactory(BatchViewOverrides.BatchViewAsIterable.class, this)));
    }
    overridesBuilder
        .add(
            PTransformOverride.of(
                PTransformMatchers.classEqualTo(Reshuffle.class), new ReshuffleOverrideFactory()))
        .add(
            PTransformOverride.of(
                PTransformMatchers.classEqualTo(Combine.GroupedValues.class),
                new PrimitiveCombineGroupedValuesOverrideFactory()))
        .add(
            PTransformOverride.of(
                PTransformMatchers.classEqualTo(ParDo.SingleOutput.class),
                new PrimitiveParDoSingleFactory()));
    return overridesBuilder.build();
}
Also used : BatchCombineGloballyAsSingletonViewFactory(org.apache.beam.runners.dataflow.BatchViewOverrides.BatchCombineGloballyAsSingletonViewFactory) ImmutableList(com.google.common.collect.ImmutableList) PTransformOverride(org.apache.beam.sdk.runners.PTransformOverride) View(org.apache.beam.sdk.transforms.View) PCollectionView(org.apache.beam.sdk.values.PCollectionView) GroupedValues(org.apache.beam.sdk.transforms.Combine.GroupedValues) StreamingCreatePCollectionViewFactory(org.apache.beam.runners.dataflow.StreamingViewOverrides.StreamingCreatePCollectionViewFactory) Read(org.apache.beam.sdk.io.Read) ParDo(org.apache.beam.sdk.transforms.ParDo)
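
For context, the Read.Bounded and Read.Unbounded transforms matched by classEqualTo above are produced by Read.from(...). The following is a minimal sketch, not taken from DataflowRunner, of how such primitives enter a pipeline; CountingSource is used purely as a stand-in source, and Pipeline and PCollection are assumed to be imported.

Pipeline p = Pipeline.create();

// Read.Bounded<Long>, the class matched by PTransformMatchers.classEqualTo(Read.Bounded.class);
// in streaming mode the override above swaps it for StreamingBoundedRead.
PCollection<Long> bounded = p.apply("BoundedRead", Read.from(CountingSource.upTo(10L)));

// Read.Unbounded<Long>, matched by PTransformMatchers.classEqualTo(Read.Unbounded.class);
// the override above swaps it for StreamingUnboundedRead.
PCollection<Long> unbounded = p.apply("UnboundedRead", Read.from(CountingSource.unbounded()));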

Example 2 with Read

Use of org.apache.beam.sdk.io.Read in project beam by apache.

From the class NexmarkUtils, method prepareSideInput:

/**
 * Write data to be read as a side input.
 *
 * <p>Contains pairs of a number and its string representation to model lookups of some enrichment
 * data by id.
 *
 * <p>Generated data covers the range {@code [0, sideInputRowCount)} so lookup joins on any
 * desired id field can be modeled by looking up {@code id % sideInputRowCount}.
 */
public static PCollection<KV<Long, String>> prepareSideInput(Pipeline queryPipeline, NexmarkConfiguration config) {
    checkArgument(config.sideInputRowCount > 0, "Side input required but sideInputRowCount is not >0");
    PTransform<PBegin, PCollection<KV<Long, String>>> generateSideInputData = new GenerateSideInputData(config);
    switch(config.sideInputType) {
        case DIRECT:
            return queryPipeline.apply(generateSideInputData);
        case CSV:
            checkArgument(config.sideInputUrl != null, "Side input type %s requires a URL but sideInputUrl not specified", SideInputType.CSV.toString());
            checkArgument(config.sideInputNumShards > 0, "Side input type %s requires explicit numShards but sideInputNumShards not specified", SideInputType.CSV.toString());
            Pipeline tempPipeline = Pipeline.create();
            tempPipeline
                .apply(generateSideInputData)
                .apply(
                    MapElements.via(
                        new SimpleFunction<KV<Long, String>, String>(
                            kv -> String.format("%s,%s", kv.getKey(), kv.getValue())) {}))
                .apply(
                    TextIO.write().withNumShards(config.sideInputNumShards).to(config.sideInputUrl));
            tempPipeline.run().waitUntilFinish();
            return queryPipeline
                .apply(TextIO.read().from(config.sideInputUrl + "*"))
                .apply(
                    MapElements.via(
                        new SimpleFunction<String, KV<Long, String>>(
                            line -> {
                                List<String> cols = ImmutableList.copyOf(Splitter.on(",").split(line));
                                return KV.of(Long.valueOf(cols.get(0)), cols.get(1));
                            }) {}));
        default:
            throw new IllegalArgumentException(String.format("Unknown type of side input requested: %s", config.sideInputType));
    }
}
Also used : StateSpec(org.apache.beam.sdk.state.StateSpec) SerializableCoder(org.apache.beam.sdk.coders.SerializableCoder) CoderRegistry(org.apache.beam.sdk.coders.CoderRegistry) PBegin(org.apache.beam.sdk.values.PBegin) BidsPerSession(org.apache.beam.sdk.nexmark.model.BidsPerSession) Bid(org.apache.beam.sdk.nexmark.model.Bid) LoggerFactory(org.slf4j.LoggerFactory) ValueState(org.apache.beam.sdk.state.ValueState) AuctionBid(org.apache.beam.sdk.nexmark.model.AuctionBid) Auction(org.apache.beam.sdk.nexmark.model.Auction) SimpleFunction(org.apache.beam.sdk.transforms.SimpleFunction) Metrics(org.apache.beam.sdk.metrics.Metrics) Generator(org.apache.beam.sdk.nexmark.sources.generator.Generator) Strings(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.base.Strings) Create(org.apache.beam.sdk.transforms.Create) Window(org.apache.beam.sdk.transforms.windowing.Window) CategoryPrice(org.apache.beam.sdk.nexmark.model.CategoryPrice) TimestampedValue(org.apache.beam.sdk.values.TimestampedValue) CustomCoder(org.apache.beam.sdk.coders.CustomCoder) MapElements(org.apache.beam.sdk.transforms.MapElements) GenerateSequence(org.apache.beam.sdk.io.GenerateSequence) Collectors(java.util.stream.Collectors) StandardCharsets(java.nio.charset.StandardCharsets) CoderException(org.apache.beam.sdk.coders.CoderException) List(java.util.List) ParDo(org.apache.beam.sdk.transforms.ParDo) AuctionCount(org.apache.beam.sdk.nexmark.model.AuctionCount) ImmutableList(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.collect.ImmutableList) GlobalWindows(org.apache.beam.sdk.transforms.windowing.GlobalWindows) AuctionPrice(org.apache.beam.sdk.nexmark.model.AuctionPrice) AfterPane(org.apache.beam.sdk.transforms.windowing.AfterPane) JodaModule(com.fasterxml.jackson.datatype.joda.JodaModule) KV(org.apache.beam.sdk.values.KV) SellerPrice(org.apache.beam.sdk.nexmark.model.SellerPrice) GeneratorConfig(org.apache.beam.sdk.nexmark.sources.generator.GeneratorConfig) Combine(org.apache.beam.sdk.transforms.Combine) Duration(org.joda.time.Duration) Coder(org.apache.beam.sdk.coders.Coder) Splitter(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.base.Splitter) Done(org.apache.beam.sdk.nexmark.model.Done) PTransform(org.apache.beam.sdk.transforms.PTransform) Read(org.apache.beam.sdk.io.Read) Event(org.apache.beam.sdk.nexmark.model.Event) Preconditions.checkArgument(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.base.Preconditions.checkArgument) FormatMethod(com.google.errorprone.annotations.FormatMethod) FormatString(com.google.errorprone.annotations.FormatString) IdNameReserve(org.apache.beam.sdk.nexmark.model.IdNameReserve) Pipeline(org.apache.beam.sdk.Pipeline) Hashing(org.apache.beam.vendor.guava.v26_0_jre.com.google.common.hash.Hashing) Person(org.apache.beam.sdk.nexmark.model.Person) OutputStream(java.io.OutputStream) DoFn(org.apache.beam.sdk.transforms.DoFn) UnboundedEventSource(org.apache.beam.sdk.nexmark.sources.UnboundedEventSource) Logger(org.slf4j.Logger) Iterator(java.util.Iterator) Counter(org.apache.beam.sdk.metrics.Counter) ObjectMapper(com.fasterxml.jackson.databind.ObjectMapper) IOException(java.io.IOException) PCollection(org.apache.beam.sdk.values.PCollection) AvroCoder(org.apache.beam.sdk.coders.AvroCoder) StateSpecs(org.apache.beam.sdk.state.StateSpecs) ByteArrayCoder(org.apache.beam.sdk.coders.ByteArrayCoder) BoundedWindow(org.apache.beam.sdk.transforms.windowing.BoundedWindow) Instant(org.joda.time.Instant) KnownSize(org.apache.beam.sdk.nexmark.model.KnownSize) 
BoundedEventSource(org.apache.beam.sdk.nexmark.sources.BoundedEventSource) FileSystems(org.apache.beam.sdk.io.FileSystems) TextIO(org.apache.beam.sdk.io.TextIO) NameCityStateId(org.apache.beam.sdk.nexmark.model.NameCityStateId) InputStream(java.io.InputStream)
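
The javadoc above models a lookup join on id % sideInputRowCount. The following is a hedged sketch, not part of NexmarkUtils, of how the prepared side input could be consumed; the helper name enrichByIdSketch and its parameters are illustrative assumptions, and it additionally relies on View, PCollectionView, DoFn, and java.util.Map.

/** Sketch only: enrich a PCollection of ids against the side input prepared above. */
static PCollection<KV<Long, String>> enrichByIdSketch(
    PCollection<Long> ids, PCollection<KV<Long, String>> sideInput, final long sideInputRowCount) {
    // Materialize the side input as a Map view so each element can do an in-memory lookup.
    final PCollectionView<Map<Long, String>> view = sideInput.apply(View.asMap());
    return ids.apply(
        "LookupById",
        ParDo.of(
                new DoFn<Long, KV<Long, String>>() {

                    @ProcessElement
                    public void processElement(ProcessContext c) {
                        // Model the lookup described above: fetch the row for id % sideInputRowCount.
                        String value = c.sideInput(view).get(c.element() % sideInputRowCount);
                        c.output(KV.of(c.element(), value));
                    }
                })
            .withSideInputs(view));
}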

Example 3 with Read

Use of org.apache.beam.sdk.io.Read in project beam by apache.

From the class TransformHierarchyTest, method visitVisitsAllPushed:

@Test
public void visitVisitsAllPushed() {
    TransformHierarchy.Node root = hierarchy.getCurrent();
    PBegin begin = PBegin.in(pipeline);
    Create.Values<Long> create = Create.of(1L);
    Read.Bounded<Long> read = Read.from(CountingSource.upTo(1L));
    PCollection<Long> created = PCollection.createPrimitiveOutputInternal(pipeline, WindowingStrategy.globalDefault(), IsBounded.BOUNDED, VarLongCoder.of());
    SingleOutput<Long, Long> pardo = ParDo.of(new DoFn<Long, Long>() {

        @ProcessElement
        public void processElement(ProcessContext ctxt) {
            ctxt.output(ctxt.element());
        }
    });
    PCollection<Long> mapped = PCollection.createPrimitiveOutputInternal(pipeline, WindowingStrategy.globalDefault(), IsBounded.BOUNDED, VarLongCoder.of());
    TransformHierarchy.Node compositeNode = hierarchy.pushNode("Create", begin, create);
    hierarchy.finishSpecifyingInput();
    assertThat(hierarchy.getCurrent(), equalTo(compositeNode));
    assertThat(compositeNode.getInputs().entrySet(), Matchers.empty());
    assertThat(compositeNode.getTransform(), equalTo(create));
    // Not yet set
    assertThat(compositeNode.getOutputs().entrySet(), Matchers.emptyIterable());
    assertThat(compositeNode.getEnclosingNode().isRootNode(), is(true));
    TransformHierarchy.Node primitiveNode = hierarchy.pushNode("Create/Read", begin, read);
    assertThat(hierarchy.getCurrent(), equalTo(primitiveNode));
    hierarchy.finishSpecifyingInput();
    hierarchy.setOutput(created);
    hierarchy.popNode();
    assertThat(primitiveNode.getOutputs().values(), containsInAnyOrder(created));
    assertThat(primitiveNode.getInputs().entrySet(), Matchers.emptyIterable());
    assertThat(primitiveNode.getTransform(), equalTo(read));
    assertThat(primitiveNode.getEnclosingNode(), equalTo(compositeNode));
    hierarchy.setOutput(created);
    // The composite is listed as outputting a PValue created by the contained primitive
    assertThat(compositeNode.getOutputs().values(), containsInAnyOrder(created));
    // The producer of that PValue is still the primitive in which it is first output
    assertThat(hierarchy.getProducer(created), equalTo(primitiveNode));
    hierarchy.popNode();
    TransformHierarchy.Node otherPrimitive = hierarchy.pushNode("ParDo", created, pardo);
    hierarchy.finishSpecifyingInput();
    hierarchy.setOutput(mapped);
    hierarchy.popNode();
    final Set<TransformHierarchy.Node> visitedCompositeNodes = new HashSet<>();
    final Set<TransformHierarchy.Node> visitedPrimitiveNodes = new HashSet<>();
    final Set<PValue> visitedValuesInVisitor = new HashSet<>();
    Set<PValue> visitedValues = hierarchy.visit(new PipelineVisitor.Defaults() {

        @Override
        public CompositeBehavior enterCompositeTransform(TransformHierarchy.Node node) {
            visitedCompositeNodes.add(node);
            return CompositeBehavior.ENTER_TRANSFORM;
        }

        @Override
        public void visitPrimitiveTransform(TransformHierarchy.Node node) {
            visitedPrimitiveNodes.add(node);
        }

        @Override
        public void visitValue(PValue value, TransformHierarchy.Node producer) {
            visitedValuesInVisitor.add(value);
        }
    });
    assertThat(visitedCompositeNodes, containsInAnyOrder(root, compositeNode));
    assertThat(visitedPrimitiveNodes, containsInAnyOrder(primitiveNode, otherPrimitive));
    assertThat(visitedValuesInVisitor, containsInAnyOrder(created, mapped));
    assertThat(visitedValuesInVisitor, equalTo(visitedValues));
}
Also used : Node(org.apache.beam.sdk.runners.TransformHierarchy.Node) Node(org.apache.beam.sdk.runners.TransformHierarchy.Node) Defaults(org.apache.beam.sdk.Pipeline.PipelineVisitor.Defaults) PBegin(org.apache.beam.sdk.values.PBegin) TaggedPValue(org.apache.beam.sdk.values.TaggedPValue) PValue(org.apache.beam.sdk.values.PValue) Read(org.apache.beam.sdk.io.Read) Create(org.apache.beam.sdk.transforms.Create) PipelineVisitor(org.apache.beam.sdk.Pipeline.PipelineVisitor) HashSet(java.util.HashSet) Test(org.junit.Test)
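
The test above drives TransformHierarchy directly; in ordinary use the same visitor callbacks are reached through Pipeline.traverseTopologically. Below is a minimal sketch, assuming a small locally built pipeline (Create plus MapElements) rather than the test fixture; the names printed in the comments are illustrative guesses.

Pipeline p = Pipeline.create();
p.apply("Numbers", Create.of(1L, 2L, 3L))
    .apply("Identity", MapElements.into(TypeDescriptors.longs()).via((Long x) -> x));

p.traverseTopologically(
    new Pipeline.PipelineVisitor.Defaults() {

        @Override
        public CompositeBehavior enterCompositeTransform(TransformHierarchy.Node node) {
            // Composites (e.g. the Create under "Numbers") are entered before their inner primitives.
            System.out.println("composite: " + node.getFullName());
            return CompositeBehavior.ENTER_TRANSFORM;
        }

        @Override
        public void visitPrimitiveTransform(TransformHierarchy.Node node) {
            // Primitives, e.g. the Read produced by Create and the ParDo produced by MapElements.
            System.out.println("primitive: " + node.getFullName());
        }

        @Override
        public void visitValue(PValue value, TransformHierarchy.Node producer) {
            System.out.println("value produced by: " + producer.getFullName());
        }
    });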

Aggregations

Read (org.apache.beam.sdk.io.Read): 3
ParDo (org.apache.beam.sdk.transforms.ParDo): 2
ObjectMapper (com.fasterxml.jackson.databind.ObjectMapper): 1
JodaModule (com.fasterxml.jackson.datatype.joda.JodaModule): 1
ImmutableList (com.google.common.collect.ImmutableList): 1
FormatMethod (com.google.errorprone.annotations.FormatMethod): 1
FormatString (com.google.errorprone.annotations.FormatString): 1
IOException (java.io.IOException): 1
InputStream (java.io.InputStream): 1
OutputStream (java.io.OutputStream): 1
StandardCharsets (java.nio.charset.StandardCharsets): 1
HashSet (java.util.HashSet): 1
Iterator (java.util.Iterator): 1
List (java.util.List): 1
Collectors (java.util.stream.Collectors): 1
BatchCombineGloballyAsSingletonViewFactory (org.apache.beam.runners.dataflow.BatchViewOverrides.BatchCombineGloballyAsSingletonViewFactory): 1
StreamingCreatePCollectionViewFactory (org.apache.beam.runners.dataflow.StreamingViewOverrides.StreamingCreatePCollectionViewFactory): 1
Pipeline (org.apache.beam.sdk.Pipeline): 1
PipelineVisitor (org.apache.beam.sdk.Pipeline.PipelineVisitor): 1
Defaults (org.apache.beam.sdk.Pipeline.PipelineVisitor.Defaults): 1