Example 6 with PCollection

Use of org.apache.beam.sdk.values.PCollection in project beam by apache.

The class ParDoTranslator, method translate.

@Override
public void translate(ParDo.MultiOutput<InputT, OutputT> transform, TranslationContext context) {
    DoFn<InputT, OutputT> doFn = transform.getFn();
    DoFnSignature signature = DoFnSignatures.getSignature(doFn.getClass());
    if (signature.processElement().isSplittable()) {
        throw new UnsupportedOperationException(String.format(
            "%s does not support splittable DoFn: %s",
            ApexRunner.class.getSimpleName(), doFn));
    }
    if (signature.stateDeclarations().size() > 0) {
        throw new UnsupportedOperationException(String.format(
            "Found %s annotations on %s, but %s cannot yet be used with state in the %s.",
            DoFn.StateId.class.getSimpleName(), doFn.getClass().getName(),
            DoFn.class.getSimpleName(), ApexRunner.class.getSimpleName()));
    }
    if (signature.timerDeclarations().size() > 0) {
        throw new UnsupportedOperationException(String.format(
            "Found %s annotations on %s, but %s cannot yet be used with timers in the %s.",
            DoFn.TimerId.class.getSimpleName(), doFn.getClass().getName(),
            DoFn.class.getSimpleName(), ApexRunner.class.getSimpleName()));
    }
    Map<TupleTag<?>, PValue> outputs = context.getOutputs();
    PCollection<InputT> input = context.getInput();
    List<PCollectionView<?>> sideInputs = transform.getSideInputs();
    Coder<InputT> inputCoder = input.getCoder();
    WindowedValueCoder<InputT> wvInputCoder = FullWindowedValueCoder.of(
        inputCoder, input.getWindowingStrategy().getWindowFn().windowCoder());
    ApexParDoOperator<InputT, OutputT> operator = new ApexParDoOperator<>(
        context.getPipelineOptions(), doFn, transform.getMainOutputTag(),
        transform.getAdditionalOutputTags().getAll(), input.getWindowingStrategy(),
        sideInputs, wvInputCoder, context.getStateBackend());
    Map<PCollection<?>, OutputPort<?>> ports = Maps.newHashMapWithExpectedSize(outputs.size());
    for (Entry<TupleTag<?>, PValue> output : outputs.entrySet()) {
        checkArgument(output.getValue() instanceof PCollection,
            "%s %s outputs non-PCollection %s of type %s",
            ParDo.MultiOutput.class.getSimpleName(), context.getFullName(),
            output.getValue(), output.getValue().getClass().getSimpleName());
        PCollection<?> pc = (PCollection<?>) output.getValue();
        if (output.getKey().equals(transform.getMainOutputTag())) {
            ports.put(pc, operator.output);
        } else {
            int portIndex = 0;
            for (TupleTag<?> tag : transform.getAdditionalOutputTags().getAll()) {
                if (tag.equals(output.getKey())) {
                    ports.put(pc, operator.additionalOutputPorts[portIndex]);
                    break;
                }
                portIndex++;
            }
        }
    }
    context.addOperator(operator, ports);
    context.addStream(context.getInput(), operator.input);
    if (!sideInputs.isEmpty()) {
        addSideInputs(operator.sideInput1, sideInputs, context);
    }
}
Also used: OutputPort (com.datatorrent.api.Operator.OutputPort), TupleTag (org.apache.beam.sdk.values.TupleTag), ApexParDoOperator (org.apache.beam.runners.apex.translation.operators.ApexParDoOperator), PValue (org.apache.beam.sdk.values.PValue), PCollection (org.apache.beam.sdk.values.PCollection), PCollectionView (org.apache.beam.sdk.values.PCollectionView), DoFn (org.apache.beam.sdk.transforms.DoFn), DoFnSignature (org.apache.beam.sdk.transforms.reflect.DoFnSignature)
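
For context, a minimal sketch of the kind of multi-output ParDo this translator receives. The pipeline, tag names, and DoFn body below are hypothetical; they only illustrate the main-output/additional-output wiring that translate() maps onto operator ports.

import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.transforms.DoFn;
import org.apache.beam.sdk.transforms.ParDo;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.PCollectionTuple;
import org.apache.beam.sdk.values.TupleTag;
import org.apache.beam.sdk.values.TupleTagList;

// Hypothetical tags: one main output and one additional output.
final TupleTag<String> mainTag = new TupleTag<String>() {};
final TupleTag<String> emptyTag = new TupleTag<String>() {};

Pipeline p = Pipeline.create(PipelineOptionsFactory.create());
PCollection<String> input = p.apply(Create.of("a", "", "b"));

PCollectionTuple outputs = input.apply(ParDo
    .of(new DoFn<String, String>() {
        @ProcessElement
        public void processElement(ProcessContext c) {
            if (c.element().isEmpty()) {
                c.output(emptyTag, c.element()); // routed to an additional output port
            } else {
                c.output(c.element()); // routed to the main output port
            }
        }
    })
    .withOutputTags(mainTag, TupleTagList.of(emptyTag)));

In the translator above, outputs.get(mainTag) would map to operator.output, and outputs.get(emptyTag) to one of operator.additionalOutputPorts.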

Example 7 with PCollection

Use of org.apache.beam.sdk.values.PCollection in project beam by apache.

The class TranslationContext, method populateDAG.

public void populateDAG(DAG dag) {
    for (Map.Entry<String, Operator> nameAndOperator : this.operators.entrySet()) {
        dag.addOperator(nameAndOperator.getKey(), nameAndOperator.getValue());
    }
    int streamIndex = 0;
    for (Map.Entry<PCollection, Pair<OutputPortInfo, List<InputPortInfo>>> streamEntry : this.streams.entrySet()) {
        List<InputPortInfo> destInfo = streamEntry.getValue().getRight();
        InputPort[] sinks = new InputPort[destInfo.size()];
        for (int i = 0; i < sinks.length; i++) {
            sinks[i] = destInfo.get(i).port;
        }
        if (sinks.length > 0) {
            DAG.StreamMeta streamMeta = dag.addStream("stream" + streamIndex++, streamEntry.getValue().getLeft().port, sinks);
            if (pipelineOptions.isParDoFusionEnabled()) {
                optimizeStreams(streamMeta, streamEntry);
            }
            for (InputPort port : sinks) {
                PCollection pc = streamEntry.getKey();
                Coder coder = pc.getCoder();
                if (pc.getWindowingStrategy() != null) {
                    coder = FullWindowedValueCoder.of(
                        pc.getCoder(), pc.getWindowingStrategy().getWindowFn().windowCoder());
                }
                Coder<Object> wrapperCoder = ApexStreamTuple.ApexStreamTupleCoder.of(coder);
                CoderAdapterStreamCodec streamCodec = new CoderAdapterStreamCodec(wrapperCoder);
                dag.setInputPortAttribute(port, PortContext.STREAM_CODEC, streamCodec);
            }
        }
    }
}
Also used: Operator (com.datatorrent.api.Operator), Coder (org.apache.beam.sdk.coders.Coder), FullWindowedValueCoder (org.apache.beam.sdk.util.WindowedValue.FullWindowedValueCoder), InputPort (com.datatorrent.api.Operator.InputPort), DAG (com.datatorrent.api.DAG), CoderAdapterStreamCodec (org.apache.beam.runners.apex.translation.utils.CoderAdapterStreamCodec), PCollection (org.apache.beam.sdk.values.PCollection), HashMap (java.util.HashMap), Map (java.util.Map), Pair (org.apache.commons.lang3.tuple.Pair), ImmutablePair (org.apache.commons.lang3.tuple.ImmutablePair)
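
The coder setup in the loop above is the key step: before a stream is registered, the element coder is wrapped so that each encoded element also carries its window, timestamp, and pane information. A standalone sketch of that wrapping, assuming a plain string coder and the global window:

import org.apache.beam.sdk.coders.Coder;
import org.apache.beam.sdk.coders.StringUtf8Coder;
import org.apache.beam.sdk.transforms.windowing.GlobalWindow;
import org.apache.beam.sdk.util.WindowedValue.FullWindowedValueCoder;

// Wrap a plain value coder so each element crosses the stream together
// with its window, timestamp, and pane info, mirroring populateDAG above.
Coder<String> valueCoder = StringUtf8Coder.of();
FullWindowedValueCoder<String> windowedCoder =
    FullWindowedValueCoder.of(valueCoder, GlobalWindow.Coder.INSTANCE);

The Apex runner then layers ApexStreamTupleCoder on top of this windowed coder and registers the result as the port's STREAM_CODEC attribute.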

Example 8 with PCollection

Use of org.apache.beam.sdk.values.PCollection in project beam by apache.

The class FlattenPCollectionTranslatorTest, method test.

@Test
public void test() throws Exception {
    ApexPipelineOptions options = PipelineOptionsFactory.as(ApexPipelineOptions.class);
    options.setApplicationName("FlattenPCollection");
    options.setRunner(ApexRunner.class);
    Pipeline p = Pipeline.create(options);
    String[][] collections = { { "1" }, { "2" }, { "3" }, { "4" }, { "5" } };
    Set<String> expected = Sets.newHashSet();
    List<PCollection<String>> pcList = new ArrayList<>();
    for (String[] collection : collections) {
        pcList.add(p.apply(Create.of(ImmutableList.copyOf(collection)).withCoder(StringUtf8Coder.of())));
        expected.addAll(Arrays.asList(collection));
    }
    PCollection<String> actual = PCollectionList.of(pcList).apply(Flatten.<String>pCollections());
    actual.apply(ParDo.of(new EmbeddedCollector()));
    ApexRunnerResult result = (ApexRunnerResult) p.run();
    // TODO: verify translation
    result.getApexDAG();
    long timeout = System.currentTimeMillis() + 30000;
    while (System.currentTimeMillis() < timeout && EmbeddedCollector.RESULTS.size() < expected.size()) {
        LOG.info("Waiting for expected results.");
        Thread.sleep(500);
    }
    Assert.assertEquals("number results", expected.size(), EmbeddedCollector.RESULTS.size());
    Assert.assertEquals(expected, Sets.newHashSet(EmbeddedCollector.RESULTS));
}
Also used: ArrayList (java.util.ArrayList), ApexRunnerResult (org.apache.beam.runners.apex.ApexRunnerResult), Pipeline (org.apache.beam.sdk.Pipeline), PCollection (org.apache.beam.sdk.values.PCollection), ApexPipelineOptions (org.apache.beam.runners.apex.ApexPipelineOptions), Test (org.junit.Test)
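
The polling loop above is needed because the Apex runner executes asynchronously. On the default direct runner the same Flatten pattern can be verified synchronously with PAssert; a minimal sketch, with illustrative step names and data:

import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.testing.PAssert;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.transforms.Flatten;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.PCollectionList;

Pipeline p = Pipeline.create(PipelineOptionsFactory.create());
PCollection<String> first = p.apply("first", Create.of("1", "2"));
PCollection<String> second = p.apply("second", Create.of("3"));

// Flatten merges the PCollections; PAssert checks the merged contents.
PCollection<String> flattened =
    PCollectionList.of(first).and(second).apply(Flatten.<String>pCollections());

PAssert.that(flattened).containsInAnyOrder("1", "2", "3");
p.run().waitUntilFinish();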

Example 9 with PCollection

Use of org.apache.beam.sdk.values.PCollection in project java-docs-samples by GoogleCloudPlatform.

The class SpannerReadAll, method main.

public static void main(String[] args) {
    Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
    Pipeline p = Pipeline.create(options);
    SpannerConfig spannerConfig = SpannerConfig.create()
        .withInstanceId(options.getInstanceId())
        .withDatabaseId(options.getDatabaseId());
    // [START spanner_dataflow_readall]
    PCollection<Struct> allRecords = p.apply(SpannerIO.read()
        .withSpannerConfig(spannerConfig)
        .withQuery("SELECT t.table_name FROM information_schema.tables AS t WHERE t"
            + ".table_catalog = '' AND t.table_schema = ''"))
        .apply(MapElements.into(TypeDescriptor.of(ReadOperation.class))
            .via((SerializableFunction<Struct, ReadOperation>) input -> {
                String tableName = input.getString(0);
                return ReadOperation.create().withQuery("SELECT * FROM " + tableName);
            }))
        .apply(SpannerIO.readAll().withSpannerConfig(spannerConfig));
    // [END spanner_dataflow_readall]
    PCollection<Long> dbEstimatedSize = allRecords
        .apply(EstimateSize.create())
        .apply(Sum.longsGlobally());
    dbEstimatedSize.apply(ToString.elements())
        .apply(TextIO.write().to(options.getOutput()).withoutSharding());
    p.run().waitUntilFinish();
}
Also used: SpannerConfig (org.apache.beam.sdk.io.gcp.spanner.SpannerConfig), MapElements (org.apache.beam.sdk.transforms.MapElements), ToString (org.apache.beam.sdk.transforms.ToString), TypeDescriptor (org.apache.beam.sdk.values.TypeDescriptor), Sum (org.apache.beam.sdk.transforms.Sum), SerializableFunction (org.apache.beam.sdk.transforms.SerializableFunction), PipelineOptionsFactory (org.apache.beam.sdk.options.PipelineOptionsFactory), PCollection (org.apache.beam.sdk.values.PCollection), SpannerIO (org.apache.beam.sdk.io.gcp.spanner.SpannerIO), Description (org.apache.beam.sdk.options.Description), ReadOperation (org.apache.beam.sdk.io.gcp.spanner.ReadOperation), Struct (com.google.cloud.spanner.Struct), Validation (org.apache.beam.sdk.options.Validation), Pipeline (org.apache.beam.sdk.Pipeline), PipelineOptions (org.apache.beam.sdk.options.PipelineOptions), TextIO (org.apache.beam.sdk.io.TextIO)
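
The MapElements.into(...).via(...) step is what turns each table name into a ReadOperation; the explicit SerializableFunction cast helps Java's type inference pick the right via(...) overload for the lambda. The same pattern on plain strings, with illustrative table names standing in for the information_schema results:

import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.transforms.MapElements;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.TypeDescriptors;

Pipeline p = Pipeline.create(PipelineOptionsFactory.create());
// Hypothetical table names standing in for the information_schema query results.
PCollection<String> tableNames = p.apply(Create.of("Singers", "Albums"));

// into(...) supplies the output type; via(...) supplies the per-element logic.
PCollection<String> queries = tableNames.apply(
    MapElements.into(TypeDescriptors.strings())
        .via((String t) -> "SELECT * FROM " + t));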

Example 10 with PCollection

Use of org.apache.beam.sdk.values.PCollection in project components by Talend.

The class SimpleFileIORoundTripRuntimeTest, method runRoundTripPipelines.

/**
 * Tests a round-trip on the data when writing to the data source using the given output properties, then
 * subsequently reading using the given input properties. This is the equivalent of two pipeline jobs.
 *
 * @param beam The Beam direct-runner test resource used to create the pipelines.
 * @param initialData The initial data set to write, then read.
 * @param outputProps The properties used to create the output runtime.
 * @param inputProps The properties used to create the input runtime.
 * @return The data returned from the round-trip.
 */
protected static List<IndexedRecord> runRoundTripPipelines(BeamDirectTestResource beam,
        List<IndexedRecord> initialData, SimpleFileIOOutputProperties outputProps,
        SimpleFileIOInputProperties inputProps) {
    // Create the runtimes.
    SimpleFileIOOutputRuntime outputRuntime = new SimpleFileIOOutputRuntime();
    outputRuntime.initialize(null, outputProps);
    SimpleFileIOInputRuntime inputRuntime = new SimpleFileIOInputRuntime();
    inputRuntime.initialize(null, inputProps);
    // Use the runtime in a direct pipeline to test.
    PipelineOptions options = PipelineOptionsFactory.create();
    options.setRunner(DirectRunner.class);
    // Create a pipeline to write the records to the output.
    {
        final Pipeline p = beam.createPipeline();
        PCollection<IndexedRecord> input = p.apply(Create.<IndexedRecord>of(initialData));
        input.apply(outputRuntime);
        p.run().waitUntilFinish();
    }
    // Read the records that were written.
    try (DirectCollector<IndexedRecord> collector = DirectCollector.of()) {
        final Pipeline p = beam.createPipeline();
        PCollection<IndexedRecord> input = p.apply(inputRuntime);
        input.apply(collector);
        p.run().waitUntilFinish();
        // Return the list of records from the round trip.
        return collector.getRecords();
    }
}
Also used: PCollection (org.apache.beam.sdk.values.PCollection), ConvertToIndexedRecord (org.talend.components.adapter.beam.transform.ConvertToIndexedRecord), IndexedRecord (org.apache.avro.generic.IndexedRecord), PipelineOptions (org.apache.beam.sdk.options.PipelineOptions), Pipeline (org.apache.beam.sdk.Pipeline)
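
The write-then-read, two-pipeline shape of this test is independent of the Talend runtimes. A minimal sketch of the same round trip using TextIO on the default direct runner; the output path is hypothetical:

import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.io.TextIO;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.values.PCollection;

// Pipeline 1: write the records and wait for the job to finish.
Pipeline writeP = Pipeline.create(PipelineOptionsFactory.create());
writeP.apply(Create.of("a", "b", "c"))
      .apply(TextIO.write().to("/tmp/roundtrip").withoutSharding());
writeP.run().waitUntilFinish();

// Pipeline 2: read back what the first pipeline wrote.
Pipeline readP = Pipeline.create(PipelineOptionsFactory.create());
PCollection<String> readBack = readP.apply(TextIO.read().from("/tmp/roundtrip"));
readP.run().waitUntilFinish();

As in the test above, the second pipeline only sees data because the first one ran to completion before it started.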

Aggregations

PCollection (org.apache.beam.sdk.values.PCollection): 198
Test (org.junit.Test): 133
TestPipeline (org.apache.beam.sdk.testing.TestPipeline): 61
KV (org.apache.beam.sdk.values.KV): 61
Map (java.util.Map): 59
List (java.util.List): 58
Rule (org.junit.Rule): 57
RunWith (org.junit.runner.RunWith): 54
PAssert (org.apache.beam.sdk.testing.PAssert): 52
Instant (org.joda.time.Instant): 46
Duration (org.joda.time.Duration): 45
JUnit4 (org.junit.runners.JUnit4): 45
ParDo (org.apache.beam.sdk.transforms.ParDo): 44
TupleTag (org.apache.beam.sdk.values.TupleTag): 42
Pipeline (org.apache.beam.sdk.Pipeline): 41
Create (org.apache.beam.sdk.transforms.Create): 41
ArrayList (java.util.ArrayList): 40
Serializable (java.io.Serializable): 39
PTransform (org.apache.beam.sdk.transforms.PTransform): 37
Row (org.apache.beam.sdk.values.Row): 37