Example 11 with PCollection

Use of org.apache.beam.sdk.values.PCollection in project components by Talend.

Class PubSubOutputRuntimeTestIT, method outputAvro.

private void outputAvro(Pipeline pipeline) throws IOException {
    String testID = "avroBasicTest" + new Random().nextInt();
    List<Person> expectedPersons = Person.genRandomList(testID, maxRecords);
    List<String> expectedMessages = new ArrayList<>();
    List<IndexedRecord> sendMessages = new ArrayList<>();
    for (Person person : expectedPersons) {
        expectedMessages.add(person.toAvroRecord().toString());
        sendMessages.add(person.toAvroRecord());
    }
    PubSubOutputRuntime outputRuntime = new PubSubOutputRuntime();
    outputRuntime.initialize(runtimeContainer, createOutput(createDatasetFromAvro(createDatastore(), topicName, Person.schema.toString())));
    PCollection<IndexedRecord> output = pipeline.apply(Create.of(sendMessages));
    output.apply(outputRuntime);
    pipeline.run().waitUntilFinish();
    List<String> actual = new ArrayList<>();
    // Keep pulling until every published message has been received, acknowledging each batch.
    while (true) {
        List<ReceivedMessage> messages = client.pull(subscriptionName, maxRecords);
        List<String> ackIds = new ArrayList<>();
        for (ReceivedMessage message : messages) {
            actual.add(Person.desFromAvroBytes(message.getMessage().decodeData()).toAvroRecord().toString());
            ackIds.add(message.getAckId());
        }
        client.ack(subscriptionName, ackIds);
        if (actual.size() >= maxRecords) {
            break;
        }
    }
    assertThat(actual, containsInAnyOrder(expectedMessages.toArray()));
}
Also used : ConvertToIndexedRecord(org.talend.components.adapter.beam.transform.ConvertToIndexedRecord) IndexedRecord(org.apache.avro.generic.IndexedRecord) ArrayList(java.util.ArrayList) ReceivedMessage(com.google.api.services.pubsub.model.ReceivedMessage) PCollection(org.apache.beam.sdk.values.PCollection) Random(java.util.Random)
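
The test materializes in-memory records with Create.of, applies the output runtime as a sink, then pulls and acknowledges messages until everything has arrived. Below is a minimal, self-contained sketch of the Create.of half of that pattern, runnable on the direct runner without Pub/Sub; the class name CreateOfSketch and the string elements are illustrative assumptions, not part of the Talend code.

import java.util.Arrays;

import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.testing.PAssert;
import org.apache.beam.sdk.transforms.Count;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.values.PCollection;

public class CreateOfSketch {
    public static void main(String[] args) {
        Pipeline p = Pipeline.create(PipelineOptionsFactory.create());
        // Create.of turns an in-memory collection into a PCollection,
        // just as the test does with its list of IndexedRecords.
        PCollection<String> messages = p.apply(Create.of(Arrays.asList("a", "b", "c")));
        // PAssert verifies the pipeline's output without any external system.
        PAssert.that(messages.apply(Count.<String>globally())).containsInAnyOrder(3L);
        p.run().waitUntilFinish();
    }
}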

Example 12 with PCollection

Use of org.apache.beam.sdk.values.PCollection in project components by Talend.

Class PubSubOutputRuntimeTestIT, method outputCsv.

private void outputCsv(Pipeline pipeline) throws IOException {
    String testID = "csvBasicTest" + new Random().nextInt();
    final String fieldDelimited = ";";
    List<Person> expectedPersons = Person.genRandomList(testID, maxRecords);
    List<String> expectedMessages = new ArrayList<>();
    List<String[]> sendMessages = new ArrayList<>();
    for (Person person : expectedPersons) {
        expectedMessages.add(person.toCSV(fieldDelimited));
        sendMessages.add(person.toCSV(fieldDelimited).split(fieldDelimited));
    }
    PubSubOutputRuntime outputRuntime = new PubSubOutputRuntime();
    outputRuntime.initialize(runtimeContainer, createOutput(createDatasetFromCSV(createDatastore(), topicName, fieldDelimited)));
    PCollection<IndexedRecord> records = (PCollection<IndexedRecord>) pipeline
            .apply(Create.of(sendMessages))
            // The raw PTransform cast bridges the String[] input to the IndexedRecord output.
            .apply((PTransform) ConvertToIndexedRecord.of());
    records.setCoder(LazyAvroCoder.of()).apply(outputRuntime);
    pipeline.run().waitUntilFinish();
    List<String> actual = new ArrayList<>();
    // Keep pulling until every published message has been received, acknowledging each batch.
    while (true) {
        List<ReceivedMessage> messages = client.pull(subscriptionName, maxRecords);
        List<String> ackIds = new ArrayList<>();
        for (ReceivedMessage message : messages) {
            actual.add(new String(message.getMessage().decodeData()));
            ackIds.add(message.getAckId());
        }
        client.ack(subscriptionName, ackIds);
        if (actual.size() >= maxRecords) {
            break;
        }
    }
    assertThat(actual, containsInAnyOrder(expectedMessages.toArray()));
}
Also used : ConvertToIndexedRecord(org.talend.components.adapter.beam.transform.ConvertToIndexedRecord) IndexedRecord(org.apache.avro.generic.IndexedRecord) ArrayList(java.util.ArrayList) ReceivedMessage(com.google.api.services.pubsub.model.ReceivedMessage) PCollection(org.apache.beam.sdk.values.PCollection) Random(java.util.Random) PTransform(org.apache.beam.sdk.transforms.PTransform)
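
The raw PTransform cast and the explicit setCoder(LazyAvroCoder.of()) both stem from the same Beam rule: when the coder for an element type (here IndexedRecord) cannot be inferred, it must be supplied explicitly. A hedged sketch of that idiom using only Beam's built-in coders; ExplicitCoderSketch and the sample key/value pairs are invented for illustration.

import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.coders.KvCoder;
import org.apache.beam.sdk.coders.StringUtf8Coder;
import org.apache.beam.sdk.coders.VarLongCoder;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.testing.PAssert;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.values.KV;
import org.apache.beam.sdk.values.PCollection;

public class ExplicitCoderSketch {
    public static void main(String[] args) {
        Pipeline p = Pipeline.create(PipelineOptionsFactory.create());
        // Pin the element encoding explicitly; KvCoder plays the role
        // that LazyAvroCoder plays for IndexedRecord in the test above.
        PCollection<KV<String, Long>> rows = p.apply(
                Create.of(KV.of("id1;Alice", 1L), KV.of("id2;Bob", 1L))
                        .withCoder(KvCoder.of(StringUtf8Coder.of(), VarLongCoder.of())));
        PAssert.that(rows).containsInAnyOrder(KV.of("id1;Alice", 1L), KV.of("id2;Bob", 1L));
        p.run().waitUntilFinish();
    }
}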

Example 13 with PCollection

Use of org.apache.beam.sdk.values.PCollection in project components by Talend.

Class WindowRuntimeTest, method testFixedWindow.

@Test
public void testFixedWindow() {
    PipelineOptions options = PipelineOptionsFactory.create();
    options.setRunner(DirectRunner.class);
    final Pipeline p = Pipeline.create(options);
    // Create a PCollection<IndexedRecord> whose elements carry distinct timestamps.
    List<TimestampedValue<IndexedRecord>> data = Arrays.asList(
            TimestampedValue.of(irA, new Instant(1L)),
            TimestampedValue.of(irB, new Instant(2L)),
            TimestampedValue.of(irC, new Instant(3L)));
    PCollection<IndexedRecord> input = p.apply(Create.timestamped(data).withCoder(LazyAvroCoder.of()));
    WindowProperties windowProperties = new WindowProperties("window");
    // Fixed window of length 2 ms: no slide, no session.
    windowProperties.windowLength.setValue(2);
    windowProperties.windowSlideLength.setValue(-1);
    windowProperties.windowSession.setValue(false);
    WindowRuntime windowRun = new WindowRuntime();
    windowRun.initialize(null, windowProperties);
    PCollection<IndexedRecord> test = windowRun.expand(input);
    PCollection<KV<IndexedRecord, Long>> windowedCounts = test.apply(Count.<IndexedRecord>perElement());
    // Fixed windows of duration 2 ms: each distinct element occurs once per window.
    PAssert.that(windowedCounts).containsInAnyOrder(KV.of(irA, 1L), KV.of(irB, 1L), KV.of(irC, 1L));
    p.run();
}
Also used : PCollection(org.apache.beam.sdk.values.PCollection) WindowProperties(org.talend.components.processing.definition.window.WindowProperties) WindowRuntime(org.talend.components.processing.runtime.window.WindowRuntime) TimestampedValue(org.apache.beam.sdk.values.TimestampedValue) IndexedRecord(org.apache.avro.generic.IndexedRecord) PipelineOptions(org.apache.beam.sdk.options.PipelineOptions) Instant(org.joda.time.Instant) KV(org.apache.beam.sdk.values.KV) Pipeline(org.apache.beam.sdk.Pipeline) Test(org.junit.Test)
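
With windowLength = 2 and windowSlideLength = -1, WindowRuntime appears to reduce to Beam's plain fixed windows. A sketch of the equivalent pipeline written directly against Window.into(FixedWindows.of(...)), with strings standing in for the test's IndexedRecords; FixedWindowSketch is an illustrative name.

import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.testing.PAssert;
import org.apache.beam.sdk.transforms.Count;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.transforms.windowing.FixedWindows;
import org.apache.beam.sdk.transforms.windowing.Window;
import org.apache.beam.sdk.values.KV;
import org.apache.beam.sdk.values.PCollection;
import org.apache.beam.sdk.values.TimestampedValue;
import org.joda.time.Duration;
import org.joda.time.Instant;

public class FixedWindowSketch {
    public static void main(String[] args) {
        Pipeline p = Pipeline.create(PipelineOptionsFactory.create());
        // Elements timestamped at 1 ms, 2 ms and 3 ms, as in the test above.
        PCollection<String> input = p.apply(Create.timestamped(
                TimestampedValue.of("a", new Instant(1L)),
                TimestampedValue.of("b", new Instant(2L)),
                TimestampedValue.of("c", new Instant(3L))));
        PCollection<KV<String, Long>> counts = input
                .apply(Window.<String>into(FixedWindows.of(Duration.millis(2))))
                .apply(Count.<String>perElement());
        // "a" lands in window [0, 2); "b" and "c" land in [2, 4) but are
        // distinct elements, so every per-element count is 1.
        PAssert.that(counts).containsInAnyOrder(KV.of("a", 1L), KV.of("b", 1L), KV.of("c", 1L));
        p.run().waitUntilFinish();
    }
}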

Example 14 with PCollection

Use of org.apache.beam.sdk.values.PCollection in project components by Talend.

Class NormalizeRuntime, method expand.

@Override
public PCollection<IndexedRecord> expand(PCollection<IndexedRecord> inputPCollection) {
    NormalizeDoFn doFn = new NormalizeDoFn().withProperties(properties);
    return inputPCollection.apply(ParDo.of(doFn));
}
Also used : PCollection(org.apache.beam.sdk.values.PCollection)
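
This is the canonical composite-transform shape in Beam: expand wraps a DoFn in ParDo and returns the resulting PCollection. A minimal sketch of the same shape, with a hypothetical TrimDoFn standing in for NormalizeDoFn.

import org.apache.beam.sdk.transforms.DoFn;
import org.apache.beam.sdk.transforms.PTransform;
import org.apache.beam.sdk.transforms.ParDo;
import org.apache.beam.sdk.values.PCollection;

public class TrimTransform extends PTransform<PCollection<String>, PCollection<String>> {

    // The per-element logic lives in a DoFn, just as NormalizeRuntime
    // delegates to NormalizeDoFn.
    private static class TrimDoFn extends DoFn<String, String> {
        @ProcessElement
        public void processElement(ProcessContext c) {
            c.output(c.element().trim());
        }
    }

    @Override
    public PCollection<String> expand(PCollection<String> input) {
        return input.apply(ParDo.of(new TrimDoFn()));
    }
}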

Example 15 with PCollection

Use of org.apache.beam.sdk.values.PCollection in project components by Talend.

Class FilterRowRuntime, method build.

@Override
public void build(BeamJobContext ctx) {
    String mainLink = ctx.getLinkNameByPortName("input_" + properties.MAIN_CONNECTOR.getName());
    if (!StringUtils.isEmpty(mainLink)) {
        PCollection<IndexedRecord> mainPCollection = ctx.getPCollectionByLinkName(mainLink);
        if (mainPCollection != null) {
            String flowLink = ctx.getLinkNameByPortName("output_" + properties.FLOW_CONNECTOR.getName());
            String rejectLink = ctx.getLinkNameByPortName("output_" + properties.REJECT_CONNECTOR.getName());
            boolean hasFlow = !StringUtils.isEmpty(flowLink);
            boolean hasReject = !StringUtils.isEmpty(rejectLink);
            if (hasFlow && hasReject) {
                // If both of the outputs are present, the DoFn must be used.
                PCollectionTuple outputTuples = mainPCollection.apply(ctx.getPTransformName(),
                        ParDo.of(new FilterRowDoFn(properties))
                                .withOutputTags(flowOutput, TupleTagList.of(rejectOutput)));
                ctx.putPCollectionByLinkName(flowLink, outputTuples.get(flowOutput));
                ctx.putPCollectionByLinkName(rejectLink, outputTuples.get(rejectOutput));
            } else if (hasFlow || hasReject) {
                // If only one of the outputs is present, the predicate can be used for efficiency.
                FilterRowPredicate predicate = hasFlow
                        ? new FilterRowPredicate(properties)
                        : new FilterRowPredicate.Negate(properties);
                PCollection<IndexedRecord> output = mainPCollection.apply(ctx.getPTransformName(), Filter.by(predicate));
                ctx.putPCollectionByLinkName(hasFlow ? flowLink : rejectLink, output);
            } else {
                // If neither output is specified, do nothing; this component could have been cut from the pipeline.
            }
        }
    }
}
Also used : PCollection(org.apache.beam.sdk.values.PCollection) IndexedRecord(org.apache.avro.generic.IndexedRecord) PCollectionTuple(org.apache.beam.sdk.values.PCollectionTuple)
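
The two-output branch uses Beam's multi-output ParDo: withOutputTags declares one main (flow) tag plus a TupleTagList of additional (reject) tags, and the resulting PCollectionTuple is indexed by tag. A runnable sketch of that mechanism with invented names (FlowRejectSketch, FLOW, REJECT) and integers in place of IndexedRecords.

import org.apache.beam.sdk.Pipeline;
import org.apache.beam.sdk.options.PipelineOptionsFactory;
import org.apache.beam.sdk.testing.PAssert;
import org.apache.beam.sdk.transforms.Create;
import org.apache.beam.sdk.transforms.DoFn;
import org.apache.beam.sdk.transforms.ParDo;
import org.apache.beam.sdk.values.PCollectionTuple;
import org.apache.beam.sdk.values.TupleTag;
import org.apache.beam.sdk.values.TupleTagList;

public class FlowRejectSketch {

    // The {} anonymous subclasses preserve the tags' type parameter for coder inference.
    static final TupleTag<Integer> FLOW = new TupleTag<Integer>() {};
    static final TupleTag<Integer> REJECT = new TupleTag<Integer>() {};

    public static void main(String[] args) {
        Pipeline p = Pipeline.create(PipelineOptionsFactory.create());
        // Route evens to the flow output and odds to reject, mirroring
        // FilterRowDoFn's two-output shape.
        PCollectionTuple outputs = p
                .apply(Create.of(1, 2, 3, 4))
                .apply(ParDo.of(new DoFn<Integer, Integer>() {
                    @ProcessElement
                    public void processElement(ProcessContext c) {
                        if (c.element() % 2 == 0) {
                            c.output(c.element());         // main (flow) output
                        } else {
                            c.output(REJECT, c.element()); // additional (reject) output
                        }
                    }
                }).withOutputTags(FLOW, TupleTagList.of(REJECT)));
        PAssert.that(outputs.get(FLOW)).containsInAnyOrder(2, 4);
        PAssert.that(outputs.get(REJECT)).containsInAnyOrder(1, 3);
        p.run().waitUntilFinish();
    }
}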

Aggregations

PCollection (org.apache.beam.sdk.values.PCollection): 198 usages
Test (org.junit.Test): 133 usages
TestPipeline (org.apache.beam.sdk.testing.TestPipeline): 61 usages
KV (org.apache.beam.sdk.values.KV): 61 usages
Map (java.util.Map): 59 usages
List (java.util.List): 58 usages
Rule (org.junit.Rule): 57 usages
RunWith (org.junit.runner.RunWith): 54 usages
PAssert (org.apache.beam.sdk.testing.PAssert): 52 usages
Instant (org.joda.time.Instant): 46 usages
Duration (org.joda.time.Duration): 45 usages
JUnit4 (org.junit.runners.JUnit4): 45 usages
ParDo (org.apache.beam.sdk.transforms.ParDo): 44 usages
TupleTag (org.apache.beam.sdk.values.TupleTag): 42 usages
Pipeline (org.apache.beam.sdk.Pipeline): 41 usages
Create (org.apache.beam.sdk.transforms.Create): 41 usages
ArrayList (java.util.ArrayList): 40 usages
Serializable (java.io.Serializable): 39 usages
PTransform (org.apache.beam.sdk.transforms.PTransform): 37 usages
Row (org.apache.beam.sdk.values.Row): 37 usages