use of org.apache.beam.sdk.values.PCollection in project components by Talend.
the class PubSubOutputRuntimeTestIT method outputAvro.
/**
 * Sends randomly generated persons as Avro records through a {@code PubSubOutputRuntime} and verifies that
 * the messages pulled back from the subscription match what was sent, ignoring order.
 *
 * @param pipeline the pipeline the output runtime is applied to; it is run and awaited by this method
 * @throws IOException if one of the calls made while pulling or acknowledging raises it
 */
private void outputAvro(Pipeline pipeline) throws IOException {
    String testID = "avroBasicTest" + new Random().nextInt();
    List<Person> persons = Person.genRandomList(testID, maxRecords);

    // Keep the string form for the final comparison and the record form for publishing.
    List<String> expected = new ArrayList<>();
    List<IndexedRecord> toSend = new ArrayList<>();
    for (Person p : persons) {
        expected.add(p.toAvroRecord().toString());
        toSend.add(p.toAvroRecord());
    }

    PubSubOutputRuntime runtime = new PubSubOutputRuntime();
    runtime.initialize(
            runtimeContainer,
            createOutput(createDatasetFromAvro(createDatastore(), topicName, Person.schema.toString())));

    ((PCollection<IndexedRecord>) pipeline.apply(Create.of(toSend))).apply(runtime);
    pipeline.run().waitUntilFinish();

    // Pull (and acknowledge) batches until at least maxRecords messages have been collected.
    List<String> received = new ArrayList<>();
    do {
        List<ReceivedMessage> batch = client.pull(subscriptionName, maxRecords);
        List<String> toAck = new ArrayList<>();
        for (ReceivedMessage msg : batch) {
            received.add(
                    Person.desFromAvroBytes(msg.getMessage().decodeData())
                            .toAvroRecord()
                            .toString());
            toAck.add(msg.getAckId());
        }
        client.ack(subscriptionName, toAck);
    } while (received.size() < maxRecords);

    assertThat(received, containsInAnyOrder(expected.toArray()));
}
use of org.apache.beam.sdk.values.PCollection in project components by Talend.
the class PubSubOutputRuntimeTestIT method outputCsv.
/**
 * Sends randomly generated persons as delimited fields through a {@code PubSubOutputRuntime} and verifies
 * that the text messages pulled back from the subscription match the expected CSV lines, ignoring order.
 *
 * @param pipeline the pipeline the output runtime is applied to; it is run and awaited by this method
 * @throws IOException if one of the calls made while pulling or acknowledging raises it
 */
private void outputCsv(Pipeline pipeline) throws IOException {
    String testID = "csvBasicTest" + new Random().nextInt();
    final String fieldDelimited = ";";
    List<Person> expectedPersons = Person.genRandomList(testID, maxRecords);
    List<String> expectedMessages = new ArrayList<>();
    List<String[]> sendMessages = new ArrayList<>();
    for (Person person : expectedPersons) {
        // Render the line once; the same text feeds both the expectation and the fields that are sent.
        String csvLine = person.toCSV(fieldDelimited);
        expectedMessages.add(csvLine);
        sendMessages.add(csvLine.split(fieldDelimited));
    }
    PubSubOutputRuntime outputRuntime = new PubSubOutputRuntime();
    outputRuntime.initialize(runtimeContainer,
            createOutput(createDatasetFromCSV(createDatastore(), topicName, fieldDelimited)));
    PCollection<IndexedRecord> records = (PCollection<IndexedRecord>) pipeline
            .apply(Create.of(sendMessages))
            .apply((PTransform) ConvertToIndexedRecord.of());
    records.setCoder(LazyAvroCoder.of()).apply(outputRuntime);
    pipeline.run().waitUntilFinish();

    // Pull (and acknowledge) batches until at least maxRecords messages have been collected.
    List<String> actual = new ArrayList<>();
    do {
        List<ReceivedMessage> messages = client.pull(subscriptionName, maxRecords);
        List<String> ackIds = new ArrayList<>();
        for (ReceivedMessage message : messages) {
            // Decode with an explicit charset rather than the platform default, so the comparison does not
            // depend on the JVM's file.encoding.
            // NOTE(review): UTF-8 is assumed to match what the output runtime writes - confirm.
            actual.add(new String(message.getMessage().decodeData(), java.nio.charset.StandardCharsets.UTF_8));
            ackIds.add(message.getAckId());
        }
        client.ack(subscriptionName, ackIds);
    } while (actual.size() < maxRecords);

    assertThat(actual, containsInAnyOrder(expectedMessages.toArray()));
}
use of org.apache.beam.sdk.values.PCollection in project components by Talend.
the class WindowRuntimeTest method testFixedWindow.
/**
 * Runs three timestamped records through a {@code WindowRuntime} configured with window length 2,
 * slide length -1 and session disabled, then asserts that a per-element count yields one entry per record.
 */
@Test
public void testFixedWindow() {
    PipelineOptions pipelineOptions = PipelineOptionsFactory.create();
    pipelineOptions.setRunner(DirectRunner.class);
    final Pipeline p = Pipeline.create(pipelineOptions);

    // Build the input collection from three records carrying timestamps 1, 2 and 3.
    List<TimestampedValue<IndexedRecord>> timestamped = Arrays.asList(
            TimestampedValue.of(irA, new Instant(1L)),
            TimestampedValue.of(irB, new Instant(2L)),
            TimestampedValue.of(irC, new Instant(3L)));
    PCollection<IndexedRecord> source =
            (PCollection<IndexedRecord>) p.apply(Create.timestamped(timestamped).withCoder(LazyAvroCoder.of()));

    // The configuration is written both through the property fields and through the by-name setter.
    WindowProperties props = new WindowProperties("window");
    props.windowLength.setValue(2);
    props.windowSlideLength.setValue(-1);
    props.windowSession.setValue(false);
    props.setValue("windowLength", 2);
    props.setValue("windowSlideLength", -1);
    props.setValue("windowSession", false);

    WindowRuntime runtime = new WindowRuntime();
    runtime.initialize(null, props);

    PCollection<IndexedRecord> windowed = runtime.expand(source);
    PCollection<KV<IndexedRecord, Long>> windowedCounts = windowed.apply(Count.<IndexedRecord>perElement());

    // Fixed duration: 2
    PAssert.that(windowedCounts).containsInAnyOrder(
            KV.of(irA, 1L),
            KV.of(irB, 1L),
            KV.of(irC, 1L));
    p.run();
}
use of org.apache.beam.sdk.values.PCollection in project components by Talend.
the class NormalizeRuntime method expand.
/**
 * Applies a {@code NormalizeDoFn}, configured with this runtime's properties, to the input collection.
 *
 * @param inputPCollection the records to run through the {@code ParDo}
 * @return the collection produced by the {@code ParDo}, returned as a raw {@code PCollection}
 */
@Override
public PCollection expand(PCollection<IndexedRecord> inputPCollection) {
    return inputPCollection.apply(ParDo.of(new NormalizeDoFn().withProperties(properties)));
}
use of org.apache.beam.sdk.values.PCollection in project components by Talend.
the class FilterRowRuntime method build.
/**
 * Wires the filter into the job: looks up the main input collection by link name and registers the
 * resulting collection(s) under the flow and/or reject output links.
 *
 * <p>Nothing is done when the main link is empty, when no collection is registered for it, or when
 * neither output link is present.
 *
 * @param ctx the job context used to resolve link names and to read and register collections
 */
@Override
public void build(BeamJobContext ctx) {
    String mainLink = ctx.getLinkNameByPortName("input_" + properties.MAIN_CONNECTOR.getName());
    if (StringUtils.isEmpty(mainLink)) {
        return;
    }

    PCollection<IndexedRecord> input = ctx.getPCollectionByLinkName(mainLink);
    if (input == null) {
        return;
    }

    String flowLink = ctx.getLinkNameByPortName("output_" + properties.FLOW_CONNECTOR.getName());
    String rejectLink = ctx.getLinkNameByPortName("output_" + properties.REJECT_CONNECTOR.getName());
    boolean hasFlow = !StringUtils.isEmpty(flowLink);
    boolean hasReject = !StringUtils.isEmpty(rejectLink);

    if (!hasFlow && !hasReject) {
        // Neither output is connected: nothing to add. This component could have been cut from the pipeline.
        return;
    }

    if (hasFlow && hasReject) {
        // Both outputs are connected, so the multi-output DoFn is required.
        PCollectionTuple tagged = input.apply(
                ctx.getPTransformName(),
                ParDo.of(new FilterRowDoFn(properties)).withOutputTags(flowOutput, TupleTagList.of(rejectOutput)));
        ctx.putPCollectionByLinkName(flowLink, tagged.get(flowOutput));
        ctx.putPCollectionByLinkName(rejectLink, tagged.get(rejectOutput));
        return;
    }

    // Exactly one output is connected: a plain predicate filter is enough and more efficient.
    FilterRowPredicate predicate;
    String targetLink;
    if (hasFlow) {
        predicate = new FilterRowPredicate(properties);
        targetLink = flowLink;
    } else {
        predicate = new FilterRowPredicate.Negate(properties);
        targetLink = rejectLink;
    }
    PCollection<IndexedRecord> filtered = input.apply(ctx.getPTransformName(), Filter.by(predicate));
    ctx.putPCollectionByLinkName(targetLink, filtered);
}
Aggregations