Use of org.apache.beam.sdk.PipelineResult in project components by Talend.
Class KafkaAvroBeamRuntimeTestIT, method avroBasicTest.
/**
 * Read avro(Person) format and write avro(Person) format with schema.
 */
@Test
public void avroBasicTest() throws IOException {
    String testID = "avroBasicTest" + new Random().nextInt();
    expectedPersons = Person.genRandomList(testID, maxRecords);

    // ----------------- Send data to TOPIC_AVRO_IN start --------------------
    Properties props = new Properties();
    props.put("bootstrap.servers", BOOTSTRAP_HOST);
    props.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer");
    props.put("value.serializer", "org.apache.kafka.common.serialization.ByteArraySerializer");
    Producer<Void, byte[]> producer = new KafkaProducer<>(props);
    for (Person person : expectedPersons) {
        ProducerRecord<Void, byte[]> message = new ProducerRecord<>(TOPIC_AVRO_IN, person.serToAvroBytes());
        producer.send(message);
    }
    producer.close();
    // ----------------- Send data to TOPIC_AVRO_IN done --------------------

    KafkaInputProperties inputProperties = new KafkaInputProperties("input");
    inputProperties.init();
    inputProperties.setDatasetProperties(inputDatasetProperties);
    inputProperties.autoOffsetReset.setValue(KafkaInputProperties.OffsetType.EARLIEST);
    inputProperties.useMaxNumRecords.setValue(false);
    // inputProperties.maxNumRecords.setValue(maxRecords.longValue());
    inputProperties.useMaxReadTime.setValue(true);
    inputProperties.maxReadTime.setValue(5000L);

    KafkaOutputProperties outputProperties = new KafkaOutputProperties("output");
    outputProperties.init();
    outputProperties.setDatasetProperties(outputDatasetProperties);
    outputProperties.partitionType.setValue(KafkaOutputProperties.PartitionType.ROUND_ROBIN);
    outputProperties.useCompress.setValue(false);

    KafkaInputPTransformRuntime inputRuntime = new KafkaInputPTransformRuntime();
    inputRuntime.initialize(null, inputProperties);
    KafkaOutputPTransformRuntime outputRuntime = new KafkaOutputPTransformRuntime();
    outputRuntime.initialize(null, outputProperties);

    // ----------------- pipeline start --------------------
    pipeline.apply(inputRuntime)
            .apply(Filter.by(new KafkaCsvBeamRuntimeTestIT.FilterByGroup(testID)))
            .apply(outputRuntime);
    PipelineResult result = pipeline.run();
    // ----------------- pipeline done --------------------

    // ----------------- Read data from TOPIC_AVRO_OUT start --------------------
    props = new Properties();
    props.put("bootstrap.servers", BOOTSTRAP_HOST);
    props.put("group.id", "getResult");
    props.put("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
    props.put("value.deserializer", "org.apache.kafka.common.serialization.ByteArrayDeserializer");
    props.put("auto.offset.reset", "earliest");
    KafkaConsumer<String, byte[]> consumer = new KafkaConsumer<>(props);
    consumer.subscribe(Arrays.asList(TOPIC_AVRO_OUT));
    List<Person> results = new ArrayList<>();
    while (true) {
        ConsumerRecords<String, byte[]> records = consumer.poll(100);
        for (ConsumerRecord<String, byte[]> record : records) {
            Person person = Person.desFromAvroBytes(record.value());
            if (testID.equals(person.group)) {
                results.add(person);
            }
        }
        if (results.size() >= maxRecords) {
            break;
        }
    }
    // ----------------- Read data from TOPIC_AVRO_OUT done --------------------
    assertEquals(expectedPersons, results);
}
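Note that the test keeps the PipelineResult returned by pipeline.run() but never inspects it; the run terminates on its own because the Kafka read is bounded by maxReadTime. If the test were also to block on the result and verify the terminal state, a minimal sketch could look like this (fail is the usual JUnit assertion helper; whether run() already blocks depends on the runner in use):

    PipelineResult result = pipeline.run();
    // Wait for the bounded read (capped by maxReadTime above) to finish,
    // then check that the pipeline terminated in the DONE state.
    PipelineResult.State state = result.waitUntilFinish();
    if (state != PipelineResult.State.DONE) {
        fail("Pipeline ended in state " + state);
    }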
Use of org.apache.beam.sdk.PipelineResult in project components by Talend.
Class KafkaAvroBeamRuntimeTestIT, method avroBasicTest2.
/**
 * Read avro(Person) format and write avro(Person) format with schema.
 */
@Test
public void avroBasicTest2() throws IOException {
    String testID = "avroBasicTest2" + new Random().nextInt();
    expectedPersons = Person.genRandomList(testID, maxRecords);

    // ----------------- Send data to TOPIC_AVRO_IN start --------------------
    Properties props = new Properties();
    props.put("bootstrap.servers", BOOTSTRAP_HOST);
    props.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer");
    props.put("value.serializer", "org.apache.kafka.common.serialization.ByteArraySerializer");
    Producer<Void, byte[]> producer = new KafkaProducer<>(props);
    for (Person person : expectedPersons) {
        ProducerRecord<Void, byte[]> message = new ProducerRecord<>(TOPIC_AVRO_IN, person.serToAvroBytes());
        producer.send(message);
    }
    producer.close();
    // ----------------- Send data to TOPIC_AVRO_IN done --------------------

    KafkaInputProperties inputProperties = new KafkaInputProperties("input");
    inputProperties.init();
    inputProperties.setDatasetProperties(inputDatasetProperties);
    inputProperties.autoOffsetReset.setValue(KafkaInputProperties.OffsetType.EARLIEST);
    inputProperties.useMaxNumRecords.setValue(false);
    // inputProperties.maxNumRecords.setValue(maxRecords.longValue());
    inputProperties.useMaxReadTime.setValue(true);
    inputProperties.maxReadTime.setValue(5000L);

    KafkaOutputProperties outputProperties = new KafkaOutputProperties("output");
    outputProperties.init();
    outputProperties.setDatasetProperties(outputDatasetProperties);
    outputProperties.partitionType.setValue(KafkaOutputProperties.PartitionType.COLUMN);
    outputProperties.keyColumn.setValue("name");
    outputProperties.useCompress.setValue(false);

    KafkaInputPTransformRuntime inputRuntime = new KafkaInputPTransformRuntime();
    inputRuntime.initialize(null, inputProperties);
    KafkaOutputPTransformRuntime outputRuntime = new KafkaOutputPTransformRuntime();
    outputRuntime.initialize(null, outputProperties);

    // ----------------- pipeline start --------------------
    pipeline.apply(inputRuntime)
            .apply(Filter.by(new KafkaCsvBeamRuntimeTestIT.FilterByGroup(testID)))
            .apply(outputRuntime);
    PipelineResult result = pipeline.run();
    // ----------------- pipeline done --------------------

    // ----------------- Read data from TOPIC_AVRO_OUT start --------------------
    props = new Properties();
    props.put("bootstrap.servers", BOOTSTRAP_HOST);
    props.put("group.id", "getResult");
    props.put("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
    props.put("value.deserializer", "org.apache.kafka.common.serialization.ByteArrayDeserializer");
    props.put("auto.offset.reset", "earliest");
    KafkaConsumer<String, byte[]> consumer = new KafkaConsumer<>(props);
    consumer.subscribe(Arrays.asList(TOPIC_AVRO_OUT));
    List<Person> results = new ArrayList<>();
    List<String> keys = new ArrayList<>();
    while (true) {
        ConsumerRecords<String, byte[]> records = consumer.poll(100);
        for (ConsumerRecord<String, byte[]> record : records) {
            Person person = Person.desFromAvroBytes(record.value());
            if (testID.equals(person.group)) {
                keys.add(record.key());
                results.add(person);
            }
        }
        if (results.size() >= maxRecords) {
            break;
        }
    }
    // ----------------- Read data from TOPIC_AVRO_OUT done --------------------
    assertEquals(expectedPersons, results);

    List<String> expectedKeys = new ArrayList<>();
    for (Person person : results) {
        expectedKeys.add(person.name);
    }
    assertEquals(expectedKeys, keys);
}
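Both tests read the results back with an unbounded while (true) poll loop that only exits once maxRecords matching records have been seen, so they hang if the pipeline writes fewer records than expected. A deadline-bounded variant of the same loop, sketched here reusing the local results, consumer, testID and maxRecords from the tests above (TimeUnit is java.util.concurrent.TimeUnit), would fail fast instead:

    // Poll until enough records arrive or a fixed deadline passes, whichever comes first.
    long deadline = System.currentTimeMillis() + TimeUnit.MINUTES.toMillis(2);
    while (results.size() < maxRecords && System.currentTimeMillis() < deadline) {
        ConsumerRecords<String, byte[]> records = consumer.poll(100);
        for (ConsumerRecord<String, byte[]> record : records) {
            Person person = Person.desFromAvroBytes(record.value());
            if (testID.equals(person.group)) {
                results.add(person);
            }
        }
    }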
Use of org.apache.beam.sdk.PipelineResult in project DataflowJavaSDK-examples by GoogleCloudPlatform.
Class WindowedWordCount, method main.
public static void main(String[] args) throws IOException {
    Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
    final String output = options.getOutput();
    final Instant minTimestamp = new Instant(options.getMinTimestampMillis());
    final Instant maxTimestamp = new Instant(options.getMaxTimestampMillis());
    Pipeline pipeline = Pipeline.create(options);

    /*
     * Concept #1: the Beam SDK lets us run the same pipeline with either a bounded or
     * unbounded input source.
     */
    PCollection<String> input =
        pipeline
            .apply(TextIO.read().from(options.getInputFile()))
            .apply(ParDo.of(new AddTimestampFn(minTimestamp, maxTimestamp)));

    /*
     * Concept #3: Window into fixed windows. The fixed window size for this example defaults to 1
     * minute (you can change this with a command-line option). See the documentation for more
     * information on how fixed windows work, and for information on the other types of windowing
     * available (e.g., sliding windows).
     */
    PCollection<String> windowedWords =
        input.apply(Window.<String>into(FixedWindows.of(Duration.standardMinutes(options.getWindowSize()))));

    /*
     * Concept #4: Re-use our existing CountWords transform, which has no knowledge of windows,
     * over a PCollection containing windowed values.
     */
    PCollection<KV<String, Long>> wordCounts = windowedWords.apply(new WordCount.CountWords());

    /*
     * Concept #5: Format the results and write to a sharded file partitioned by window, using a
     * simple ParDo operation. Because there may be failures followed by retries, the
     * writes must be idempotent, but the details of writing to files are elided here.
     */
    wordCounts
        .apply(MapElements.via(new WordCount.FormatAsTextFn()))
        .apply(new WriteOneFilePerWindow(output, options.getNumShards()));

    PipelineResult result = pipeline.run();
    try {
        result.waitUntilFinish();
    } catch (Exception exc) {
        result.cancel();
    }
}
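The AddTimestampFn applied under Concept #1 is defined elsewhere in the example and is not shown here. A minimal sketch of such a DoFn, assuming it assigns each input line a pseudo-random event timestamp between minTimestamp and maxTimestamp (ThreadLocalRandom comes from java.util.concurrent), could look like this:

    static class AddTimestampFn extends DoFn<String, String> {

        private final Instant minTimestamp;
        private final Instant maxTimestamp;

        AddTimestampFn(Instant minTimestamp, Instant maxTimestamp) {
            this.minTimestamp = minTimestamp;
            this.maxTimestamp = maxTimestamp;
        }

        @ProcessElement
        public void processElement(ProcessContext c) {
            // Pick a pseudo-random timestamp in [minTimestamp, maxTimestamp) so the
            // bounded text input is spread across several fixed windows.
            long randomMillis = ThreadLocalRandom.current()
                    .nextLong(minTimestamp.getMillis(), maxTimestamp.getMillis());
            c.outputWithTimestamp(c.element(), new Instant(randomMillis));
        }
    }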
Use of org.apache.beam.sdk.PipelineResult in project component-runtime by Talend.
Class InMemoryQueueIOTest, method input.
@Test(timeout = 60000)
public void input() {
    INPUT_OUTPUTS.clear();
    final PipelineResult result;
    try (final LoopState state = LoopState.newTracker(null)) {
        IntStream.range(0, 2).forEach(i -> state.push(new RowStruct(i)));
        pipeline.apply(InMemoryQueueIO.from(state)).apply(ParDo.of(new DoFn<JsonObject, Void>() {

            @ProcessElement
            public void onElement(final ProcessContext context) {
                INPUT_OUTPUTS.add(context.element());
            }
        }));
        result = pipeline.run();
        IntStream.range(2, 5).forEach(i -> state.push(new RowStruct(i)));
        // for inputs it is key to notify Beam we are done
        state.end();
        final long end = System.currentTimeMillis() + TimeUnit.MINUTES.toMillis(2);
        while (INPUT_OUTPUTS.size() < 5 && end - System.currentTimeMillis() >= 0) {
            try {
                sleep(150);
            } catch (final InterruptedException e) {
                Thread.currentThread().interrupt();
            }
        }
    }
    result.waitUntilFinish();
    assertEquals(5, INPUT_OUTPUTS.size());
    assertEquals(
        IntStream.range(0, 5).boxed().collect(toSet()),
        INPUT_OUTPUTS.stream().mapToInt(o -> o.getInt("id")).boxed().collect(toSet()));
}
Use of org.apache.beam.sdk.PipelineResult in project beam by apache.
Class AvroIOIT, method writeThenReadAll.
@Test
public void writeThenReadAll() {
    PCollection<String> testFilenames =
        pipeline
            .apply("Generate sequence", GenerateSequence.from(0).to(numberOfTextLines))
            .apply("Produce text lines", ParDo.of(new FileBasedIOITHelper.DeterministicallyConstructTestTextLineFn()))
            .apply("Produce Avro records", ParDo.of(new DeterministicallyConstructAvroRecordsFn()))
            .setCoder(AvroCoder.of(AVRO_SCHEMA))
            .apply("Collect start time", ParDo.of(new TimeMonitor<>(AVRO_NAMESPACE, "writeStart")))
            .apply("Write Avro records to files",
                AvroIO.writeGenericRecords(AVRO_SCHEMA).to(filenamePrefix).withOutputFilenames().withSuffix(".avro"))
            .getPerDestinationOutputFilenames()
            .apply("Collect middle time", ParDo.of(new TimeMonitor<>(AVRO_NAMESPACE, "middlePoint")))
            .apply(Values.create());

    PCollection<String> consolidatedHashcode =
        testFilenames
            .apply("Match all files", FileIO.matchAll())
            .apply("Read matches", FileIO.readMatches().withDirectoryTreatment(DirectoryTreatment.PROHIBIT))
            .apply("Read files", AvroIO.readFilesGenericRecords(AVRO_SCHEMA))
            .apply("Collect end time", ParDo.of(new TimeMonitor<>(AVRO_NAMESPACE, "endPoint")))
            .apply("Parse Avro records to Strings", ParDo.of(new ParseAvroRecordsFn()))
            .apply("Calculate hashcode", Combine.globally(new HashingFn()));

    PAssert.thatSingleton(consolidatedHashcode).isEqualTo(expectedHash);

    testFilenames.apply(
        "Delete test files",
        ParDo.of(new DeleteFileFn()).withSideInputs(consolidatedHashcode.apply(View.asSingleton())));

    PipelineResult result = pipeline.run();
    result.waitUntilFinish();
    collectAndPublishMetrics(result);
}
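The collectAndPublishMetrics(result) helper is not shown here. A minimal sketch of pulling the recorded timings back out of the PipelineResult, assuming the TimeMonitor steps above register Beam Distribution metrics under AVRO_NAMESPACE (how that helper aggregates and publishes them is an implementation detail of the IT), might be:

    // Query every metric recorded in the test's namespace from the finished run.
    MetricQueryResults metrics = result.metrics().queryMetrics(
            MetricsFilter.builder()
                    .addNameFilter(MetricNameFilter.inNamespace(AVRO_NAMESPACE))
                    .build());
    // Each TimeMonitor step contributes a distribution of millisecond timestamps,
    // e.g. "writeStart", "middlePoint" and "endPoint".
    for (MetricResult<DistributionResult> distribution : metrics.getDistributions()) {
        System.out.println(distribution.getName()
                + " min=" + distribution.getAttempted().getMin()
                + " max=" + distribution.getAttempted().getMax());
    }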