
Example 11 with PipelineResult

Use of org.apache.beam.sdk.PipelineResult in project components by Talend.

From the class KafkaAvroBeamRuntimeTestIT, method avroBasicTest.

/**
 * Read Avro (Person) records and write them back in Avro (Person) format with the schema.
 */
@Test
public void avroBasicTest() throws IOException {
    String testID = "avroBasicTest" + new Random().nextInt();
    expectedPersons = Person.genRandomList(testID, maxRecords);
    // ----------------- Send data to TOPIC_AVRO_IN start --------------------
    Properties props = new Properties();
    props.put("bootstrap.servers", BOOTSTRAP_HOST);
    props.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer");
    props.put("value.serializer", "org.apache.kafka.common.serialization.ByteArraySerializer");
    Producer<Void, byte[]> producer = new KafkaProducer<>(props);
    for (Person person : expectedPersons) {
        ProducerRecord<Void, byte[]> message = new ProducerRecord<>(TOPIC_AVRO_IN, person.serToAvroBytes());
        producer.send(message);
    }
    producer.close();
    // ----------------- Send data to TOPIC_AVRO_IN done --------------------
    KafkaInputProperties inputProperties = new KafkaInputProperties("input");
    inputProperties.init();
    inputProperties.setDatasetProperties(inputDatasetProperties);
    inputProperties.autoOffsetReset.setValue(KafkaInputProperties.OffsetType.EARLIEST);
    inputProperties.useMaxNumRecords.setValue(false);
    // Alternative: bound the read by record count instead of by time.
    // inputProperties.maxNumRecords.setValue(maxRecords.longValue());
    inputProperties.useMaxReadTime.setValue(true);
    inputProperties.maxReadTime.setValue(5000L);
    KafkaOutputProperties outputProperties = new KafkaOutputProperties("output");
    outputProperties.init();
    outputProperties.setDatasetProperties(outputDatasetProperties);
    outputProperties.partitionType.setValue(KafkaOutputProperties.PartitionType.ROUND_ROBIN);
    outputProperties.useCompress.setValue(false);
    KafkaInputPTransformRuntime inputRuntime = new KafkaInputPTransformRuntime();
    inputRuntime.initialize(null, inputProperties);
    KafkaOutputPTransformRuntime outputRuntime = new KafkaOutputPTransformRuntime();
    outputRuntime.initialize(null, outputProperties);
    // ----------------- pipeline start --------------------
    pipeline.apply(inputRuntime).apply(Filter.by(new KafkaCsvBeamRuntimeTestIT.FilterByGroup(testID))).apply(outputRuntime);
    PipelineResult result = pipeline.run();
    // ----------------- pipeline done --------------------
    // ----------------- Read data from TOPIC_AVRO_OUT start --------------------
    props = new Properties();
    props.put("bootstrap.servers", BOOTSTRAP_HOST);
    props.put("group.id", "getResult");
    props.put("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
    props.put("value.deserializer", "org.apache.kafka.common.serialization.ByteArrayDeserializer");
    props.put("auto.offset.reset", "earliest");
    KafkaConsumer<String, byte[]> consumer = new KafkaConsumer<>(props);
    consumer.subscribe(Arrays.asList(TOPIC_AVRO_OUT));
    List<Person> results = new ArrayList<>();
    // Keep polling until every record produced for this test ID has been read back.
    while (true) {
        ConsumerRecords<String, byte[]> records = consumer.poll(100);
        for (ConsumerRecord<String, byte[]> record : records) {
            Person person = Person.desFromAvroBytes(record.value());
            if (testID.equals(person.group)) {
                results.add(person);
            }
        }
        if (results.size() >= maxRecords) {
            break;
        }
    }
    // ----------------- Read data from TOPIC_AVRO_OUT done --------------------
    assertEquals(expectedPersons, results);
}
Also used: KafkaProducer(org.apache.kafka.clients.producer.KafkaProducer) KafkaInputProperties(org.talend.components.kafka.input.KafkaInputProperties) ArrayList(java.util.ArrayList) PipelineResult(org.apache.beam.sdk.PipelineResult) KafkaConsumer(org.apache.kafka.clients.consumer.KafkaConsumer) KafkaDatasetProperties(org.talend.components.kafka.dataset.KafkaDatasetProperties) Properties(java.util.Properties) KafkaDatastoreProperties(org.talend.components.kafka.datastore.KafkaDatastoreProperties) KafkaOutputProperties(org.talend.components.kafka.output.KafkaOutputProperties) Random(java.util.Random) ProducerRecord(org.apache.kafka.clients.producer.ProducerRecord) Test(org.junit.Test)
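The PipelineResult captured by this test is not used further; a common follow-up in similar integration tests is to wait for the bounded read (limited by maxReadTime) to finish and assert on the final state. A minimal sketch, assuming a blocking runner such as the DirectRunner and the imports already shown:

PipelineResult result = pipeline.run();
// Block until the run finishes, then check the outcome reported by the runner.
PipelineResult.State state = result.waitUntilFinish();
assertEquals(PipelineResult.State.DONE, state);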

Example 12 with PipelineResult

Use of org.apache.beam.sdk.PipelineResult in project components by Talend.

From the class KafkaAvroBeamRuntimeTestIT, method avroBasicTest2.

/**
 * Read Avro (Person) records and write them back in Avro (Person) format with the schema,
 * partitioning the output by the name column (used as the Kafka message key).
 */
@Test
public void avroBasicTest2() throws IOException {
    String testID = "avroBasicTest2" + new Random().nextInt();
    expectedPersons = Person.genRandomList(testID, maxRecords);
    // ----------------- Send data to TOPIC_AVRO_IN start --------------------
    Properties props = new Properties();
    props.put("bootstrap.servers", BOOTSTRAP_HOST);
    props.put("key.serializer", "org.apache.kafka.common.serialization.StringSerializer");
    props.put("value.serializer", "org.apache.kafka.common.serialization.ByteArraySerializer");
    Producer<Void, byte[]> producer = new KafkaProducer<>(props);
    for (Person person : expectedPersons) {
        ProducerRecord<Void, byte[]> message = new ProducerRecord<>(TOPIC_AVRO_IN, person.serToAvroBytes());
        producer.send(message);
    }
    producer.close();
    // ----------------- Send data to TOPIC_AVRO_IN done --------------------
    KafkaInputProperties inputProperties = new KafkaInputProperties("input");
    inputProperties.init();
    inputProperties.setDatasetProperties(inputDatasetProperties);
    inputProperties.autoOffsetReset.setValue(KafkaInputProperties.OffsetType.EARLIEST);
    inputProperties.useMaxNumRecords.setValue(false);
    // Alternative: bound the read by record count instead of by time.
    // inputProperties.maxNumRecords.setValue(maxRecords.longValue());
    inputProperties.useMaxReadTime.setValue(true);
    inputProperties.maxReadTime.setValue(5000L);
    KafkaOutputProperties outputProperties = new KafkaOutputProperties("output");
    outputProperties.init();
    outputProperties.setDatasetProperties(outputDatasetProperties);
    outputProperties.partitionType.setValue(KafkaOutputProperties.PartitionType.COLUMN);
    outputProperties.keyColumn.setValue("name");
    outputProperties.useCompress.setValue(false);
    KafkaInputPTransformRuntime inputRuntime = new KafkaInputPTransformRuntime();
    inputRuntime.initialize(null, inputProperties);
    KafkaOutputPTransformRuntime outputRuntime = new KafkaOutputPTransformRuntime();
    outputRuntime.initialize(null, outputProperties);
    // ----------------- pipeline start --------------------
    pipeline.apply(inputRuntime).apply(Filter.by(new KafkaCsvBeamRuntimeTestIT.FilterByGroup(testID))).apply(outputRuntime);
    PipelineResult result = pipeline.run();
    // ----------------- pipeline done --------------------
    // ----------------- Read data from TOPIC_AVRO_OUT start --------------------
    props = new Properties();
    props.put("bootstrap.servers", BOOTSTRAP_HOST);
    props.put("group.id", "getResult");
    props.put("key.deserializer", "org.apache.kafka.common.serialization.StringDeserializer");
    props.put("value.deserializer", "org.apache.kafka.common.serialization.ByteArrayDeserializer");
    props.put("auto.offset.reset", "earliest");
    KafkaConsumer<String, byte[]> consumer = new KafkaConsumer<>(props);
    consumer.subscribe(Arrays.asList(TOPIC_AVRO_OUT));
    List<Person> results = new ArrayList<>();
    List<String> keys = new ArrayList<>();
    // Keep polling until every record produced for this test ID has been read back.
    while (true) {
        ConsumerRecords<String, byte[]> records = consumer.poll(100);
        for (ConsumerRecord<String, byte[]> record : records) {
            Person person = Person.desFromAvroBytes(record.value());
            if (testID.equals(person.group)) {
                keys.add(record.key());
                results.add(person);
            }
        }
        if (results.size() >= maxRecords) {
            break;
        }
    }
    // ----------------- Read data from TOPIC_AVRO_OUT done --------------------
    assertEquals(expectedPersons, results);
    List<String> expectedKeys = new ArrayList<>();
    for (Person person : results) {
        expectedKeys.add(person.name);
    }
    assertEquals(expectedKeys, keys);
}
Also used: KafkaProducer(org.apache.kafka.clients.producer.KafkaProducer) KafkaInputProperties(org.talend.components.kafka.input.KafkaInputProperties) ArrayList(java.util.ArrayList) PipelineResult(org.apache.beam.sdk.PipelineResult) KafkaConsumer(org.apache.kafka.clients.consumer.KafkaConsumer) KafkaDatasetProperties(org.talend.components.kafka.dataset.KafkaDatasetProperties) Properties(java.util.Properties) KafkaDatastoreProperties(org.talend.components.kafka.datastore.KafkaDatastoreProperties) KafkaOutputProperties(org.talend.components.kafka.output.KafkaOutputProperties) Random(java.util.Random) ProducerRecord(org.apache.kafka.clients.producer.ProducerRecord) Test(org.junit.Test)
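The only difference from avroBasicTest is PartitionType.COLUMN with keyColumn set to "name", which makes the output runtime use each record's name field as the Kafka message key (verified by the expectedKeys assertion). With a plain producer the equivalent keyed send would look roughly like the sketch below, reusing Person, TOPIC_AVRO_OUT, and the producer Properties from the send phase of the test (the key serializer configured there is already the StringSerializer):

Producer<String, byte[]> keyedProducer = new KafkaProducer<>(props);
for (Person person : expectedPersons) {
    // The record key drives Kafka's default partitioner, mirroring PartitionType.COLUMN on "name".
    keyedProducer.send(new ProducerRecord<>(TOPIC_AVRO_OUT, person.name, person.serToAvroBytes()));
}
keyedProducer.close();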

Example 13 with PipelineResult

Use of org.apache.beam.sdk.PipelineResult in project DataflowJavaSDK-examples by GoogleCloudPlatform.

From the class WindowedWordCount, method main.

public static void main(String[] args) throws IOException {
    Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
    final String output = options.getOutput();
    final Instant minTimestamp = new Instant(options.getMinTimestampMillis());
    final Instant maxTimestamp = new Instant(options.getMaxTimestampMillis());
    Pipeline pipeline = Pipeline.create(options);
    /**
     * Concept #1: the Beam SDK lets us run the same pipeline with either a bounded or
     * unbounded input source.
     */
    PCollection<String> input = pipeline.apply(TextIO.read().from(options.getInputFile())).apply(ParDo.of(new AddTimestampFn(minTimestamp, maxTimestamp)));
    /**
     * Concept #3: Window into fixed windows. The fixed window size for this example defaults to 1
     * minute (you can change this with a command-line option). See the documentation for more
     * information on how fixed windows work, and for information on the other types of windowing
     * available (e.g., sliding windows).
     */
    PCollection<String> windowedWords = input.apply(Window.<String>into(FixedWindows.of(Duration.standardMinutes(options.getWindowSize()))));
    /**
     * Concept #4: Re-use our existing CountWords transform, which has no knowledge of windows,
     * over a PCollection containing windowed values.
     */
    PCollection<KV<String, Long>> wordCounts = windowedWords.apply(new WordCount.CountWords());
    /**
     * Concept #5: Format the results and write to a sharded file partitioned by window, using a
     * simple ParDo operation. Because there may be failures followed by retries, the
     * writes must be idempotent, but the details of writing to files are elided here.
     */
    wordCounts.apply(MapElements.via(new WordCount.FormatAsTextFn())).apply(new WriteOneFilePerWindow(output, options.getNumShards()));
    PipelineResult result = pipeline.run();
    try {
        result.waitUntilFinish();
    } catch (Exception exc) {
        result.cancel();
    }
}
Also used: ExampleBigQueryTableOptions(com.google.cloud.dataflow.examples.common.ExampleBigQueryTableOptions) ExampleOptions(com.google.cloud.dataflow.examples.common.ExampleOptions) PipelineOptions(org.apache.beam.sdk.options.PipelineOptions) WriteOneFilePerWindow(com.google.cloud.dataflow.examples.common.WriteOneFilePerWindow) Instant(org.joda.time.Instant) PipelineResult(org.apache.beam.sdk.PipelineResult) KV(org.apache.beam.sdk.values.KV) IOException(java.io.IOException) Pipeline(org.apache.beam.sdk.Pipeline)
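AddTimestampFn is defined elsewhere in the WindowedWordCount example and is not shown here. A sketch of such a timestamping DoFn, assuming java.util.concurrent.ThreadLocalRandom in addition to the imports listed above (the uniform random distribution is an assumption):

static class AddTimestampFn extends DoFn<String, String> {

    private final Instant minTimestamp;
    private final Instant maxTimestamp;

    AddTimestampFn(Instant minTimestamp, Instant maxTimestamp) {
        this.minTimestamp = minTimestamp;
        this.maxTimestamp = maxTimestamp;
    }

    @ProcessElement
    public void processElement(ProcessContext c) {
        // Assign a pseudo-random event timestamp between minTimestamp and maxTimestamp so that
        // fixed windowing spreads the bounded input across several windows.
        long randomMillis = ThreadLocalRandom.current()
                .nextLong(minTimestamp.getMillis(), maxTimestamp.getMillis() + 1);
        c.outputWithTimestamp(c.element(), new Instant(randomMillis));
    }
}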

Example 14 with PipelineResult

Use of org.apache.beam.sdk.PipelineResult in project component-runtime by Talend.

From the class InMemoryQueueIOTest, method input.

@Test(timeout = 60000)
public void input() {
    INPUT_OUTPUTS.clear();
    final PipelineResult result;
    try (final LoopState state = LoopState.newTracker(null)) {
        IntStream.range(0, 2).forEach(i -> state.push(new RowStruct(i)));
        pipeline.apply(InMemoryQueueIO.from(state)).apply(ParDo.of(new DoFn<JsonObject, Void>() {

            @ProcessElement
            public void onElement(final ProcessContext context) {
                INPUT_OUTPUTS.add(context.element());
            }
        }));
        result = pipeline.run();
        IntStream.range(2, 5).forEach(i -> state.push(new RowStruct(i)));
        // for inputs it is essential to notify Beam that no more data will arrive
        state.end();
        // Wait up to two minutes for the asynchronous pipeline to drain all five records.
        final long end = System.currentTimeMillis() + TimeUnit.MINUTES.toMillis(2);
        while (INPUT_OUTPUTS.size() < 5 && end - System.currentTimeMillis() >= 0) {
            try {
                sleep(150);
            } catch (final InterruptedException e) {
                Thread.currentThread().interrupt();
            }
        }
    }
    result.waitUntilFinish();
    assertEquals(5, INPUT_OUTPUTS.size());
    assertEquals(IntStream.range(0, 5).boxed().collect(toSet()), INPUT_OUTPUTS.stream().mapToInt(o -> o.getInt("id")).boxed().collect(toSet()));
}
Also used: IntStream(java.util.stream.IntStream) DoFn(org.apache.beam.sdk.transforms.DoFn) JsonObject(javax.json.JsonObject) SerializableCoder(org.apache.beam.sdk.coders.SerializableCoder) Collection(java.util.Collection) PipelineResult(org.apache.beam.sdk.PipelineResult) Test(org.junit.Test) PipelineOptionsFactory(org.apache.beam.sdk.options.PipelineOptionsFactory) Serializable(java.io.Serializable) ArrayList(java.util.ArrayList) TimeUnit(java.util.concurrent.TimeUnit) Collectors.toList(java.util.stream.Collectors.toList) Rule(org.junit.Rule) ParDo(org.apache.beam.sdk.transforms.ParDo) Create(org.apache.beam.sdk.transforms.Create) TestPipeline(org.apache.beam.sdk.testing.TestPipeline) Data(lombok.Data) Thread.sleep(java.lang.Thread.sleep) Assertions.assertEquals(org.junit.jupiter.api.Assertions.assertEquals) ComponentManager(org.talend.sdk.component.runtime.manager.ComponentManager) AllArgsConstructor(lombok.AllArgsConstructor) JsonpJsonObjectCoder(org.talend.sdk.component.runtime.beam.coder.JsonpJsonObjectCoder) Collectors.toSet(java.util.stream.Collectors.toSet) CopyOnWriteArrayList(java.util.concurrent.CopyOnWriteArrayList)
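The deadline-bounded polling loop is a recurring pattern in these asynchronous IO tests. A small reusable helper could factor it out; this is a hypothetical sketch, not part of the project, assuming java.util.function.BooleanSupplier and java.time.Duration:

// Hypothetical helper: poll a condition until it becomes true or the timeout elapses.
static boolean waitUntil(final BooleanSupplier condition, final Duration timeout) {
    final long deadline = System.currentTimeMillis() + timeout.toMillis();
    while (!condition.getAsBoolean() && System.currentTimeMillis() < deadline) {
        try {
            Thread.sleep(150);
        } catch (final InterruptedException e) {
            Thread.currentThread().interrupt();
            break;
        }
    }
    return condition.getAsBoolean();
}

The loop in the test would then collapse to waitUntil(() -> INPUT_OUTPUTS.size() >= 5, Duration.ofMinutes(2)).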

Example 15 with PipelineResult

Use of org.apache.beam.sdk.PipelineResult in project beam by apache.

From the class AvroIOIT, method writeThenReadAll.

@Test
public void writeThenReadAll() {
    PCollection<String> testFilenames = pipeline
            .apply("Generate sequence", GenerateSequence.from(0).to(numberOfTextLines))
            .apply("Produce text lines", ParDo.of(new FileBasedIOITHelper.DeterministicallyConstructTestTextLineFn()))
            .apply("Produce Avro records", ParDo.of(new DeterministicallyConstructAvroRecordsFn()))
            .setCoder(AvroCoder.of(AVRO_SCHEMA))
            .apply("Collect start time", ParDo.of(new TimeMonitor<>(AVRO_NAMESPACE, "writeStart")))
            .apply("Write Avro records to files", AvroIO.writeGenericRecords(AVRO_SCHEMA).to(filenamePrefix).withOutputFilenames().withSuffix(".avro"))
            .getPerDestinationOutputFilenames()
            .apply("Collect middle time", ParDo.of(new TimeMonitor<>(AVRO_NAMESPACE, "middlePoint")))
            .apply(Values.create());
    PCollection<String> consolidatedHashcode = testFilenames
            .apply("Match all files", FileIO.matchAll())
            .apply("Read matches", FileIO.readMatches().withDirectoryTreatment(DirectoryTreatment.PROHIBIT))
            .apply("Read files", AvroIO.readFilesGenericRecords(AVRO_SCHEMA))
            .apply("Collect end time", ParDo.of(new TimeMonitor<>(AVRO_NAMESPACE, "endPoint")))
            .apply("Parse Avro records to Strings", ParDo.of(new ParseAvroRecordsFn()))
            .apply("Calculate hashcode", Combine.globally(new HashingFn()));
    PAssert.thatSingleton(consolidatedHashcode).isEqualTo(expectedHash);
    testFilenames.apply("Delete test files", ParDo.of(new DeleteFileFn()).withSideInputs(consolidatedHashcode.apply(View.asSingleton())));
    PipelineResult result = pipeline.run();
    result.waitUntilFinish();
    collectAndPublishMetrics(result);
}
Also used: TimeMonitor(org.apache.beam.sdk.testutils.metrics.TimeMonitor) FileBasedIOITHelper(org.apache.beam.sdk.io.common.FileBasedIOITHelper) PipelineResult(org.apache.beam.sdk.PipelineResult) HashingFn(org.apache.beam.sdk.io.common.HashingFn) DeleteFileFn(org.apache.beam.sdk.io.common.FileBasedIOITHelper.DeleteFileFn) Test(org.junit.Test)
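collectAndPublishMetrics is defined elsewhere in AvroIOIT. Reading the TimeMonitor distributions back from the PipelineResult generally follows the pattern sketched below, assuming the org.apache.beam.sdk.metrics classes (MetricsFilter, MetricNameFilter, MetricResult, DistributionResult) and the AVRO_NAMESPACE used by the TimeMonitor steps above:

MetricQueryResults metrics = result
        .metrics()
        .queryMetrics(MetricsFilter.builder()
                .addNameFilter(MetricNameFilter.inNamespace(AVRO_NAMESPACE))
                .build());
for (MetricResult<DistributionResult> distribution : metrics.getDistributions()) {
    // Each TimeMonitor step records a distribution of timestamps; min/max bound that step's span.
    System.out.printf("%s: min=%d max=%d%n",
            distribution.getName(),
            distribution.getAttempted().getMin(),
            distribution.getAttempted().getMax());
}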

Aggregations

PipelineResult (org.apache.beam.sdk.PipelineResult): 105
Test (org.junit.Test): 66
Pipeline (org.apache.beam.sdk.Pipeline): 29
TestPipeline (org.apache.beam.sdk.testing.TestPipeline): 18
PCollection (org.apache.beam.sdk.values.PCollection): 18
TimeMonitor (org.apache.beam.sdk.testutils.metrics.TimeMonitor): 14
ArrayList (java.util.ArrayList): 12
Category (org.junit.experimental.categories.Category): 12
KV (org.apache.beam.sdk.values.KV): 11
Rule (org.junit.Rule): 11
IOException (java.io.IOException): 10
ExampleUtils (org.apache.beam.examples.common.ExampleUtils): 10
DoFn (org.apache.beam.sdk.transforms.DoFn): 10
HashingFn (org.apache.beam.sdk.io.common.HashingFn): 9
RunWith (org.junit.runner.RunWith): 9
MetricQueryResults (org.apache.beam.sdk.metrics.MetricQueryResults): 8
ParDo (org.apache.beam.sdk.transforms.ParDo): 8
Duration (org.joda.time.Duration): 8
Map (java.util.Map): 7
TableReference (com.google.api.services.bigquery.model.TableReference): 6