Example 66 with Pipeline

Use of org.apache.beam.sdk.Pipeline in project beam by apache.

From the class ParDoTranslatorTest, method testAssertionFailure:

@Test
public void testAssertionFailure() throws Exception {
    ApexPipelineOptions options = PipelineOptionsFactory.create().as(ApexPipelineOptions.class);
    options.setRunner(TestApexRunner.class);
    Pipeline pipeline = Pipeline.create(options);
    PCollection<Integer> pcollection = pipeline.apply(Create.of(1, 2, 3, 4));
    PAssert.that(pcollection).containsInAnyOrder(2, 1, 4, 3, 7);
    Throwable exc = runExpectingAssertionFailure(pipeline);
    Pattern expectedPattern = Pattern.compile("Expected: iterable over \\[((<4>|<7>|<3>|<2>|<1>)(, )?){5}\\] in any order");
    // A loose pattern, but should get the job done.
    assertTrue("Expected error message from PAssert with substring matching " + expectedPattern + " but the message was \"" + exc.getMessage() + "\"", expectedPattern.matcher(exc.getMessage()).find());
}
Also used: Pattern (java.util.regex.Pattern), ApexPipelineOptions (org.apache.beam.runners.apex.ApexPipelineOptions), Pipeline (org.apache.beam.sdk.Pipeline), Test (org.junit.Test)
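
The runExpectingAssertionFailure helper is referenced but not shown here. A minimal sketch of what such a helper can look like, assuming TestApexRunner surfaces PAssert failures as exceptions thrown from run() (the actual helper in ParDoTranslatorTest may differ):

// Hypothetical helper; requires: import static org.junit.Assert.fail;
private static Throwable runExpectingAssertionFailure(Pipeline pipeline) {
    try {
        // On this runner the PAssert failure propagates out of run().
        pipeline.run();
    } catch (Throwable exc) {
        return exc;
    }
    fail("assertion should have failed");
    throw new AssertionError("unreachable");
}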

Example 67 with Pipeline

Use of org.apache.beam.sdk.Pipeline in project beam by apache.

From the class ParDoTranslatorTest, method testContainsInAnyOrder:

@Test
public void testContainsInAnyOrder() throws Exception {
    ApexPipelineOptions options = PipelineOptionsFactory.create().as(ApexPipelineOptions.class);
    options.setRunner(TestApexRunner.class);
    Pipeline pipeline = Pipeline.create(options);
    PCollection<Integer> pcollection = pipeline.apply(Create.of(1, 2, 3, 4));
    PAssert.that(pcollection).containsInAnyOrder(2, 1, 4, 3);
    // TODO: terminate faster based on processed assertion vs. auto-shutdown
    pipeline.run();
}
Also used: ApexPipelineOptions (org.apache.beam.runners.apex.ApexPipelineOptions), Pipeline (org.apache.beam.sdk.Pipeline), Test (org.junit.Test)
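
For comparison, the same order-insensitive assertion written against Beam's runner-agnostic TestPipeline rather than the Apex runner; a minimal sketch, not part of the test class above:

// Illustrative only: TestPipeline picks the runner up from test options.
Pipeline p = TestPipeline.create();
PCollection<Integer> nums = p.apply(Create.of(1, 2, 3, 4));
// Any permutation of the expected elements satisfies containsInAnyOrder.
PAssert.that(nums).containsInAnyOrder(4, 3, 2, 1);
p.run().waitUntilFinish();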

Example 68 with Pipeline

Use of org.apache.beam.sdk.Pipeline in project beam by apache.

From the class ParDoTranslatorTest, method testMultiOutputParDoWithSideInputs:

@Test
public void testMultiOutputParDoWithSideInputs() throws Exception {
    ApexPipelineOptions options = PipelineOptionsFactory.create().as(ApexPipelineOptions.class);
    // non-blocking run
    options.setRunner(ApexRunner.class);
    Pipeline pipeline = Pipeline.create(options);
    List<Integer> inputs = Arrays.asList(3, -42, 666);
    final TupleTag<String> mainOutputTag = new TupleTag<>("main");
    final TupleTag<Void> additionalOutputTag = new TupleTag<>("output");
    PCollectionView<Integer> sideInput1 =
        pipeline
            .apply("CreateSideInput1", Create.of(11))
            .apply("ViewSideInput1", View.<Integer>asSingleton());
    PCollectionView<Integer> sideInputUnread =
        pipeline
            .apply("CreateSideInputUnread", Create.of(-3333))
            .apply("ViewSideInputUnread", View.<Integer>asSingleton());
    PCollectionView<Integer> sideInput2 =
        pipeline
            .apply("CreateSideInput2", Create.of(222))
            .apply("ViewSideInput2", View.<Integer>asSingleton());
    PCollectionTuple outputs =
        pipeline
            .apply(Create.of(inputs))
            .apply(
                ParDo.of(
                        new TestMultiOutputWithSideInputsFn(
                            Arrays.asList(sideInput1, sideInput2),
                            Arrays.<TupleTag<String>>asList()))
                    .withSideInputs(sideInput1)
                    .withSideInputs(sideInputUnread)
                    .withSideInputs(sideInput2)
                    .withOutputTags(mainOutputTag, TupleTagList.of(additionalOutputTag)));
    outputs.get(mainOutputTag).apply(ParDo.of(new EmbeddedCollector()));
    outputs.get(additionalOutputTag).setCoder(VoidCoder.of());
    ApexRunnerResult result = (ApexRunnerResult) pipeline.run();
    HashSet<String> expected = Sets.newHashSet("processing: 3: [11, 222]", "processing: -42: [11, 222]", "processing: 666: [11, 222]");
    long timeout = System.currentTimeMillis() + TIMEOUT_MILLIS;
    while (System.currentTimeMillis() < timeout) {
        if (EmbeddedCollector.RESULTS.containsAll(expected)) {
            break;
        }
        LOG.info("Waiting for expected results.");
        Thread.sleep(SLEEP_MILLIS);
    }
    result.cancel();
    Assert.assertEquals(Sets.newHashSet(expected), EmbeddedCollector.RESULTS);
}
Also used: TupleTag (org.apache.beam.sdk.values.TupleTag), ApexRunnerResult (org.apache.beam.runners.apex.ApexRunnerResult), Pipeline (org.apache.beam.sdk.Pipeline), PCollectionTuple (org.apache.beam.sdk.values.PCollectionTuple), ApexPipelineOptions (org.apache.beam.runners.apex.ApexPipelineOptions), Test (org.junit.Test)
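
TestMultiOutputWithSideInputsFn itself is defined elsewhere in ParDoTranslatorTest. Judging from the expected output strings ("processing: 3: [11, 222]"), a DoFn along these lines would produce them; a sketch under that assumption, not the actual class:

// Sketch only; requires java.util.ArrayList and java.util.List imports.
static class TestMultiOutputWithSideInputsFn extends DoFn<Integer, String> {
    private final List<PCollectionView<Integer>> sideInputViews;

    TestMultiOutputWithSideInputsFn(
            List<PCollectionView<Integer>> sideInputViews, List<TupleTag<String>> tags) {
        this.sideInputViews = sideInputViews;
    }

    @ProcessElement
    public void processElement(ProcessContext c) {
        List<Integer> sideValues = new ArrayList<>();
        for (PCollectionView<Integer> view : sideInputViews) {
            // Side inputs are read per element through the process context.
            sideValues.add(c.sideInput(view));
        }
        c.output("processing: " + c.element() + ": " + sideValues);
    }
}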

Example 69 with Pipeline

Use of org.apache.beam.sdk.Pipeline in project beam by apache.

From the class GroupByNullKeyTest, method testProgram:

@Override
protected void testProgram() throws Exception {
    Pipeline p = FlinkTestPipeline.createForStreaming();
    PCollection<String> output =
        p.apply(
                Create.of(Arrays.asList(
                    KV.<Integer, String>of(0, "user1"),
                    KV.<Integer, String>of(1, "user1"),
                    KV.<Integer, String>of(2, "user1"),
                    KV.<Integer, String>of(10, "user2"),
                    KV.<Integer, String>of(1, "user2"),
                    KV.<Integer, String>of(15000, "user2"),
                    KV.<Integer, String>of(12000, "user2"),
                    KV.<Integer, String>of(25000, "user3"))))
            .apply(ParDo.of(new ExtractUserAndTimestamp()))
            .apply(
                Window.<String>into(FixedWindows.of(Duration.standardHours(1)))
                    .triggering(AfterWatermark.pastEndOfWindow())
                    .withAllowedLateness(Duration.ZERO)
                    .discardingFiredPanes())
            .apply(ParDo.of(new DoFn<String, KV<Void, String>>() {

        @ProcessElement
        public void processElement(ProcessContext c) throws Exception {
            String elem = c.element();
            c.output(KV.<Void, String>of(null, elem));
        }
    }))
        .apply(GroupByKey.<Void, String>create())
        .apply(ParDo.of(new DoFn<KV<Void, Iterable<String>>, String>() {

        @ProcessElement
        public void processElement(ProcessContext c) throws Exception {
            KV<Void, Iterable<String>> elem = c.element();
            StringBuilder str = new StringBuilder();
            str.append("k: " + elem.getKey() + " v:");
            for (String v : elem.getValue()) {
                str.append(" " + v);
            }
            c.output(str.toString());
        }
    }));
    output.apply(TextIO.write().to(resultPath));
    p.run();
}
Also used: DoFn (org.apache.beam.sdk.transforms.DoFn), FlinkTestPipeline (org.apache.beam.runners.flink.FlinkTestPipeline), Pipeline (org.apache.beam.sdk.Pipeline)
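
The essence of this test is keying every element with the same null Void key, so that GroupByKey gathers an entire window into a single group. That step in isolation, as a minimal sketch assuming a windowed PCollection<String> named input (hypothetical):

// Void has no non-null values, so every element in a window shares one key
// and GroupByKey emits a single group per window.
PCollection<KV<Void, String>> keyed = input.apply(
    ParDo.of(new DoFn<String, KV<Void, String>>() {
        @ProcessElement
        public void processElement(ProcessContext c) {
            c.output(KV.<Void, String>of(null, c.element()));
        }
    }));
PCollection<KV<Void, Iterable<String>>> grouped =
    keyed.apply(GroupByKey.<Void, String>create());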

Example 70 with Pipeline

Use of org.apache.beam.sdk.Pipeline in project beam by apache.

From the class TopWikipediaSessionsITCase, method testProgram:

@Override
protected void testProgram() throws Exception {
    Pipeline p = FlinkTestPipeline.createForStreaming();
    Long now = (System.currentTimeMillis() + 10000) / 1000;
    PCollection<KV<String, Long>> output =
        p.apply(
                Create.of(Arrays.asList(
                    new TableRow().set("timestamp", now).set("contributor_username", "user1"),
                    new TableRow().set("timestamp", now + 10).set("contributor_username", "user3"),
                    new TableRow().set("timestamp", now).set("contributor_username", "user2"),
                    new TableRow().set("timestamp", now).set("contributor_username", "user1"),
                    new TableRow().set("timestamp", now + 2).set("contributor_username", "user1"),
                    new TableRow().set("timestamp", now).set("contributor_username", "user2"),
                    new TableRow().set("timestamp", now + 1).set("contributor_username", "user2"),
                    new TableRow().set("timestamp", now + 5).set("contributor_username", "user2"),
                    new TableRow().set("timestamp", now + 7).set("contributor_username", "user2"),
                    new TableRow().set("timestamp", now + 8).set("contributor_username", "user2"),
                    new TableRow().set("timestamp", now + 200).set("contributor_username", "user2"),
                    new TableRow().set("timestamp", now + 230).set("contributor_username", "user1"),
                    new TableRow().set("timestamp", now + 230).set("contributor_username", "user2"),
                    new TableRow().set("timestamp", now + 240).set("contributor_username", "user2"),
                    new TableRow().set("timestamp", now + 245).set("contributor_username", "user3"),
                    new TableRow().set("timestamp", now + 235).set("contributor_username", "user3"),
                    new TableRow().set("timestamp", now + 236).set("contributor_username", "user3"),
                    new TableRow().set("timestamp", now + 237).set("contributor_username", "user3"),
                    new TableRow().set("timestamp", now + 238).set("contributor_username", "user3"),
                    new TableRow().set("timestamp", now + 239).set("contributor_username", "user3"),
                    new TableRow().set("timestamp", now + 240).set("contributor_username", "user3"),
                    new TableRow().set("timestamp", now + 241).set("contributor_username", "user2"),
                    new TableRow().set("timestamp", now).set("contributor_username", "user3"))))
            .apply(ParDo.of(new DoFn<TableRow, String>() {

        @ProcessElement
        public void processElement(ProcessContext c) throws Exception {
            TableRow row = c.element();
            long timestamp = (Integer) row.get("timestamp");
            String userName = (String) row.get("contributor_username");
            if (userName != null) {
                // Sets the timestamp field to be used in windowing.
                c.outputWithTimestamp(userName, new Instant(timestamp * 1000L));
            }
        }
    }))
        .apply(Window.<String>into(Sessions.withGapDuration(Duration.standardMinutes(1))))
        .apply(Count.<String>perElement());
    PCollection<String> format = output.apply(ParDo.of(new DoFn<KV<String, Long>, String>() {

        @ProcessElement
        public void processElement(ProcessContext c) throws Exception {
            KV<String, Long> el = c.element();
            String out = "user: " + el.getKey() + " value:" + el.getValue();
            c.output(out);
        }
    }));
    format.apply(TextIO.write().to(resultPath));
    p.run();
}
Also used: Instant (org.joda.time.Instant), KV (org.apache.beam.sdk.values.KV), FlinkTestPipeline (org.apache.beam.runners.flink.FlinkTestPipeline), Pipeline (org.apache.beam.sdk.Pipeline), DoFn (org.apache.beam.sdk.transforms.DoFn), TableRow (com.google.api.services.bigquery.model.TableRow)
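
Sessions.withGapDuration merges elements whose timestamps fall within the gap of one another. With the data above, user2's edits at now through now + 8 merge into one session, while the edits from now + 200 onward open a second session: the 192-second gap exceeds the one-minute gap duration. The windowing-and-count step in isolation, as a sketch (assuming a timestamped PCollection<String> of user names; method name is illustrative):

static PCollection<KV<String, Long>> countEditsPerSession(PCollection<String> userNames) {
    return userNames
        // Per-user edits separated by less than one minute share a session window.
        .apply(Window.<String>into(Sessions.withGapDuration(Duration.standardMinutes(1))))
        .apply(Count.<String>perElement());
}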

Aggregations

Pipeline (org.apache.beam.sdk.Pipeline): 184
Test (org.junit.Test): 123
TestPipeline (org.apache.beam.sdk.testing.TestPipeline): 86
DataflowPipelineOptions (org.apache.beam.runners.dataflow.options.DataflowPipelineOptions): 39
KV (org.apache.beam.sdk.values.KV): 35
Job (com.google.api.services.dataflow.model.Job): 26
DoFn (org.apache.beam.sdk.transforms.DoFn): 24
PipelineOptions (org.apache.beam.sdk.options.PipelineOptions): 22
DataflowPackage (com.google.api.services.dataflow.model.DataflowPackage): 21
TableRow (com.google.api.services.bigquery.model.TableRow): 16
PipelineResult (org.apache.beam.sdk.PipelineResult): 14
Structs.getString (org.apache.beam.runners.dataflow.util.Structs.getString): 13
TableSchema (com.google.api.services.bigquery.model.TableSchema): 12
ApexPipelineOptions (org.apache.beam.runners.apex.ApexPipelineOptions): 12
Map (java.util.Map): 11
TableFieldSchema (com.google.api.services.bigquery.model.TableFieldSchema): 10
ArrayList (java.util.ArrayList): 10
Instant (org.joda.time.Instant): 10
TableReference (com.google.api.services.bigquery.model.TableReference): 9
JsonSchemaToTableSchema (org.apache.beam.sdk.io.gcp.bigquery.BigQueryHelpers.JsonSchemaToTableSchema): 9