Examples with Pipeline - org.apache.beam.sdk.Pipeline

Example 31 with Pipeline

use of org.apache.beam.sdk.Pipeline in project beam by apache.

the class BigQueryIOTest method testBuildSourceWithTableAndSqlDialect.

@Test
public void testBuildSourceWithTableAndSqlDialect() {
    BigQueryOptions bqOptions = PipelineOptionsFactory.as(BigQueryOptions.class);
    bqOptions.setProject("defaultproject");
    bqOptions.setTempLocation("gs://testbucket/testdir");
    Pipeline p = TestPipeline.create(bqOptions);
    thrown.expect(IllegalStateException.class);
    thrown.expectMessage("Invalid BigQueryIO.Read: Specifies a table with a SQL dialect preference," + " which only applies to queries");
    p.apply(BigQueryIO.read().from("foo.com:project:somedataset.sometable").usingStandardSql());
    p.run();
}

Also used : TestPipeline(org.apache.beam.sdk.testing.TestPipeline) Pipeline(org.apache.beam.sdk.Pipeline) Test(org.junit.Test)

Example 32 with Pipeline

use of org.apache.beam.sdk.Pipeline in project beam by apache.

the class BigQueryIOTest method testValidateReadSetsDefaultProject.

@Test
public void testValidateReadSetsDefaultProject() throws Exception {
    String projectId = "someproject";
    String datasetId = "somedataset";
    String tableId = "sometable";
    BigQueryOptions bqOptions = TestPipeline.testingPipelineOptions().as(BigQueryOptions.class);
    bqOptions.setProject(projectId);
    Path baseDir = Files.createTempDirectory(tempFolder, "testValidateReadSetsDefaultProject");
    bqOptions.setTempLocation(baseDir.toString());
    FakeDatasetService fakeDatasetService = new FakeDatasetService();
    fakeDatasetService.createDataset(projectId, datasetId, "", "");
    TableReference tableReference = new TableReference().setProjectId(projectId).setDatasetId(datasetId).setTableId(tableId);
    fakeDatasetService.createTable(new Table().setTableReference(tableReference).setSchema(new TableSchema().setFields(ImmutableList.of(new TableFieldSchema().setName("name").setType("STRING"), new TableFieldSchema().setName("number").setType("INTEGER")))));
    FakeBigQueryServices fakeBqServices = new FakeBigQueryServices().withJobService(new FakeJobService()).withDatasetService(fakeDatasetService);
    List<TableRow> expected = ImmutableList.of(new TableRow().set("name", "a").set("number", 1L), new TableRow().set("name", "b").set("number", 2L), new TableRow().set("name", "c").set("number", 3L), new TableRow().set("name", "d").set("number", 4L), new TableRow().set("name", "e").set("number", 5L), new TableRow().set("name", "f").set("number", 6L));
    fakeDatasetService.insertAll(tableReference, expected, null);
    Pipeline p = TestPipeline.create(bqOptions);
    TableReference tableRef = new TableReference();
    tableRef.setDatasetId(datasetId);
    tableRef.setTableId(tableId);
    PCollection<KV<String, Long>> output = p.apply(BigQueryIO.read().from(tableRef).withTestServices(fakeBqServices)).apply(ParDo.of(new DoFn<TableRow, KV<String, Long>>() {

        @ProcessElement
        public void processElement(ProcessContext c) throws Exception {
            c.output(KV.of((String) c.element().get("name"), Long.valueOf((String) c.element().get("number"))));
        }
    }));
    PAssert.that(output).containsInAnyOrder(ImmutableList.of(KV.of("a", 1L), KV.of("b", 2L), KV.of("c", 3L), KV.of("d", 4L), KV.of("e", 5L), KV.of("f", 6L)));
    p.run();
}

Also used : Path(java.nio.file.Path) HashBasedTable(com.google.common.collect.HashBasedTable) Table(com.google.api.services.bigquery.model.Table) TableSchema(com.google.api.services.bigquery.model.TableSchema) JsonSchemaToTableSchema(org.apache.beam.sdk.io.gcp.bigquery.BigQueryHelpers.JsonSchemaToTableSchema) BigQueryHelpers.toJsonString(org.apache.beam.sdk.io.gcp.bigquery.BigQueryHelpers.toJsonString) KV(org.apache.beam.sdk.values.KV) TableFieldSchema(com.google.api.services.bigquery.model.TableFieldSchema) TestPipeline(org.apache.beam.sdk.testing.TestPipeline) Pipeline(org.apache.beam.sdk.Pipeline) BigQueryHelpers.createTempTableReference(org.apache.beam.sdk.io.gcp.bigquery.BigQueryHelpers.createTempTableReference) TableReference(com.google.api.services.bigquery.model.TableReference) DoFn(org.apache.beam.sdk.transforms.DoFn) TableRow(com.google.api.services.bigquery.model.TableRow) Test(org.junit.Test)

Example 33 with Pipeline

use of org.apache.beam.sdk.Pipeline in project beam by apache.

the class BigQueryIOTest method testWriteWithDynamicTables.

public void testWriteWithDynamicTables(boolean streaming) throws Exception {
    BigQueryOptions bqOptions = TestPipeline.testingPipelineOptions().as(BigQueryOptions.class);
    bqOptions.setProject("defaultproject");
    bqOptions.setTempLocation(testFolder.newFolder("BigQueryIOTest").getAbsolutePath());
    FakeDatasetService datasetService = new FakeDatasetService();
    datasetService.createDataset("project-id", "dataset-id", "", "");
    FakeBigQueryServices fakeBqServices = new FakeBigQueryServices().withDatasetService(datasetService).withJobService(new FakeJobService());
    List<Integer> inserts = new ArrayList<>();
    for (int i = 0; i < 10; i++) {
        inserts.add(i);
    }
    // Create a windowing strategy that puts the input into five different windows depending on
    // record value.
    WindowFn<Integer, PartitionedGlobalWindow> windowFn = new PartitionedGlobalWindows(new SerializableFunction<Integer, String>() {

        @Override
        public String apply(Integer i) {
            return Integer.toString(i % 5);
        }
    });
    final Map<Integer, TableDestination> targetTables = Maps.newHashMap();
    Map<String, String> schemas = Maps.newHashMap();
    for (int i = 0; i < 5; i++) {
        TableDestination destination = new TableDestination("project-id:dataset-id" + ".table-id-" + i, "");
        targetTables.put(i, destination);
        // Make sure each target table has its own custom table.
        schemas.put(destination.getTableSpec(), BigQueryHelpers.toJsonString(new TableSchema().setFields(ImmutableList.of(new TableFieldSchema().setName("name").setType("STRING"), new TableFieldSchema().setName("number").setType("INTEGER"), new TableFieldSchema().setName("custom_" + i).setType("STRING")))));
    }
    SerializableFunction<ValueInSingleWindow<Integer>, TableDestination> tableFunction = new SerializableFunction<ValueInSingleWindow<Integer>, TableDestination>() {

        @Override
        public TableDestination apply(ValueInSingleWindow<Integer> input) {
            PartitionedGlobalWindow window = (PartitionedGlobalWindow) input.getWindow();
            // Check that we can access the element as well here and that it matches the window.
            checkArgument(window.value.equals(Integer.toString(input.getValue() % 5)), "Incorrect element");
            return targetTables.get(input.getValue() % 5);
        }
    };
    Pipeline p = TestPipeline.create(bqOptions);
    PCollection<Integer> input = p.apply("CreateSource", Create.of(inserts));
    if (streaming) {
        input = input.setIsBoundedInternal(PCollection.IsBounded.UNBOUNDED);
    }
    PCollectionView<Map<String, String>> schemasView = p.apply("CreateSchemaMap", Create.of(schemas)).apply("ViewSchemaAsMap", View.<String, String>asMap());
    input.apply(Window.<Integer>into(windowFn)).apply(BigQueryIO.<Integer>write().to(tableFunction).withFormatFunction(new SerializableFunction<Integer, TableRow>() {

        @Override
        public TableRow apply(Integer i) {
            return new TableRow().set("name", "number" + i).set("number", i);
        }
    }).withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED).withSchemaFromView(schemasView).withTestServices(fakeBqServices).withoutValidation());
    p.run();
    for (int i = 0; i < 5; ++i) {
        String tableId = String.format("table-id-%d", i);
        String tableSpec = String.format("project-id:dataset-id.%s", tableId);
        // Verify that table was created with the correct schema.
        assertThat(BigQueryHelpers.toJsonString(datasetService.getTable(new TableReference().setProjectId("project-id").setDatasetId("dataset-id").setTableId(tableId)).getSchema()), equalTo(schemas.get(tableSpec)));
        // Verify that the table has the expected contents.
        assertThat(datasetService.getAllRows("project-id", "dataset-id", tableId), containsInAnyOrder(new TableRow().set("name", String.format("number%d", i)).set("number", i), new TableRow().set("name", String.format("number%d", i + 5)).set("number", i + 5)));
    }
}

Also used : TableSchema(com.google.api.services.bigquery.model.TableSchema) JsonSchemaToTableSchema(org.apache.beam.sdk.io.gcp.bigquery.BigQueryHelpers.JsonSchemaToTableSchema) SerializableFunction(org.apache.beam.sdk.transforms.SerializableFunction) ArrayList(java.util.ArrayList) BigQueryHelpers.toJsonString(org.apache.beam.sdk.io.gcp.bigquery.BigQueryHelpers.toJsonString) TableFieldSchema(com.google.api.services.bigquery.model.TableFieldSchema) BigQueryHelpers.createTempTableReference(org.apache.beam.sdk.io.gcp.bigquery.BigQueryHelpers.createTempTableReference) TableReference(com.google.api.services.bigquery.model.TableReference) TestPipeline(org.apache.beam.sdk.testing.TestPipeline) Pipeline(org.apache.beam.sdk.Pipeline) TableRow(com.google.api.services.bigquery.model.TableRow) ValueInSingleWindow(org.apache.beam.sdk.values.ValueInSingleWindow) Map(java.util.Map) ImmutableMap(com.google.common.collect.ImmutableMap)

Example 34 with Pipeline

use of org.apache.beam.sdk.Pipeline in project beam by apache.

the class BigQueryIOTest method testBuildSourceWithTableAndFlatten.

@Test
public void testBuildSourceWithTableAndFlatten() {
    BigQueryOptions bqOptions = TestPipeline.testingPipelineOptions().as(BigQueryOptions.class);
    bqOptions.setProject("defaultproject");
    bqOptions.setTempLocation("gs://testbucket/testdir");
    Pipeline p = TestPipeline.create(bqOptions);
    thrown.expect(IllegalStateException.class);
    thrown.expectMessage("Invalid BigQueryIO.Read: Specifies a table with a result flattening preference," + " which only applies to queries");
    p.apply("ReadMyTable", BigQueryIO.read().from("foo.com:project:somedataset.sometable").withoutResultFlattening());
    p.run();
}

Also used : TestPipeline(org.apache.beam.sdk.testing.TestPipeline) Pipeline(org.apache.beam.sdk.Pipeline) Test(org.junit.Test)

Example 35 with Pipeline

use of org.apache.beam.sdk.Pipeline in project beam by apache.

the class BigQueryIOTest method testWriteUnknown.

@Test
public void testWriteUnknown() throws Exception {
    BigQueryOptions bqOptions = TestPipeline.testingPipelineOptions().as(BigQueryOptions.class);
    bqOptions.setProject("defaultproject");
    bqOptions.setTempLocation(testFolder.newFolder("BigQueryIOTest").getAbsolutePath());
    FakeDatasetService datasetService = new FakeDatasetService();
    FakeBigQueryServices fakeBqServices = new FakeBigQueryServices().withJobService(new FakeJobService()).withDatasetService(datasetService);
    datasetService.createDataset("project-id", "dataset-id", "", "");
    Pipeline p = TestPipeline.create(bqOptions);
    p.apply(Create.of(new TableRow().set("name", "a").set("number", 1), new TableRow().set("name", "b").set("number", 2), new TableRow().set("name", "c").set("number", 3)).withCoder(TableRowJsonCoder.of())).apply(BigQueryIO.writeTableRows().to("project-id:dataset-id.table-id").withCreateDisposition(CreateDisposition.CREATE_NEVER).withTestServices(fakeBqServices).withoutValidation());
    thrown.expect(RuntimeException.class);
    thrown.expectMessage("Failed to create load job");
    try {
        p.run();
    } finally {
        File tempDir = new File(bqOptions.getTempLocation());
        testNumFiles(tempDir, 0);
    }
}

Also used : TableRow(com.google.api.services.bigquery.model.TableRow) File(java.io.File) TestPipeline(org.apache.beam.sdk.testing.TestPipeline) Pipeline(org.apache.beam.sdk.Pipeline) Test(org.junit.Test)

Aggregations

Pipeline (org.apache.beam.sdk.Pipeline)184 Test (org.junit.Test)123 TestPipeline (org.apache.beam.sdk.testing.TestPipeline)86 DataflowPipelineOptions (org.apache.beam.runners.dataflow.options.DataflowPipelineOptions)39 KV (org.apache.beam.sdk.values.KV)35 Job (com.google.api.services.dataflow.model.Job)26 DoFn (org.apache.beam.sdk.transforms.DoFn)24 PipelineOptions (org.apache.beam.sdk.options.PipelineOptions)22 DataflowPackage (com.google.api.services.dataflow.model.DataflowPackage)21 TableRow (com.google.api.services.bigquery.model.TableRow)16 PipelineResult (org.apache.beam.sdk.PipelineResult)14 Structs.getString (org.apache.beam.runners.dataflow.util.Structs.getString)13 TableSchema (com.google.api.services.bigquery.model.TableSchema)12 ApexPipelineOptions (org.apache.beam.runners.apex.ApexPipelineOptions)12 Map (java.util.Map)11 TableFieldSchema (com.google.api.services.bigquery.model.TableFieldSchema)10 ArrayList (java.util.ArrayList)10 Instant (org.joda.time.Instant)10 TableReference (com.google.api.services.bigquery.model.TableReference)9 JsonSchemaToTableSchema (org.apache.beam.sdk.io.gcp.bigquery.BigQueryHelpers.JsonSchemaToTableSchema)9