Search in sources :

Example 36 with TableFieldSchema

use of com.google.api.services.bigquery.model.TableFieldSchema in project beam by apache.

the class BigQueryIOReadTest method checkSetsProject.

private void checkSetsProject(String projectId) throws Exception {
    fakeDatasetService.createDataset(projectId, "dataset-id", "", "", null);
    String tableId = "sometable";
    TableReference tableReference = new TableReference().setProjectId(projectId).setDatasetId("dataset-id").setTableId(tableId);
    fakeDatasetService.createTable(new Table().setTableReference(tableReference).setSchema(new TableSchema().setFields(ImmutableList.of(new TableFieldSchema().setName("name").setType("STRING"), new TableFieldSchema().setName("number").setType("INTEGER")))));
    FakeBigQueryServices fakeBqServices = new FakeBigQueryServices().withJobService(new FakeJobService()).withDatasetService(fakeDatasetService);
    List<TableRow> expected = ImmutableList.of(new TableRow().set("name", "a").set("number", 1L), new TableRow().set("name", "b").set("number", 2L), new TableRow().set("name", "c").set("number", 3L), new TableRow().set("name", "d").set("number", 4L), new TableRow().set("name", "e").set("number", 5L), new TableRow().set("name", "f").set("number", 6L));
    fakeDatasetService.insertAll(tableReference, expected, null);
    TableReference tableRef = new TableReference().setDatasetId("dataset-id").setTableId(tableId);
    PCollection<KV<String, Long>> output = p.apply(BigQueryIO.read().from(tableRef).withTestServices(fakeBqServices)).apply(ParDo.of(new DoFn<TableRow, KV<String, Long>>() {

        @ProcessElement
        public void processElement(ProcessContext c) throws Exception {
            c.output(KV.of((String) c.element().get("name"), Long.valueOf((String) c.element().get("number"))));
        }
    }));
    PAssert.that(output).containsInAnyOrder(ImmutableList.of(KV.of("a", 1L), KV.of("b", 2L), KV.of("c", 3L), KV.of("d", 4L), KV.of("e", 5L), KV.of("f", 6L)));
    p.run();
}
Also used : Table(com.google.api.services.bigquery.model.Table) TableSchema(com.google.api.services.bigquery.model.TableSchema) ByteString(com.google.protobuf.ByteString) KV(org.apache.beam.sdk.values.KV) TableFieldSchema(com.google.api.services.bigquery.model.TableFieldSchema) TableReference(com.google.api.services.bigquery.model.TableReference) BigQueryResourceNaming.createTempTableReference(org.apache.beam.sdk.io.gcp.bigquery.BigQueryResourceNaming.createTempTableReference) DoFn(org.apache.beam.sdk.transforms.DoFn) FakeJobService(org.apache.beam.sdk.io.gcp.testing.FakeJobService) TableRow(com.google.api.services.bigquery.model.TableRow) FakeBigQueryServices(org.apache.beam.sdk.io.gcp.testing.FakeBigQueryServices)

Example 37 with TableFieldSchema

use of com.google.api.services.bigquery.model.TableFieldSchema in project beam by apache.

the class BigQueryIOReadTest method testReadTableWithSchema.

@Test
public void testReadTableWithSchema() throws IOException, InterruptedException {
    // setup
    Table someTable = new Table();
    someTable.setSchema(new TableSchema().setFields(ImmutableList.of(new TableFieldSchema().setName("name").setType("STRING"), new TableFieldSchema().setName("number").setType("INTEGER"))));
    someTable.setTableReference(new TableReference().setProjectId("non-executing-project").setDatasetId("schema_dataset").setTableId("schema_table"));
    someTable.setNumBytes(1024L * 1024L);
    FakeDatasetService fakeDatasetService = new FakeDatasetService();
    fakeDatasetService.createDataset("non-executing-project", "schema_dataset", "", "", null);
    fakeDatasetService.createTable(someTable);
    List<TableRow> records = Lists.newArrayList(new TableRow().set("name", "a").set("number", 1L), new TableRow().set("name", "b").set("number", 2L), new TableRow().set("name", "c").set("number", 3L));
    fakeDatasetService.insertAll(someTable.getTableReference(), records, null);
    FakeBigQueryServices fakeBqServices = new FakeBigQueryServices().withJobService(new FakeJobService()).withDatasetService(fakeDatasetService);
    // test
    BigQueryIO.TypedRead<TableRow> read = BigQueryIO.readTableRowsWithSchema().from("non-executing-project:schema_dataset.schema_table").withTestServices(fakeBqServices).withoutValidation();
    PCollection<TableRow> bqRows = p.apply(read);
    Schema expectedSchema = Schema.of(Schema.Field.of("name", Schema.FieldType.STRING).withNullable(true), Schema.Field.of("number", Schema.FieldType.INT64).withNullable(true));
    assertEquals(expectedSchema, bqRows.getSchema());
    PCollection<Row> output = bqRows.apply(Select.fieldNames("name", "number"));
    PAssert.that(output).containsInAnyOrder(ImmutableList.of(Row.withSchema(expectedSchema).addValues("a", 1L).build(), Row.withSchema(expectedSchema).addValues("b", 2L).build(), Row.withSchema(expectedSchema).addValues("c", 3L).build()));
    p.run();
}
Also used : Table(com.google.api.services.bigquery.model.Table) TableSchema(com.google.api.services.bigquery.model.TableSchema) TableSchema(com.google.api.services.bigquery.model.TableSchema) TableFieldSchema(com.google.api.services.bigquery.model.TableFieldSchema) Schema(org.apache.beam.sdk.schemas.Schema) TableFieldSchema(com.google.api.services.bigquery.model.TableFieldSchema) TableReference(com.google.api.services.bigquery.model.TableReference) BigQueryResourceNaming.createTempTableReference(org.apache.beam.sdk.io.gcp.bigquery.BigQueryResourceNaming.createTempTableReference) FakeDatasetService(org.apache.beam.sdk.io.gcp.testing.FakeDatasetService) FakeJobService(org.apache.beam.sdk.io.gcp.testing.FakeJobService) TableRow(com.google.api.services.bigquery.model.TableRow) FakeBigQueryServices(org.apache.beam.sdk.io.gcp.testing.FakeBigQueryServices) TableRow(com.google.api.services.bigquery.model.TableRow) Row(org.apache.beam.sdk.values.Row) Test(org.junit.Test)

Example 38 with TableFieldSchema

use of com.google.api.services.bigquery.model.TableFieldSchema in project beam by apache.

the class BigQueryIOReadTest method testReadFromTable.

private void testReadFromTable(boolean useTemplateCompatibility, boolean useReadTableRows) throws IOException, InterruptedException {
    Table sometable = new Table();
    sometable.setSchema(new TableSchema().setFields(ImmutableList.of(new TableFieldSchema().setName("name").setType("STRING"), new TableFieldSchema().setName("number").setType("INTEGER"))));
    sometable.setTableReference(new TableReference().setProjectId("non-executing-project").setDatasetId("somedataset").setTableId("sometable"));
    sometable.setNumBytes(1024L * 1024L);
    FakeDatasetService fakeDatasetService = new FakeDatasetService();
    fakeDatasetService.createDataset("non-executing-project", "somedataset", "", "", null);
    fakeDatasetService.createTable(sometable);
    List<TableRow> records = Lists.newArrayList(new TableRow().set("name", "a").set("number", 1L), new TableRow().set("name", "b").set("number", 2L), new TableRow().set("name", "c").set("number", 3L));
    fakeDatasetService.insertAll(sometable.getTableReference(), records, null);
    FakeBigQueryServices fakeBqServices = new FakeBigQueryServices().withJobService(new FakeJobService()).withDatasetService(fakeDatasetService);
    PTransform<PBegin, PCollection<TableRow>> readTransform;
    if (useReadTableRows) {
        BigQueryIO.Read read = BigQueryIO.read().from("non-executing-project:somedataset.sometable").withTestServices(fakeBqServices).withoutValidation();
        readTransform = useTemplateCompatibility ? read.withTemplateCompatibility() : read;
    } else {
        BigQueryIO.TypedRead<TableRow> read = BigQueryIO.readTableRows().from("non-executing-project:somedataset.sometable").withTestServices(fakeBqServices).withoutValidation();
        readTransform = useTemplateCompatibility ? read.withTemplateCompatibility() : read;
    }
    PCollection<KV<String, Long>> output = p.apply(readTransform).apply(ParDo.of(new DoFn<TableRow, KV<String, Long>>() {

        @ProcessElement
        public void processElement(ProcessContext c) throws Exception {
            c.output(KV.of((String) c.element().get("name"), Long.valueOf((String) c.element().get("number"))));
        }
    }));
    PAssert.that(output).containsInAnyOrder(ImmutableList.of(KV.of("a", 1L), KV.of("b", 2L), KV.of("c", 3L)));
    p.run();
}
Also used : Table(com.google.api.services.bigquery.model.Table) TableSchema(com.google.api.services.bigquery.model.TableSchema) KV(org.apache.beam.sdk.values.KV) ByteString(com.google.protobuf.ByteString) PBegin(org.apache.beam.sdk.values.PBegin) TableFieldSchema(com.google.api.services.bigquery.model.TableFieldSchema) PCollection(org.apache.beam.sdk.values.PCollection) TableReference(com.google.api.services.bigquery.model.TableReference) BigQueryResourceNaming.createTempTableReference(org.apache.beam.sdk.io.gcp.bigquery.BigQueryResourceNaming.createTempTableReference) FakeDatasetService(org.apache.beam.sdk.io.gcp.testing.FakeDatasetService) DoFn(org.apache.beam.sdk.transforms.DoFn) FakeJobService(org.apache.beam.sdk.io.gcp.testing.FakeJobService) TableRow(com.google.api.services.bigquery.model.TableRow) FakeBigQueryServices(org.apache.beam.sdk.io.gcp.testing.FakeBigQueryServices)

Example 39 with TableFieldSchema

use of com.google.api.services.bigquery.model.TableFieldSchema in project beam by apache.

the class BigQueryHllSketchCompatibilityIT method writeSketchToBigQuery.

private void writeSketchToBigQuery(List<String> testData, String expectedChecksum) {
    String tableSpec = String.format("%s.%s", DATASET_ID, SKETCH_TABLE_ID);
    String query = String.format("SELECT HLL_COUNT.EXTRACT(%s) FROM %s", SKETCH_FIELD_NAME, tableSpec);
    TableSchema tableSchema = new TableSchema().setFields(Collections.singletonList(new TableFieldSchema().setName(SKETCH_FIELD_NAME).setType(SKETCH_FIELD_TYPE)));
    TestPipelineOptions options = TestPipeline.testingPipelineOptions().as(TestPipelineOptions.class);
    Pipeline p = Pipeline.create(options);
    // until we have a stub class for BigQuery TableRow
    @SuppressWarnings("nullness") SerializableFunction<byte[], TableRow> formatFn = sketch -> new TableRow().set(SKETCH_FIELD_NAME, sketch.length == 0 ? null : sketch);
    p.apply(Create.of(testData).withType(TypeDescriptor.of(String.class))).apply(HllCount.Init.forStrings().globally()).apply(BigQueryIO.<byte[]>write().to(tableSpec).withSchema(tableSchema).withFormatFunction(formatFn).withWriteDisposition(BigQueryIO.Write.WriteDisposition.WRITE_TRUNCATE));
    p.run().waitUntilFinish();
    // BigqueryMatcher will send a query to retrieve the estimated count and verifies its
    // correctness using checksum.
    assertThat(createQueryUsingStandardSql(APP_NAME, PROJECT_ID, query), queryResultHasChecksum(expectedChecksum));
}
Also used : Arrays(java.util.Arrays) TestPipelineOptions(org.apache.beam.sdk.testing.TestPipelineOptions) TypeDescriptor(org.apache.beam.sdk.values.TypeDescriptor) BeforeClass(org.junit.BeforeClass) BigqueryMatcher.queryResultHasChecksum(org.apache.beam.sdk.io.gcp.testing.BigqueryMatcher.queryResultHasChecksum) Date(java.util.Date) RunWith(org.junit.runner.RunWith) SerializableFunction(org.apache.beam.sdk.transforms.SerializableFunction) SchemaAndRecord(org.apache.beam.sdk.io.gcp.bigquery.SchemaAndRecord) ByteBuffer(java.nio.ByteBuffer) Create(org.apache.beam.sdk.transforms.Create) Map(java.util.Map) TestPipeline(org.apache.beam.sdk.testing.TestPipeline) TableRow(com.google.api.services.bigquery.model.TableRow) TableSchema(com.google.api.services.bigquery.model.TableSchema) MatcherAssert.assertThat(org.hamcrest.MatcherAssert.assertThat) Pipeline(org.apache.beam.sdk.Pipeline) BigqueryClient(org.apache.beam.sdk.io.gcp.testing.BigqueryClient) TableReference(com.google.api.services.bigquery.model.TableReference) AfterClass(org.junit.AfterClass) TableFieldSchema(com.google.api.services.bigquery.model.TableFieldSchema) GcpOptions(org.apache.beam.sdk.extensions.gcp.options.GcpOptions) PAssert(org.apache.beam.sdk.testing.PAssert) BigQueryIO(org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO) ApplicationNameOptions(org.apache.beam.sdk.options.ApplicationNameOptions) Method(org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.TypedRead.Method) Test(org.junit.Test) JUnit4(org.junit.runners.JUnit4) PCollection(org.apache.beam.sdk.values.PCollection) Collectors(java.util.stream.Collectors) Table(com.google.api.services.bigquery.model.Table) DataFormat(com.google.cloud.bigquery.storage.v1.DataFormat) List(java.util.List) ByteArrayCoder(org.apache.beam.sdk.coders.ByteArrayCoder) BigqueryMatcher.createQueryUsingStandardSql(org.apache.beam.sdk.io.gcp.testing.BigqueryMatcher.createQueryUsingStandardSql) Collections(java.util.Collections) TableSchema(com.google.api.services.bigquery.model.TableSchema) TableRow(com.google.api.services.bigquery.model.TableRow) TableFieldSchema(com.google.api.services.bigquery.model.TableFieldSchema) TestPipelineOptions(org.apache.beam.sdk.testing.TestPipelineOptions) TestPipeline(org.apache.beam.sdk.testing.TestPipeline) Pipeline(org.apache.beam.sdk.Pipeline)

Example 40 with TableFieldSchema

use of com.google.api.services.bigquery.model.TableFieldSchema in project beam by apache.

the class BigQueryTornadoes method applyBigQueryTornadoes.

public static void applyBigQueryTornadoes(Pipeline p, Options options) {
    // Build the table schema for the output table.
    List<TableFieldSchema> fields = new ArrayList<>();
    fields.add(new TableFieldSchema().setName("month").setType("INTEGER"));
    fields.add(new TableFieldSchema().setName("tornado_count").setType("INTEGER"));
    TableSchema schema = new TableSchema().setFields(fields);
    TypedRead<TableRow> bigqueryIO;
    if (!options.getInputQuery().isEmpty()) {
        bigqueryIO = BigQueryIO.readTableRows().fromQuery(options.getInputQuery()).usingStandardSql().withMethod(options.getReadMethod());
    } else {
        bigqueryIO = BigQueryIO.readTableRows().from(options.getInput()).withMethod(options.getReadMethod());
        // when reading directly from a table.
        if (options.getReadMethod() == TypedRead.Method.DIRECT_READ) {
            bigqueryIO = bigqueryIO.withSelectedFields(Lists.newArrayList("month", "tornado"));
        }
    }
    PCollection<TableRow> rowsFromBigQuery = p.apply(bigqueryIO);
    rowsFromBigQuery.apply(new CountTornadoes()).apply(BigQueryIO.writeTableRows().to(options.getOutput()).withSchema(schema).withCreateDisposition(options.getCreateDisposition()).withWriteDisposition(options.getWriteDisposition()).withMethod(options.getWriteMethod()));
}
Also used : TableSchema(com.google.api.services.bigquery.model.TableSchema) TableRow(com.google.api.services.bigquery.model.TableRow) ArrayList(java.util.ArrayList) TableFieldSchema(com.google.api.services.bigquery.model.TableFieldSchema)

Aggregations

TableFieldSchema (com.google.api.services.bigquery.model.TableFieldSchema)80 TableSchema (com.google.api.services.bigquery.model.TableSchema)71 TableRow (com.google.api.services.bigquery.model.TableRow)56 Test (org.junit.Test)45 Table (com.google.api.services.bigquery.model.Table)25 TableReference (com.google.api.services.bigquery.model.TableReference)23 ArrayList (java.util.ArrayList)17 BigQueryHelpers.toJsonString (org.apache.beam.sdk.io.gcp.bigquery.BigQueryHelpers.toJsonString)16 List (java.util.List)15 Map (java.util.Map)15 PipelineOptions (org.apache.beam.sdk.options.PipelineOptions)14 TestPipeline (org.apache.beam.sdk.testing.TestPipeline)13 Pipeline (org.apache.beam.sdk.Pipeline)12 ByteString (com.google.protobuf.ByteString)10 JsonSchemaToTableSchema (org.apache.beam.sdk.io.gcp.bigquery.BigQueryHelpers.JsonSchemaToTableSchema)10 Write (org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write)10 Method (org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.Method)10 BigQueryResourceNaming.createTempTableReference (org.apache.beam.sdk.io.gcp.bigquery.BigQueryResourceNaming.createTempTableReference)9 FakeBigQueryServices (org.apache.beam.sdk.io.gcp.testing.FakeBigQueryServices)9 ErrorProto (com.google.api.services.bigquery.model.ErrorProto)8