Search in sources :

Example 1 with TableFieldSchema

use of com.google.api.services.bigquery.model.TableFieldSchema in project beam by apache.

the class BigQueryIOTest method testBigQueryQuerySourceInitSplit.

/**
 * Verifies reading and splitting of a {@code BigQueryQuerySource} against fake BigQuery services.
 *
 * <p>A temp table with a (name, number) schema and a dry-run query result are registered with
 * the fakes. The test then checks that the source reads back the rows encoded in the query, that
 * split-at-fraction behaves consistently, and that {@code split(100, options)} returns two
 * sources, the first of which is a {@code TransformingSource}.
 *
 * @throws Exception if any of the fake services or source utilities fail
 */
@Test
public void testBigQueryQuerySourceInitSplit() throws Exception {
    FakeJobService fakeJobService = new FakeJobService();
    FakeDatasetService fakeDatasetService = new FakeDatasetService();
    FakeBigQueryServices fakeBqServices = new FakeBigQueryServices().withJobService(fakeJobService).withDatasetService(fakeDatasetService);
    List<TableRow> expected = ImmutableList.of(new TableRow().set("name", "a").set("number", 1L), new TableRow().set("name", "b").set("number", 2L), new TableRow().set("name", "c").set("number", 3L), new TableRow().set("name", "d").set("number", 4L), new TableRow().set("name", "e").set("number", 5L), new TableRow().set("name", "f").set("number", 6L));
    PipelineOptions options = PipelineOptionsFactory.create();
    BigQueryOptions bqOptions = options.as(BigQueryOptions.class);
    bqOptions.setProject("project");
    String stepUuid = "testStepUuid";
    // The temp table reference is derived from the job name and step UUID; the dataset and
    // table are created up front in the fake dataset service.
    TableReference tempTableReference = createTempTableReference(bqOptions.getProject(), createJobIdToken(bqOptions.getJobName(), stepUuid));
    fakeDatasetService.createDataset(bqOptions.getProject(), tempTableReference.getDatasetId(), "", "");
    fakeDatasetService.createTable(new Table().setTableReference(tempTableReference).setSchema(new TableSchema().setFields(ImmutableList.of(new TableFieldSchema().setName("name").setType("STRING"), new TableFieldSchema().setName("number").setType("INTEGER")))));
    Path baseDir = Files.createTempDirectory(tempFolder, "testBigQueryQuerySourceInitSplit");
    // The expected rows are encoded into the query string handed to the fake services.
    String query = FakeBigQueryServices.encodeQuery(expected);
    BoundedSource<TableRow> bqSource = BigQueryQuerySource.create(
        stepUuid,
        StaticValueProvider.of(query),
        true /* flattenResults */,
        true /* useLegacySql */,
        fakeBqServices);
    options.setTempLocation(baseDir.toString());
    TableReference queryTable = new TableReference().setProjectId(bqOptions.getProject()).setDatasetId(tempTableReference.getDatasetId()).setTableId(tempTableReference.getTableId());
    // Register the statistics the fake job service should return for the dry-run of this query.
    fakeJobService.expectDryRunQuery(bqOptions.getProject(), query, new JobStatistics().setQuery(new JobStatistics2().setTotalBytesProcessed(100L).setReferencedTables(ImmutableList.of(queryTable))));
    List<TableRow> read = SourceTestUtils.readFromSource(bqSource, options);
    assertThat(read, containsInAnyOrder(Iterables.toArray(expected, TableRow.class)));
    SourceTestUtils.assertSplitAtFractionBehavior(bqSource, 2, 0.3, ExpectedSplitOutcome.MUST_BE_CONSISTENT_IF_SUCCEEDS, options);
    List<? extends BoundedSource<TableRow>> sources = bqSource.split(100, options);
    assertEquals(2, sources.size());
    BoundedSource<TableRow> actual = sources.get(0);
    assertThat(actual, CoreMatchers.instanceOf(TransformingSource.class));
}
Also used : Path(java.nio.file.Path) JobStatistics(com.google.api.services.bigquery.model.JobStatistics) JobStatistics2(com.google.api.services.bigquery.model.JobStatistics2) HashBasedTable(com.google.common.collect.HashBasedTable) Table(com.google.api.services.bigquery.model.Table) JobStatistics4(com.google.api.services.bigquery.model.JobStatistics4) TableSchema(com.google.api.services.bigquery.model.TableSchema) JsonSchemaToTableSchema(org.apache.beam.sdk.io.gcp.bigquery.BigQueryHelpers.JsonSchemaToTableSchema) BigQueryHelpers.toJsonString(org.apache.beam.sdk.io.gcp.bigquery.BigQueryHelpers.toJsonString) TableFieldSchema(com.google.api.services.bigquery.model.TableFieldSchema) JobStatus(com.google.api.services.bigquery.model.JobStatus) BigQueryHelpers.createTempTableReference(org.apache.beam.sdk.io.gcp.bigquery.BigQueryHelpers.createTempTableReference) TableReference(com.google.api.services.bigquery.model.TableReference) PipelineOptions(org.apache.beam.sdk.options.PipelineOptions) TableRow(com.google.api.services.bigquery.model.TableRow) Job(com.google.api.services.bigquery.model.Job) Test(org.junit.Test)

Example 2 with TableFieldSchema

use of com.google.api.services.bigquery.model.TableFieldSchema in project beam by apache.

the class BigQueryIOTest method testBigQueryTableSourceInitSplit.

/**
 * Exercises a {@code BigQueryTableSource} backed by fake BigQuery services.
 *
 * <p>Six rows are inserted into a fake table, read back through the source, and the source is
 * split twice. The test asserts that both splits yield two sources, that the first source is a
 * {@code TransformingSource}, and that the fake job service recorded exactly one extract job call.
 *
 * @throws Exception if any of the fake services or source utilities fail
 */
@Test
public void testBigQueryTableSourceInitSplit() throws Exception {
    FakeDatasetService datasetService = new FakeDatasetService();
    FakeJobService jobService = new FakeJobService();
    FakeBigQueryServices services =
        new FakeBigQueryServices()
            .withJobService(jobService)
            .withDatasetService(datasetService);

    List<TableRow> expected =
        ImmutableList.of(
            new TableRow().set("name", "a").set("number", 1L),
            new TableRow().set("name", "b").set("number", 2L),
            new TableRow().set("name", "c").set("number", 3L),
            new TableRow().set("name", "d").set("number", 4L),
            new TableRow().set("name", "e").set("number", 5L),
            new TableRow().set("name", "f").set("number", 6L));

    // Create the dataset and table in the fake service, then populate the table.
    TableReference tableRef = BigQueryHelpers.parseTableSpec("project:data_set.table_name");
    datasetService.createDataset("project", "data_set", "", "");
    TableSchema schema =
        new TableSchema()
            .setFields(
                ImmutableList.of(
                    new TableFieldSchema().setName("name").setType("STRING"),
                    new TableFieldSchema().setName("number").setType("INTEGER")));
    datasetService.createTable(new Table().setTableReference(tableRef).setSchema(schema));
    datasetService.insertAll(tableRef, expected, null);

    Path tempDir = Files.createTempDirectory(tempFolder, "testBigQueryTableSourceInitSplit");
    String stepUuid = "testStepUuid";
    BoundedSource<TableRow> source =
        BigQueryTableSource.create(stepUuid, StaticValueProvider.of(tableRef), services);

    PipelineOptions options = PipelineOptionsFactory.create();
    options.setTempLocation(tempDir.toString());
    options.as(BigQueryOptions.class).setProject("project");

    // Reading the source must return exactly the inserted rows, in any order.
    List<TableRow> actualRows = SourceTestUtils.readFromSource(source, options);
    TableRow[] expectedRows = Iterables.toArray(expected, TableRow.class);
    assertThat(actualRows, containsInAnyOrder(expectedRows));
    SourceTestUtils.assertSplitAtFractionBehavior(
        source, 2, 0.3, ExpectedSplitOutcome.MUST_BE_CONSISTENT_IF_SUCCEEDS, options);

    List<? extends BoundedSource<TableRow>> splits = source.split(100, options);
    assertEquals(2, splits.size());

    // Simulate a repeated call to split(), like a Dataflow worker will sometimes do.
    splits = source.split(200, options);
    assertEquals(2, splits.size());
    BoundedSource<TableRow> firstSplit = splits.get(0);
    assertThat(firstSplit, CoreMatchers.instanceOf(TransformingSource.class));

    // A repeated call to split() should not have caused a duplicate extract job.
    assertEquals(1, jobService.getNumExtractJobCalls());
}
Also used : Path(java.nio.file.Path) HashBasedTable(com.google.common.collect.HashBasedTable) Table(com.google.api.services.bigquery.model.Table) TableSchema(com.google.api.services.bigquery.model.TableSchema) JsonSchemaToTableSchema(org.apache.beam.sdk.io.gcp.bigquery.BigQueryHelpers.JsonSchemaToTableSchema) BigQueryHelpers.toJsonString(org.apache.beam.sdk.io.gcp.bigquery.BigQueryHelpers.toJsonString) TableFieldSchema(com.google.api.services.bigquery.model.TableFieldSchema) BigQueryHelpers.createTempTableReference(org.apache.beam.sdk.io.gcp.bigquery.BigQueryHelpers.createTempTableReference) TableReference(com.google.api.services.bigquery.model.TableReference) PipelineOptions(org.apache.beam.sdk.options.PipelineOptions) TableRow(com.google.api.services.bigquery.model.TableRow) Test(org.junit.Test)

Example 3 with TableFieldSchema

use of com.google.api.services.bigquery.model.TableFieldSchema in project beam by apache.

the class BigQueryIOTest method testRetryPolicy.

/**
 * Checks which rows end up in the failed-inserts output when the write is configured with
 * {@code InsertRetryPolicy.retryTransientErrors()}.
 *
 * <p>The fake dataset service is told to fail row1 with two "timeout" errors and row2 with two
 * "timeout" errors followed by an "invalidQuery" error. The test expects row2 in the failed
 * collection and row1 and row3 in the table.
 *
 * @throws Exception if the temp folder cannot be created or the fake services fail
 */
@Test
public void testRetryPolicy() throws Exception {
    BigQueryOptions bqOptions = TestPipeline.testingPipelineOptions().as(BigQueryOptions.class);
    bqOptions.setProject("project-id");
    bqOptions.setTempLocation(testFolder.newFolder("BigQueryIOTest").getAbsolutePath());

    FakeDatasetService datasetService = new FakeDatasetService();
    FakeBigQueryServices fakeBqServices =
        new FakeBigQueryServices()
            .withJobService(new FakeJobService())
            .withDatasetService(datasetService);
    datasetService.createDataset("project-id", "dataset-id", "", "");

    TableRow row1 = new TableRow().set("name", "a").set("number", "1");
    TableRow row2 = new TableRow().set("name", "b").set("number", "2");
    TableRow row3 = new TableRow().set("name", "c").set("number", "3");

    // Two kinds of injected insert error, distinguished only by their reason string.
    TableDataInsertAllResponse.InsertErrors ephemeralError =
        new TableDataInsertAllResponse.InsertErrors()
            .setErrors(ImmutableList.of(new ErrorProto().setReason("timeout")));
    TableDataInsertAllResponse.InsertErrors persistentError =
        new TableDataInsertAllResponse.InsertErrors()
            .setErrors(ImmutableList.of(new ErrorProto().setReason("invalidQuery")));

    datasetService.failOnInsert(
        ImmutableMap.<TableRow, List<TableDataInsertAllResponse.InsertErrors>>of(
            row1, ImmutableList.of(ephemeralError, ephemeralError),
            row2, ImmutableList.of(ephemeralError, ephemeralError, persistentError)));

    Pipeline p = TestPipeline.create(bqOptions);
    TableSchema schema =
        new TableSchema()
            .setFields(
                ImmutableList.of(
                    new TableFieldSchema().setName("name").setType("STRING"),
                    new TableFieldSchema().setName("number").setType("INTEGER")));
    // The created collection is explicitly marked unbounded before it is written.
    PCollection<TableRow> unboundedRows =
        p.apply(Create.of(row1, row2, row3)).setIsBoundedInternal(IsBounded.UNBOUNDED);
    PCollection<TableRow> failedRows =
        unboundedRows
            .apply(
                BigQueryIO.writeTableRows()
                    .to("project-id:dataset-id.table-id")
                    .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
                    .withSchema(schema)
                    .withFailedInsertRetryPolicy(InsertRetryPolicy.retryTransientErrors())
                    .withTestServices(fakeBqServices)
                    .withoutValidation())
            .getFailedInserts();

    // row2 finally fails with a non-retryable error, so we expect to see it in the collection of
    // failed rows.
    PAssert.that(failedRows).containsInAnyOrder(row2);
    p.run();

    // Only row1 and row3 were successfully inserted.
    assertThat(
        datasetService.getAllRows("project-id", "dataset-id", "table-id"),
        containsInAnyOrder(row1, row3));
}
Also used : ErrorProto(com.google.api.services.bigquery.model.ErrorProto) TableSchema(com.google.api.services.bigquery.model.TableSchema) JsonSchemaToTableSchema(org.apache.beam.sdk.io.gcp.bigquery.BigQueryHelpers.JsonSchemaToTableSchema) TableDataInsertAllResponse(com.google.api.services.bigquery.model.TableDataInsertAllResponse) TableFieldSchema(com.google.api.services.bigquery.model.TableFieldSchema) TestPipeline(org.apache.beam.sdk.testing.TestPipeline) Pipeline(org.apache.beam.sdk.Pipeline) TableRow(com.google.api.services.bigquery.model.TableRow) Test(org.junit.Test)

Example 4 with TableFieldSchema

use of com.google.api.services.bigquery.model.TableFieldSchema in project beam by apache.

the class BigQueryIOTest method testBigQueryNoTableQuerySourceInitSplit.

/**
 * Verifies reading and splitting of a {@code BigQueryQuerySource} against fake BigQuery services,
 * with the read rows passed through {@code convertBigDecimaslToLong} before comparison.
 *
 * <p>A temp table with a (name, number) schema and a dry-run query result referencing that table
 * are registered with the fakes. The test then checks that the converted rows match the rows
 * encoded in the query, that split-at-fraction behaves consistently, and that
 * {@code split(100, options)} returns two sources, the first of which is a
 * {@code TransformingSource}.
 *
 * @throws Exception if any of the fake services or source utilities fail
 */
@Test
public void testBigQueryNoTableQuerySourceInitSplit() throws Exception {
    FakeDatasetService datasetService = new FakeDatasetService();
    FakeJobService jobService = new FakeJobService();
    FakeBigQueryServices fakeBqServices = new FakeBigQueryServices().withJobService(jobService).withDatasetService(datasetService);
    PipelineOptions options = PipelineOptionsFactory.create();
    BigQueryOptions bqOptions = options.as(BigQueryOptions.class);
    bqOptions.setProject("project");
    String stepUuid = "testStepUuid";
    // The temp table reference is derived from the job name and step UUID.
    TableReference tempTableReference = createTempTableReference(bqOptions.getProject(), createJobIdToken(bqOptions.getJobName(), stepUuid));
    List<TableRow> expected = ImmutableList.of(new TableRow().set("name", "a").set("number", 1L), new TableRow().set("name", "b").set("number", 2L), new TableRow().set("name", "c").set("number", 3L), new TableRow().set("name", "d").set("number", 4L), new TableRow().set("name", "e").set("number", 5L), new TableRow().set("name", "f").set("number", 6L));
    datasetService.createDataset(tempTableReference.getProjectId(), tempTableReference.getDatasetId(), "", "");
    Table table = new Table().setTableReference(tempTableReference).setSchema(new TableSchema().setFields(ImmutableList.of(new TableFieldSchema().setName("name").setType("STRING"), new TableFieldSchema().setName("number").setType("INTEGER"))));
    datasetService.createTable(table);
    // The expected rows are encoded into the query string handed to the fake services.
    String query = FakeBigQueryServices.encodeQuery(expected);
    // Register the statistics the fake job service should return for the dry-run of this query.
    jobService.expectDryRunQuery("project", query, new JobStatistics().setQuery(new JobStatistics2().setTotalBytesProcessed(100L).setReferencedTables(ImmutableList.of(table.getTableReference()))));
    Path baseDir = Files.createTempDirectory(tempFolder, "testBigQueryNoTableQuerySourceInitSplit");
    BoundedSource<TableRow> bqSource = BigQueryQuerySource.create(
        stepUuid,
        StaticValueProvider.of(query),
        true /* flattenResults */,
        true /* useLegacySql */,
        fakeBqServices);
    options.setTempLocation(baseDir.toString());
    List<TableRow> read = convertBigDecimaslToLong(SourceTestUtils.readFromSource(bqSource, options));
    assertThat(read, containsInAnyOrder(Iterables.toArray(expected, TableRow.class)));
    SourceTestUtils.assertSplitAtFractionBehavior(bqSource, 2, 0.3, ExpectedSplitOutcome.MUST_BE_CONSISTENT_IF_SUCCEEDS, options);
    List<? extends BoundedSource<TableRow>> sources = bqSource.split(100, options);
    assertEquals(2, sources.size());
    BoundedSource<TableRow> actual = sources.get(0);
    assertThat(actual, CoreMatchers.instanceOf(TransformingSource.class));
}
Also used : Path(java.nio.file.Path) JobStatistics(com.google.api.services.bigquery.model.JobStatistics) JobStatistics2(com.google.api.services.bigquery.model.JobStatistics2) HashBasedTable(com.google.common.collect.HashBasedTable) Table(com.google.api.services.bigquery.model.Table) JobStatistics4(com.google.api.services.bigquery.model.JobStatistics4) TableSchema(com.google.api.services.bigquery.model.TableSchema) JsonSchemaToTableSchema(org.apache.beam.sdk.io.gcp.bigquery.BigQueryHelpers.JsonSchemaToTableSchema) BigQueryHelpers.toJsonString(org.apache.beam.sdk.io.gcp.bigquery.BigQueryHelpers.toJsonString) TableFieldSchema(com.google.api.services.bigquery.model.TableFieldSchema) JobStatus(com.google.api.services.bigquery.model.JobStatus) BigQueryHelpers.createTempTableReference(org.apache.beam.sdk.io.gcp.bigquery.BigQueryHelpers.createTempTableReference) TableReference(com.google.api.services.bigquery.model.TableReference) PipelineOptions(org.apache.beam.sdk.options.PipelineOptions) TableRow(com.google.api.services.bigquery.model.TableRow) Job(com.google.api.services.bigquery.model.Job) Test(org.junit.Test)

Example 5 with TableFieldSchema

use of com.google.api.services.bigquery.model.TableFieldSchema in project beam by apache.

the class BigQueryIOTest method testValidateReadSetsDefaultProject.

/**
 * Reads from a fake BigQuery table using a {@code TableReference} that carries only a dataset id
 * and table id, while the project id is set on the pipeline options.
 *
 * <p>The rows read are mapped to (name, number) pairs and compared against the six inserted rows.
 *
 * @throws Exception if the temp directory cannot be created or the fake services fail
 */
@Test
public void testValidateReadSetsDefaultProject() throws Exception {
    String projectId = "someproject";
    String datasetId = "somedataset";
    String tableId = "sometable";

    BigQueryOptions bqOptions = TestPipeline.testingPipelineOptions().as(BigQueryOptions.class);
    bqOptions.setProject(projectId);
    Path tempDir = Files.createTempDirectory(tempFolder, "testValidateReadSetsDefaultProject");
    bqOptions.setTempLocation(tempDir.toString());

    // Create the fully qualified table in the fake dataset service.
    FakeDatasetService datasetService = new FakeDatasetService();
    datasetService.createDataset(projectId, datasetId, "", "");
    TableReference fullTableRef =
        new TableReference().setProjectId(projectId).setDatasetId(datasetId).setTableId(tableId);
    TableSchema schema =
        new TableSchema()
            .setFields(
                ImmutableList.of(
                    new TableFieldSchema().setName("name").setType("STRING"),
                    new TableFieldSchema().setName("number").setType("INTEGER")));
    datasetService.createTable(new Table().setTableReference(fullTableRef).setSchema(schema));

    FakeBigQueryServices services =
        new FakeBigQueryServices()
            .withJobService(new FakeJobService())
            .withDatasetService(datasetService);

    List<TableRow> expected =
        ImmutableList.of(
            new TableRow().set("name", "a").set("number", 1L),
            new TableRow().set("name", "b").set("number", 2L),
            new TableRow().set("name", "c").set("number", 3L),
            new TableRow().set("name", "d").set("number", 4L),
            new TableRow().set("name", "e").set("number", 5L),
            new TableRow().set("name", "f").set("number", 6L));
    datasetService.insertAll(fullTableRef, expected, null);

    Pipeline p = TestPipeline.create(bqOptions);

    // The reference handed to the read deliberately has no project id set.
    TableReference projectlessRef = new TableReference().setDatasetId(datasetId).setTableId(tableId);

    PCollection<KV<String, Long>> output =
        p.apply(BigQueryIO.read().from(projectlessRef).withTestServices(services))
            .apply(
                ParDo.of(
                    new DoFn<TableRow, KV<String, Long>>() {

                        @ProcessElement
                        public void processElement(ProcessContext context) throws Exception {
                            TableRow row = context.element();
                            String name = (String) row.get("name");
                            Long number = Long.valueOf((String) row.get("number"));
                            context.output(KV.of(name, number));
                        }
                    }));

    List<KV<String, Long>> expectedPairs =
        ImmutableList.of(
            KV.of("a", 1L),
            KV.of("b", 2L),
            KV.of("c", 3L),
            KV.of("d", 4L),
            KV.of("e", 5L),
            KV.of("f", 6L));
    PAssert.that(output).containsInAnyOrder(expectedPairs);
    p.run();
}
Also used : Path(java.nio.file.Path) HashBasedTable(com.google.common.collect.HashBasedTable) Table(com.google.api.services.bigquery.model.Table) TableSchema(com.google.api.services.bigquery.model.TableSchema) JsonSchemaToTableSchema(org.apache.beam.sdk.io.gcp.bigquery.BigQueryHelpers.JsonSchemaToTableSchema) BigQueryHelpers.toJsonString(org.apache.beam.sdk.io.gcp.bigquery.BigQueryHelpers.toJsonString) KV(org.apache.beam.sdk.values.KV) TableFieldSchema(com.google.api.services.bigquery.model.TableFieldSchema) TestPipeline(org.apache.beam.sdk.testing.TestPipeline) Pipeline(org.apache.beam.sdk.Pipeline) BigQueryHelpers.createTempTableReference(org.apache.beam.sdk.io.gcp.bigquery.BigQueryHelpers.createTempTableReference) TableReference(com.google.api.services.bigquery.model.TableReference) DoFn(org.apache.beam.sdk.transforms.DoFn) TableRow(com.google.api.services.bigquery.model.TableRow) Test(org.junit.Test)

Aggregations

TableFieldSchema (com.google.api.services.bigquery.model.TableFieldSchema)80 TableSchema (com.google.api.services.bigquery.model.TableSchema)71 TableRow (com.google.api.services.bigquery.model.TableRow)56 Test (org.junit.Test)45 Table (com.google.api.services.bigquery.model.Table)25 TableReference (com.google.api.services.bigquery.model.TableReference)23 ArrayList (java.util.ArrayList)17 BigQueryHelpers.toJsonString (org.apache.beam.sdk.io.gcp.bigquery.BigQueryHelpers.toJsonString)16 List (java.util.List)15 Map (java.util.Map)15 PipelineOptions (org.apache.beam.sdk.options.PipelineOptions)14 TestPipeline (org.apache.beam.sdk.testing.TestPipeline)13 Pipeline (org.apache.beam.sdk.Pipeline)12 ByteString (com.google.protobuf.ByteString)10 JsonSchemaToTableSchema (org.apache.beam.sdk.io.gcp.bigquery.BigQueryHelpers.JsonSchemaToTableSchema)10 Write (org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write)10 Method (org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write.Method)10 BigQueryResourceNaming.createTempTableReference (org.apache.beam.sdk.io.gcp.bigquery.BigQueryResourceNaming.createTempTableReference)9 FakeBigQueryServices (org.apache.beam.sdk.io.gcp.testing.FakeBigQueryServices)9 ErrorProto (com.google.api.services.bigquery.model.ErrorProto)8