use of com.google.api.services.bigquery.model.TableSchema in project beam by apache.
the class FilterExamples method main.
public static void main(String[] args) throws Exception {
Options options = PipelineOptionsFactory.fromArgs(args).withValidation().as(Options.class);
Pipeline p = Pipeline.create(options);
TableSchema schema = buildWeatherSchemaProjection();
p.apply(BigQueryIO.read().from(options.getInput())).apply(ParDo.of(new ProjectionFn())).apply(new BelowGlobalMean(options.getMonthFilter())).apply(BigQueryIO.writeTableRows().to(options.getOutput()).withSchema(schema).withCreateDisposition(BigQueryIO.Write.CreateDisposition.CREATE_IF_NEEDED).withWriteDisposition(BigQueryIO.Write.WriteDisposition.WRITE_TRUNCATE));
p.run().waitUntilFinish();
}
use of com.google.api.services.bigquery.model.TableSchema in project beam by apache.
the class BigQueryAvroUtilsTest method testConvertGenericRecordToTableRow.
@Test
public void testConvertGenericRecordToTableRow() throws Exception {
TableSchema tableSchema = new TableSchema();
tableSchema.setFields(fields);
Schema avroSchema = AvroCoder.of(Bird.class).getSchema();
{
// Test nullable fields.
GenericRecord record = new GenericData.Record(avroSchema);
record.put("number", 5L);
TableRow convertedRow = BigQueryAvroUtils.convertGenericRecordToTableRow(record, tableSchema);
TableRow row = new TableRow().set("number", "5").set("associates", new ArrayList<TableRow>());
assertEquals(row, convertedRow);
}
{
// Test type conversion for:
// INTEGER, FLOAT, TIMESTAMP, BOOLEAN, BYTES, DATE, DATETIME, TIME.
GenericRecord record = new GenericData.Record(avroSchema);
byte[] soundBytes = "chirp,chirp".getBytes();
ByteBuffer soundByteBuffer = ByteBuffer.wrap(soundBytes);
soundByteBuffer.rewind();
record.put("number", 5L);
record.put("quality", 5.0);
record.put("birthday", 5L);
record.put("flighted", Boolean.TRUE);
record.put("sound", soundByteBuffer);
record.put("anniversaryDate", new Utf8("2000-01-01"));
record.put("anniversaryDatetime", new String("2000-01-01 00:00:00.000005"));
record.put("anniversaryTime", new Utf8("00:00:00.000005"));
TableRow convertedRow = BigQueryAvroUtils.convertGenericRecordToTableRow(record, tableSchema);
TableRow row = new TableRow().set("number", "5").set("birthday", "1970-01-01 00:00:00.000005 UTC").set("quality", 5.0).set("associates", new ArrayList<TableRow>()).set("flighted", Boolean.TRUE).set("sound", BaseEncoding.base64().encode(soundBytes)).set("anniversaryDate", "2000-01-01").set("anniversaryDatetime", "2000-01-01 00:00:00.000005").set("anniversaryTime", "00:00:00.000005");
assertEquals(row, convertedRow);
}
{
// Test repeated fields.
Schema subBirdSchema = AvroCoder.of(Bird.SubBird.class).getSchema();
GenericRecord nestedRecord = new GenericData.Record(subBirdSchema);
nestedRecord.put("species", "other");
GenericRecord record = new GenericData.Record(avroSchema);
record.put("number", 5L);
record.put("associates", Lists.<GenericRecord>newArrayList(nestedRecord));
TableRow convertedRow = BigQueryAvroUtils.convertGenericRecordToTableRow(record, tableSchema);
TableRow row = new TableRow().set("associates", Lists.<TableRow>newArrayList(new TableRow().set("species", "other"))).set("number", "5");
assertEquals(row, convertedRow);
}
}
use of com.google.api.services.bigquery.model.TableSchema in project beam by apache.
the class BigQueryIOTest method testBigQueryQuerySourceInitSplit.
@Test
public void testBigQueryQuerySourceInitSplit() throws Exception {
TableReference dryRunTable = new TableReference();
Job queryJob = new Job();
JobStatistics queryJobStats = new JobStatistics();
JobStatistics2 queryStats = new JobStatistics2();
queryStats.setReferencedTables(ImmutableList.of(dryRunTable));
queryJobStats.setQuery(queryStats);
queryJob.setStatus(new JobStatus()).setStatistics(queryJobStats);
Job extractJob = new Job();
JobStatistics extractJobStats = new JobStatistics();
JobStatistics4 extractStats = new JobStatistics4();
extractStats.setDestinationUriFileCounts(ImmutableList.of(1L));
extractJobStats.setExtract(extractStats);
extractJob.setStatus(new JobStatus()).setStatistics(extractJobStats);
FakeJobService fakeJobService = new FakeJobService();
FakeDatasetService fakeDatasetService = new FakeDatasetService();
FakeBigQueryServices fakeBqServices = new FakeBigQueryServices().withJobService(fakeJobService).withDatasetService(fakeDatasetService);
List<TableRow> expected = ImmutableList.of(new TableRow().set("name", "a").set("number", 1L), new TableRow().set("name", "b").set("number", 2L), new TableRow().set("name", "c").set("number", 3L), new TableRow().set("name", "d").set("number", 4L), new TableRow().set("name", "e").set("number", 5L), new TableRow().set("name", "f").set("number", 6L));
PipelineOptions options = PipelineOptionsFactory.create();
BigQueryOptions bqOptions = options.as(BigQueryOptions.class);
bqOptions.setProject("project");
String stepUuid = "testStepUuid";
TableReference tempTableReference = createTempTableReference(bqOptions.getProject(), createJobIdToken(bqOptions.getJobName(), stepUuid));
fakeDatasetService.createDataset(bqOptions.getProject(), tempTableReference.getDatasetId(), "", "");
fakeDatasetService.createTable(new Table().setTableReference(tempTableReference).setSchema(new TableSchema().setFields(ImmutableList.of(new TableFieldSchema().setName("name").setType("STRING"), new TableFieldSchema().setName("number").setType("INTEGER")))));
Path baseDir = Files.createTempDirectory(tempFolder, "testBigQueryQuerySourceInitSplit");
String query = FakeBigQueryServices.encodeQuery(expected);
BoundedSource<TableRow> bqSource = BigQueryQuerySource.create(stepUuid, StaticValueProvider.of(query), true, /* flattenResults */
true, /* useLegacySql */
fakeBqServices);
options.setTempLocation(baseDir.toString());
TableReference queryTable = new TableReference().setProjectId(bqOptions.getProject()).setDatasetId(tempTableReference.getDatasetId()).setTableId(tempTableReference.getTableId());
fakeJobService.expectDryRunQuery(bqOptions.getProject(), query, new JobStatistics().setQuery(new JobStatistics2().setTotalBytesProcessed(100L).setReferencedTables(ImmutableList.of(queryTable))));
List<TableRow> read = SourceTestUtils.readFromSource(bqSource, options);
assertThat(read, containsInAnyOrder(Iterables.toArray(expected, TableRow.class)));
SourceTestUtils.assertSplitAtFractionBehavior(bqSource, 2, 0.3, ExpectedSplitOutcome.MUST_BE_CONSISTENT_IF_SUCCEEDS, options);
List<? extends BoundedSource<TableRow>> sources = bqSource.split(100, options);
assertEquals(2, sources.size());
BoundedSource<TableRow> actual = sources.get(0);
assertThat(actual, CoreMatchers.instanceOf(TransformingSource.class));
}
use of com.google.api.services.bigquery.model.TableSchema in project beam by apache.
the class BigQueryIOTest method testBigQueryTableSourceInitSplit.
@Test
public void testBigQueryTableSourceInitSplit() throws Exception {
FakeDatasetService fakeDatasetService = new FakeDatasetService();
FakeJobService fakeJobService = new FakeJobService();
FakeBigQueryServices fakeBqServices = new FakeBigQueryServices().withJobService(fakeJobService).withDatasetService(fakeDatasetService);
List<TableRow> expected = ImmutableList.of(new TableRow().set("name", "a").set("number", 1L), new TableRow().set("name", "b").set("number", 2L), new TableRow().set("name", "c").set("number", 3L), new TableRow().set("name", "d").set("number", 4L), new TableRow().set("name", "e").set("number", 5L), new TableRow().set("name", "f").set("number", 6L));
TableReference table = BigQueryHelpers.parseTableSpec("project:data_set.table_name");
fakeDatasetService.createDataset("project", "data_set", "", "");
fakeDatasetService.createTable(new Table().setTableReference(table).setSchema(new TableSchema().setFields(ImmutableList.of(new TableFieldSchema().setName("name").setType("STRING"), new TableFieldSchema().setName("number").setType("INTEGER")))));
fakeDatasetService.insertAll(table, expected, null);
Path baseDir = Files.createTempDirectory(tempFolder, "testBigQueryTableSourceInitSplit");
String stepUuid = "testStepUuid";
BoundedSource<TableRow> bqSource = BigQueryTableSource.create(stepUuid, StaticValueProvider.of(table), fakeBqServices);
PipelineOptions options = PipelineOptionsFactory.create();
options.setTempLocation(baseDir.toString());
BigQueryOptions bqOptions = options.as(BigQueryOptions.class);
bqOptions.setProject("project");
List<TableRow> read = SourceTestUtils.readFromSource(bqSource, options);
assertThat(read, containsInAnyOrder(Iterables.toArray(expected, TableRow.class)));
SourceTestUtils.assertSplitAtFractionBehavior(bqSource, 2, 0.3, ExpectedSplitOutcome.MUST_BE_CONSISTENT_IF_SUCCEEDS, options);
List<? extends BoundedSource<TableRow>> sources = bqSource.split(100, options);
assertEquals(2, sources.size());
// Simulate a repeated call to split(), like a Dataflow worker will sometimes do.
sources = bqSource.split(200, options);
assertEquals(2, sources.size());
BoundedSource<TableRow> actual = sources.get(0);
assertThat(actual, CoreMatchers.instanceOf(TransformingSource.class));
// A repeated call to split() should not have caused a duplicate extract job.
assertEquals(1, fakeJobService.getNumExtractJobCalls());
}
use of com.google.api.services.bigquery.model.TableSchema in project beam by apache.
the class BigQueryIOTest method testRetryPolicy.
@Test
public void testRetryPolicy() throws Exception {
BigQueryOptions bqOptions = TestPipeline.testingPipelineOptions().as(BigQueryOptions.class);
bqOptions.setProject("project-id");
bqOptions.setTempLocation(testFolder.newFolder("BigQueryIOTest").getAbsolutePath());
FakeDatasetService datasetService = new FakeDatasetService();
FakeBigQueryServices fakeBqServices = new FakeBigQueryServices().withJobService(new FakeJobService()).withDatasetService(datasetService);
datasetService.createDataset("project-id", "dataset-id", "", "");
TableRow row1 = new TableRow().set("name", "a").set("number", "1");
TableRow row2 = new TableRow().set("name", "b").set("number", "2");
TableRow row3 = new TableRow().set("name", "c").set("number", "3");
TableDataInsertAllResponse.InsertErrors ephemeralError = new TableDataInsertAllResponse.InsertErrors().setErrors(ImmutableList.of(new ErrorProto().setReason("timeout")));
TableDataInsertAllResponse.InsertErrors persistentError = new TableDataInsertAllResponse.InsertErrors().setErrors(ImmutableList.of(new ErrorProto().setReason("invalidQuery")));
datasetService.failOnInsert(ImmutableMap.<TableRow, List<TableDataInsertAllResponse.InsertErrors>>of(row1, ImmutableList.of(ephemeralError, ephemeralError), row2, ImmutableList.of(ephemeralError, ephemeralError, persistentError)));
Pipeline p = TestPipeline.create(bqOptions);
PCollection<TableRow> failedRows = p.apply(Create.of(row1, row2, row3)).setIsBoundedInternal(IsBounded.UNBOUNDED).apply(BigQueryIO.writeTableRows().to("project-id:dataset-id.table-id").withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED).withSchema(new TableSchema().setFields(ImmutableList.of(new TableFieldSchema().setName("name").setType("STRING"), new TableFieldSchema().setName("number").setType("INTEGER")))).withFailedInsertRetryPolicy(InsertRetryPolicy.retryTransientErrors()).withTestServices(fakeBqServices).withoutValidation()).getFailedInserts();
// row2 finally fails with a non-retryable error, so we expect to see it in the collection of
// failed rows.
PAssert.that(failedRows).containsInAnyOrder(row2);
p.run();
// Only row1 and row3 were successfully inserted.
assertThat(datasetService.getAllRows("project-id", "dataset-id", "table-id"), containsInAnyOrder(row1, row3));
}
Aggregations