Use of com.google.api.services.bigquery.model.TableFieldSchema in project beam by apache.
The class BigQueryIOWriteTest, method testExtendedErrorRetrieval.
@Test
public void testExtendedErrorRetrieval() throws Exception {
  if (useStorageApi) {
    return;
  }
  TableRow row1 = new TableRow().set("name", "a").set("number", "1");
  TableRow row2 = new TableRow().set("name", "b").set("number", "2");
  TableRow row3 = new TableRow().set("name", "c").set("number", "3");
  String tableSpec = "project-id:dataset-id.table-id";
  TableDataInsertAllResponse.InsertErrors ephemeralError =
      new TableDataInsertAllResponse.InsertErrors()
          .setErrors(ImmutableList.of(new ErrorProto().setReason("timeout")));
  TableDataInsertAllResponse.InsertErrors persistentError =
      new TableDataInsertAllResponse.InsertErrors()
          .setErrors(Lists.newArrayList(new ErrorProto().setReason("invalidQuery")));
  fakeDatasetService.failOnInsert(
      ImmutableMap.of(
          row1, ImmutableList.of(ephemeralError, ephemeralError),
          row2, ImmutableList.of(ephemeralError, ephemeralError, persistentError)));
  PCollection<BigQueryInsertError> failedRows =
      p.apply(Create.of(row1, row2, row3))
          .apply(
              BigQueryIO.writeTableRows()
                  .to(tableSpec)
                  .withCreateDisposition(BigQueryIO.Write.CreateDisposition.CREATE_IF_NEEDED)
                  .withMethod(BigQueryIO.Write.Method.STREAMING_INSERTS)
                  .withSchema(new TableSchema().setFields(ImmutableList.of(
                      new TableFieldSchema().setName("name").setType("STRING"),
                      new TableFieldSchema().setName("number").setType("INTEGER"))))
                  .withFailedInsertRetryPolicy(InsertRetryPolicy.retryTransientErrors())
                  .withTestServices(fakeBqServices)
                  .withoutValidation()
                  .withExtendedErrorInfo())
          .getFailedInsertsWithErr();
  // row2 finally fails with a non-retryable error, so we expect to see it in the collection of
  // failed rows.
  PAssert.that(failedRows)
      .containsInAnyOrder(new BigQueryInsertError(
          row2, persistentError, BigQueryHelpers.parseTableSpec(tableSpec)));
  p.run();
  // Only row1 and row3 were successfully inserted.
  assertThat(
      fakeDatasetService.getAllRows("project-id", "dataset-id", "table-id"),
      containsInAnyOrder(row1, row3));
}
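Outside the test harness, the same extended-error pattern looks roughly like the sketch below. This is an illustrative fragment, not part of the Beam test: the helper name writeAndFormatFailures, the input PCollection<TableRow> rows, and the tableSpec/schema parameters are assumptions, and it relies on BigQueryInsertError exposing the failed row, error payload, and destination table through getError() and getTable().

static PCollection<String> writeAndFormatFailures(
    PCollection<TableRow> rows, String tableSpec, TableSchema schema) {
  // Request extended error info so failures come back as BigQueryInsertError rather than TableRow.
  WriteResult result =
      rows.apply(
          BigQueryIO.writeTableRows()
              .to(tableSpec)
              .withSchema(schema)
              .withCreateDisposition(BigQueryIO.Write.CreateDisposition.CREATE_IF_NEEDED)
              .withMethod(BigQueryIO.Write.Method.STREAMING_INSERTS)
              .withFailedInsertRetryPolicy(InsertRetryPolicy.retryTransientErrors())
              .withExtendedErrorInfo());
  // Each element carries the rejected row, the InsertErrors payload, and the destination table.
  return result
      .getFailedInsertsWithErr()
      .apply(
          "FormatFailedInserts",
          MapElements.into(TypeDescriptors.strings())
              .via((BigQueryInsertError e) -> e.getTable() + ": " + e.getError()));
}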
Use of com.google.api.services.bigquery.model.TableFieldSchema in project beam by apache.
The class BigQueryIOWriteTest, method testClusteringTableFunction.
@Test
public void testClusteringTableFunction() throws Exception {
  TableRow row1 = new TableRow().set("date", "2018-01-01").set("number", "1");
  TableRow row2 = new TableRow().set("date", "2018-01-02").set("number", "2");
  TimePartitioning timePartitioning = new TimePartitioning().setType("DAY").setField("date");
  Clustering clustering = new Clustering().setFields(ImmutableList.of("date"));
  TableSchema schema =
      new TableSchema()
          .setFields(ImmutableList.of(
              new TableFieldSchema().setName("date").setType("DATE"),
              new TableFieldSchema().setName("number").setType("INTEGER")));
  // withMethod overrides the pipeline option, so we need to explicitly request
  // STORAGE_API_WRITES.
  BigQueryIO.Write.Method method =
      useStorageApi
          ? (useStorageApiApproximate ? Method.STORAGE_API_AT_LEAST_ONCE : Method.STORAGE_WRITE_API)
          : Method.FILE_LOADS;
  p.apply(Create.of(row1, row2))
      .apply(
          BigQueryIO.writeTableRows()
              .to(
                  (ValueInSingleWindow<TableRow> vsw) -> {
                    String tableSpec =
                        "project-id:dataset-id.table-" + vsw.getValue().get("number");
                    return new TableDestination(
                        tableSpec,
                        null,
                        new TimePartitioning().setType("DAY").setField("date"),
                        new Clustering().setFields(ImmutableList.of("date")));
                  })
              .withTestServices(fakeBqServices)
              .withMethod(method)
              .withSchema(schema)
              .withClustering()
              .withoutValidation());
  p.run();
  Table table =
      fakeDatasetService.getTable(BigQueryHelpers.parseTableSpec("project-id:dataset-id.table-1"));
  assertEquals(schema, table.getSchema());
  assertEquals(timePartitioning, table.getTimePartitioning());
  assertEquals(clustering, table.getClustering());
}
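When every row targets a single table, the partitioning and clustering do not need to go through a table function. Below is a minimal sketch of the static route, assuming a PCollection<TableRow> rows and the schema variable defined above; the table name is illustrative, and withTimePartitioning/withClustering(Clustering) are the corresponding BigQueryIO.Write options.

rows.apply(
    BigQueryIO.writeTableRows()
        .to("project-id:dataset-id.clustered-table")
        .withSchema(schema)
        // Attach DAY partitioning and clustering on the "date" column to the single destination.
        .withTimePartitioning(new TimePartitioning().setType("DAY").setField("date"))
        .withClustering(new Clustering().setFields(ImmutableList.of("date")))
        .withCreateDisposition(BigQueryIO.Write.CreateDisposition.CREATE_IF_NEEDED));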
Use of com.google.api.services.bigquery.model.TableFieldSchema in project beam by apache.
The class BigQueryIOWriteTest, method testTriggeredFileLoadsWithAutoSharding.
@Test
public void testTriggeredFileLoadsWithAutoSharding() throws Exception {
  if (useStorageApi || !useStreaming) {
    // This test does not make sense for the storage API.
    return;
  }
  List<TableRow> elements = Lists.newArrayList();
  for (int i = 0; i < 30; ++i) {
    elements.add(new TableRow().set("number", i));
  }
  Instant startInstant = new Instant(0L);
  TestStream<TableRow> testStream =
      TestStream.create(TableRowJsonCoder.of())
          .advanceWatermarkTo(startInstant)
          .addElements(elements.get(0), Iterables.toArray(elements.subList(1, 10), TableRow.class))
          .advanceProcessingTime(Duration.standardMinutes(1))
          .advanceWatermarkTo(startInstant.plus(Duration.standardSeconds(10)))
          .addElements(elements.get(10), Iterables.toArray(elements.subList(11, 20), TableRow.class))
          .advanceProcessingTime(Duration.standardMinutes(1))
          .advanceWatermarkTo(startInstant.plus(Duration.standardSeconds(30)))
          .addElements(elements.get(20), Iterables.toArray(elements.subList(21, 30), TableRow.class))
          .advanceProcessingTime(Duration.standardMinutes(2))
          .advanceWatermarkToInfinity();
  int numTables = 3;
  p.apply(testStream)
      .apply(
          BigQueryIO.writeTableRows()
              .to(
                  (ValueInSingleWindow<TableRow> vsw) -> {
                    String tableSpec =
                        "project-id:dataset-id.table-"
                            + ((int) vsw.getValue().get("number") % numTables);
                    return new TableDestination(tableSpec, null);
                  })
              .withSchema(new TableSchema().setFields(
                  ImmutableList.of(new TableFieldSchema().setName("number").setType("INTEGER"))))
              .withTestServices(fakeBqServices)
              .withTriggeringFrequency(Duration.standardSeconds(100))
              .withAutoSharding()
              .withMaxBytesPerPartition(1000)
              .withMaxFilesPerPartition(10)
              .withMethod(BigQueryIO.Write.Method.FILE_LOADS)
              .withoutValidation());
  p.run();
  Map<Integer, List<TableRow>> elementsByTableIdx = new HashMap<>();
  for (int i = 0; i < elements.size(); i++) {
    elementsByTableIdx.computeIfAbsent(i % numTables, k -> new ArrayList<>()).add(elements.get(i));
  }
  for (Map.Entry<Integer, List<TableRow>> entry : elementsByTableIdx.entrySet()) {
    assertThat(
        fakeDatasetService.getAllRows("project-id", "dataset-id", "table-" + entry.getKey()),
        containsInAnyOrder(Iterables.toArray(entry.getValue(), TableRow.class)));
  }
  // For each table destination, it's expected to create two load jobs based on the triggering
  // frequency and processing time intervals.
  assertEquals(2 * numTables, fakeDatasetService.getInsertCount());
}
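Stripped of the TestStream scaffolding, the triggered, auto-sharded file-loads configuration amounts to the fragment below. This is a sketch, not part of the test: the unbounded PCollection<TableRow> unboundedRows, the schema variable, the table name, and the triggering frequency are all assumed values.

unboundedRows.apply(
    BigQueryIO.writeTableRows()
        .to("project-id:dataset-id.streaming-loads")
        .withSchema(schema)
        .withMethod(BigQueryIO.Write.Method.FILE_LOADS)
        // Batches are flushed on this cadence; with auto sharding the runner picks the shard count.
        .withTriggeringFrequency(Duration.standardMinutes(5))
        .withAutoSharding());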
Use of com.google.api.services.bigquery.model.TableFieldSchema in project beam by apache.
The class BigQueryIOWriteTest, method streamingWrite.
private void streamingWrite(boolean autoSharding) throws Exception {
  if (!useStreaming) {
    return;
  }
  BigQueryIO.Write<TableRow> write =
      BigQueryIO.writeTableRows()
          .to("project-id:dataset-id.table-id")
          .withCreateDisposition(BigQueryIO.Write.CreateDisposition.CREATE_IF_NEEDED)
          .withSchema(new TableSchema().setFields(ImmutableList.of(
              new TableFieldSchema().setName("name").setType("STRING"),
              new TableFieldSchema().setName("number").setType("INTEGER"))))
          .withTestServices(fakeBqServices)
          .withoutValidation();
  if (autoSharding) {
    write = write.withAutoSharding();
  }
  p.apply(
          Create.of(
                  new TableRow().set("name", "a").set("number", "1"),
                  new TableRow().set("name", "b").set("number", "2"),
                  new TableRow().set("name", "c").set("number", "3"),
                  new TableRow().set("name", "d").set("number", "4"))
              .withCoder(TableRowJsonCoder.of()))
      .setIsBoundedInternal(PCollection.IsBounded.UNBOUNDED)
      .apply("WriteToBQ", write);
  p.run();
  assertThat(
      fakeDatasetService.getAllRows("project-id", "dataset-id", "table-id"),
      containsInAnyOrder(
          new TableRow().set("name", "a").set("number", "1"),
          new TableRow().set("name", "b").set("number", "2"),
          new TableRow().set("name", "c").set("number", "3"),
          new TableRow().set("name", "d").set("number", "4")));
}
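Since streamingWrite is a private helper parameterized on sharding, it would be driven by two small @Test methods along the following lines; the method names here are invented for illustration and are not taken from the Beam source.

@Test
public void testStreamingInserts() throws Exception {
  // Hypothetical caller: exercises the fixed-sharding path.
  streamingWrite(false);
}

@Test
public void testStreamingInsertsWithAutoSharding() throws Exception {
  // Hypothetical caller: exercises the runner-determined (auto) sharding path.
  streamingWrite(true);
}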
Use of com.google.api.services.bigquery.model.TableFieldSchema in project beam by apache.
The class BigQueryIOWriteTest, method testTriggeredFileLoads.
@Test
public void testTriggeredFileLoads() throws Exception {
  if (useStorageApi || !useStreaming) {
    return;
  }
  List<TableRow> elements = Lists.newArrayList();
  for (int i = 0; i < 30; ++i) {
    elements.add(new TableRow().set("number", i));
  }
  TestStream<TableRow> testStream =
      TestStream.create(TableRowJsonCoder.of())
          .addElements(elements.get(0), Iterables.toArray(elements.subList(1, 10), TableRow.class))
          .advanceProcessingTime(Duration.standardMinutes(1))
          .addElements(elements.get(10), Iterables.toArray(elements.subList(11, 20), TableRow.class))
          .advanceProcessingTime(Duration.standardMinutes(1))
          .addElements(elements.get(20), Iterables.toArray(elements.subList(21, 30), TableRow.class))
          .advanceWatermarkToInfinity();
  BigQueryIO.Write.Method method = Method.FILE_LOADS;
  p.apply(testStream)
      .apply(
          BigQueryIO.writeTableRows()
              .to("project-id:dataset-id.table-id")
              .withSchema(new TableSchema().setFields(
                  ImmutableList.of(new TableFieldSchema().setName("number").setType("INTEGER"))))
              .withTestServices(fakeBqServices)
              .withTriggeringFrequency(Duration.standardSeconds(30))
              .withNumFileShards(2)
              .withMethod(method)
              .withoutValidation());
  p.run();
  assertThat(
      fakeDatasetService.getAllRows("project-id", "dataset-id", "table-id"),
      containsInAnyOrder(Iterables.toArray(elements, TableRow.class)));
}
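Every snippet on this page rebuilds its TableSchema from the same TableFieldSchema boilerplate. A small illustrative helper (not part of the Beam code) can collapse that into name/type pairs:

static TableSchema schemaOf(String... nameTypePairs) {
  // Consume the varargs as alternating field name and BigQuery type strings.
  List<TableFieldSchema> fields = new ArrayList<>();
  for (int i = 0; i < nameTypePairs.length; i += 2) {
    fields.add(new TableFieldSchema().setName(nameTypePairs[i]).setType(nameTypePairs[i + 1]));
  }
  return new TableSchema().setFields(fields);
}

For example, schemaOf("name", "STRING", "number", "INTEGER") produces the two-field schema used in several of the tests above.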