Use of com.google.api.services.bigquery.model.TableFieldSchema in project beam by apache.
In the class BigQueryIOWriteTest, method testWriteAvro.
@Test
public void testWriteAvro() throws Exception {
  if (useStorageApi || useStreaming) {
    return;
  }
  p.apply(
          Create.of(
                  InputRecord.create("test", 1, 1.0, Instant.parse("2019-01-01T00:00:00Z")),
                  InputRecord.create("test2", 2, 2.0, Instant.parse("2019-02-01T00:00:00Z")))
              .withCoder(INPUT_RECORD_CODER))
      .apply(
          BigQueryIO.<InputRecord>write()
              .to("dataset-id.table-id")
              .withCreateDisposition(BigQueryIO.Write.CreateDisposition.CREATE_IF_NEEDED)
              .withSchema(
                  new TableSchema()
                      .setFields(
                          ImmutableList.of(
                              new TableFieldSchema().setName("strVal").setType("STRING"),
                              new TableFieldSchema().setName("longVal").setType("INTEGER"),
                              new TableFieldSchema().setName("doubleVal").setType("FLOAT"),
                              new TableFieldSchema().setName("instantVal").setType("TIMESTAMP"))))
              .withTestServices(fakeBqServices)
              .withAvroFormatFunction(
                  r -> {
                    GenericRecord rec = new GenericData.Record(r.getSchema());
                    InputRecord i = r.getElement();
                    rec.put("strVal", i.strVal());
                    rec.put("longVal", i.longVal());
                    rec.put("doubleVal", i.doubleVal());
                    // BigQuery interprets a long in a TIMESTAMP column as microseconds
                    // since the epoch, so convert from Joda millis.
                    rec.put("instantVal", i.instantVal().getMillis() * 1000);
                    return rec;
                  })
              .withoutValidation());
  p.run();
  assertThat(
      fakeDatasetService.getAllRows("project-id", "dataset-id", "table-id"),
      containsInAnyOrder(
          new TableRow()
              .set("strVal", "test")
              .set("longVal", "1")
              .set("doubleVal", 1.0D)
              .set("instantVal", "2019-01-01 00:00:00 UTC"),
          new TableRow()
              .set("strVal", "test2")
              .set("longVal", "2")
              .set("doubleVal", 2.0D)
              .set("instantVal", "2019-02-01 00:00:00 UTC")));
}
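Note: InputRecord and INPUT_RECORD_CODER are helpers defined elsewhere in BigQueryIOWriteTest. A minimal AutoValue sketch that is consistent with how the test uses them (field names and types are inferred from the calls above, not copied from the Beam source) might look like this:

import com.google.auto.value.AutoValue;
import org.joda.time.Instant;

// Hypothetical reconstruction of the InputRecord test helper, inferred from usage.
@AutoValue
abstract class InputRecord {
  abstract String strVal();
  abstract long longVal();
  abstract double doubleVal();
  abstract Instant instantVal();

  static InputRecord create(String strVal, long longVal, double doubleVal, Instant instantVal) {
    return new AutoValue_InputRecord(strVal, longVal, doubleVal, instantVal);
  }
}

INPUT_RECORD_CODER would then be any coder registered for this type (e.g. a SerializableCoder in a test context).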
Use of com.google.api.services.bigquery.model.TableFieldSchema in project beam by apache.
In the class BigQueryIOWriteTest, method testTriggeredFileLoadsWithTempTablesAndDataset.
@Test
public void testTriggeredFileLoadsWithTempTablesAndDataset() throws Exception {
  String tableRef = "bigquery-project-id:dataset-id.table-id";
  List<TableRow> elements = Lists.newArrayList();
  for (int i = 0; i < 30; ++i) {
    elements.add(new TableRow().set("number", i));
  }
  TestStream<TableRow> testStream =
      TestStream.create(TableRowJsonCoder.of())
          .addElements(
              elements.get(0), Iterables.toArray(elements.subList(1, 10), TableRow.class))
          .advanceProcessingTime(Duration.standardMinutes(1))
          .addElements(
              elements.get(10), Iterables.toArray(elements.subList(11, 20), TableRow.class))
          .advanceProcessingTime(Duration.standardMinutes(1))
          .addElements(
              elements.get(20), Iterables.toArray(elements.subList(21, 30), TableRow.class))
          .advanceWatermarkToInfinity();
  BigQueryIO.Write.Method method = Method.FILE_LOADS;
  p.apply(testStream)
      .apply(
          BigQueryIO.writeTableRows()
              .to(tableRef)
              .withSchema(
                  new TableSchema()
                      .setFields(
                          ImmutableList.of(
                              new TableFieldSchema().setName("number").setType("INTEGER"))))
              .withTestServices(fakeBqServices)
              .withTriggeringFrequency(Duration.standardSeconds(30))
              .withNumFileShards(2)
              // Limits of one byte and one file per partition force every load through
              // multiple partitions, and therefore through temp tables.
              .withMaxBytesPerPartition(1)
              .withMaxFilesPerPartition(1)
              .withMethod(method)
              .withoutValidation()
              .withWriteTempDataset("temp-dataset-id"));
  p.run();
  final int projectIdSplitter = tableRef.indexOf(':');
  final String projectId =
      projectIdSplitter == -1 ? "project-id" : tableRef.substring(0, projectIdSplitter);
  assertThat(
      fakeDatasetService.getAllRows(projectId, "dataset-id", "table-id"),
      containsInAnyOrder(Iterables.toArray(elements, TableRow.class)));
}
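Stripped of the test-only limits (withMaxBytesPerPartition(1) and withMaxFilesPerPartition(1) exist here only to force the temp-table copy path), a production-style triggered file-loads write with an explicit temp dataset reduces to a handful of options. A sketch, with table and dataset names as placeholders:

// Sketch of a non-test configuration mirroring the test above; names are placeholders.
BigQueryIO.Write<TableRow> write =
    BigQueryIO.writeTableRows()
        .to("my-project:my_dataset.my_table")
        .withSchema(
            new TableSchema()
                .setFields(
                    ImmutableList.of(
                        new TableFieldSchema().setName("number").setType("INTEGER"))))
        .withMethod(BigQueryIO.Write.Method.FILE_LOADS)
        // A triggering frequency is required for FILE_LOADS on an unbounded input.
        .withTriggeringFrequency(Duration.standardMinutes(5))
        .withNumFileShards(10)
        // Stage load files and temporary tables in a dataset we control
        // instead of the destination dataset.
        .withWriteTempDataset("my_temp_dataset");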
Use of com.google.api.services.bigquery.model.TableFieldSchema in project beam by apache.
In the class BigQueryIOWriteTest, method writeDynamicDestinations.
public void writeDynamicDestinations(boolean schemas, boolean autoSharding) throws Exception {
  final Schema schema =
      Schema.builder().addField("name", FieldType.STRING).addField("id", FieldType.INT32).build();
  final Pattern userPattern = Pattern.compile("([a-z]+)([0-9]+)");
  final PCollectionView<List<String>> sideInput1 =
      p.apply("Create SideInput 1", Create.of("a", "b", "c").withCoder(StringUtf8Coder.of()))
          .apply("asList", View.asList());
  final PCollectionView<Map<String, String>> sideInput2 =
      p.apply("Create SideInput2", Create.of(KV.of("a", "a"), KV.of("b", "b"), KV.of("c", "c")))
          .apply("AsMap", View.asMap());
  final List<String> allUsernames = ImmutableList.of("bill", "bob", "randolph");
  List<String> userList = Lists.newArrayList();
  // Enough users to exceed DEFAULT_MAX_NUM_WRITERS_PER_BUNDLE and exercise the
  // spill path in WriteGroupedRecordsToFiles.
  for (int i = 0; i < BatchLoads.DEFAULT_MAX_NUM_WRITERS_PER_BUNDLE * 10; ++i) {
    // Every user has 10 nicknames.
    for (int j = 0; j < 10; ++j) {
      String nickname =
          allUsernames.get(ThreadLocalRandom.current().nextInt(allUsernames.size()));
      userList.add(nickname + i);
    }
  }
  PCollection<String> users =
      p.apply("CreateUsers", Create.of(userList))
          .apply(Window.into(new PartitionedGlobalWindows<>(arg -> arg)));
  if (useStreaming) {
    users = users.setIsBoundedInternal(PCollection.IsBounded.UNBOUNDED);
  }
  if (schemas) {
    users =
        users.setSchema(
            schema,
            TypeDescriptors.strings(),
            user -> {
              Matcher matcher = userPattern.matcher(user);
              checkState(matcher.matches());
              return Row.withSchema(schema)
                  .addValue(matcher.group(1))
                  .addValue(Integer.valueOf(matcher.group(2)))
                  .build();
            },
            r -> r.getString(0) + r.getInt32(1));
  }
  // Use a partition decorator to verify that partition decorators are supported.
  final String partitionDecorator = "20171127";
  BigQueryIO.Write<String> write =
      BigQueryIO.<String>write()
          .withTestServices(fakeBqServices)
          .withMaxFilesPerBundle(5)
          .withMaxFileSize(10)
          .withCreateDisposition(BigQueryIO.Write.CreateDisposition.CREATE_IF_NEEDED)
          .to(
              new StringLongDestinations() {
                @Override
                public Long getDestination(ValueInSingleWindow<String> element) {
                  assertThat(
                      element.getWindow(), Matchers.instanceOf(PartitionedGlobalWindow.class));
                  Matcher matcher = userPattern.matcher(element.getValue());
                  checkState(matcher.matches());
                  // Tables are named by userid, so a Long suffices to identify a table.
                  return Long.valueOf(matcher.group(2));
                }

                @Override
                public TableDestination getTable(Long userId) {
                  verifySideInputs();
                  // Each user in its own table.
                  return new TableDestination(
                      "dataset-id.userid-" + userId + "$" + partitionDecorator,
                      "table for userid " + userId);
                }

                @Override
                public TableSchema getSchema(Long userId) {
                  verifySideInputs();
                  return new TableSchema()
                      .setFields(
                          ImmutableList.of(
                              new TableFieldSchema().setName("name").setType("STRING"),
                              new TableFieldSchema().setName("id").setType("INTEGER")));
                }

                @Override
                public List<PCollectionView<?>> getSideInputs() {
                  return ImmutableList.of(sideInput1, sideInput2);
                }

                private void verifySideInputs() {
                  assertThat(sideInput(sideInput1), containsInAnyOrder("a", "b", "c"));
                  Map<String, String> mapSideInput = sideInput(sideInput2);
                  assertEquals(3, mapSideInput.size());
                  assertThat(
                      mapSideInput,
                      allOf(hasEntry("a", "a"), hasEntry("b", "b"), hasEntry("c", "c")));
                }
              })
          .withoutValidation();
  if (schemas) {
    write = write.useBeamSchema();
  } else {
    write =
        write.withFormatFunction(
            user -> {
              Matcher matcher = userPattern.matcher(user);
              checkState(matcher.matches());
              return new TableRow().set("name", matcher.group(1)).set("id", matcher.group(2));
            });
  }
  if (autoSharding) {
    write = write.withAutoSharding();
  }
  WriteResult results = users.apply("WriteBigQuery", write);
  if (!useStreaming && !useStorageApi) {
    PCollection<TableDestination> successfulBatchInserts = results.getSuccessfulTableLoads();
    TableDestination[] expectedTables =
        userList.stream()
            .map(
                user -> {
                  Matcher matcher = userPattern.matcher(user);
                  checkState(matcher.matches());
                  String userId = matcher.group(2);
                  return new TableDestination(
                      String.format("project-id:dataset-id.userid-%s$20171127", userId),
                      String.format("table for userid %s", userId));
                })
            .distinct()
            .toArray(TableDestination[]::new);
    PAssert.that(successfulBatchInserts.apply(Distinct.create()))
        .containsInAnyOrder(expectedTables);
  }
  p.run();
  Map<Long, List<TableRow>> expectedTableRows = Maps.newHashMap();
  for (String user : userList) {
    Matcher matcher = userPattern.matcher(user);
    checkState(matcher.matches());
    String nickname = matcher.group(1);
    Long userid = Long.valueOf(matcher.group(2));
    List<TableRow> expected =
        expectedTableRows.computeIfAbsent(userid, k -> Lists.newArrayList());
    expected.add(new TableRow().set("name", nickname).set("id", userid.toString()));
  }
  for (Map.Entry<Long, List<TableRow>> entry : expectedTableRows.entrySet()) {
    assertThat(
        fakeDatasetService.getAllRows("project-id", "dataset-id", "userid-" + entry.getKey()),
        containsInAnyOrder(Iterables.toArray(entry.getValue(), TableRow.class)));
  }
}
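StringLongDestinations is not shown in this snippet. Judging from the overrides above, it is presumably just a named specialization of Beam's DynamicDestinations, which supplies the sideInput() accessor used by verifySideInputs(); roughly:

import org.apache.beam.sdk.io.gcp.bigquery.DynamicDestinations;

// Presumed shape of the helper: String elements mapped to Long destination keys.
abstract static class StringLongDestinations extends DynamicDestinations<String, Long> {}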
Use of com.google.api.services.bigquery.model.TableFieldSchema in project beam by apache.
In the class BigQueryIOWriteTest, method testRetryPolicy.
@Test
public void testRetryPolicy() throws Exception {
  if (useStorageApi || !useStreaming) {
    return;
  }
  TableRow row1 = new TableRow().set("name", "a").set("number", "1");
  TableRow row2 = new TableRow().set("name", "b").set("number", "2");
  TableRow row3 = new TableRow().set("name", "c").set("number", "3");
  TableDataInsertAllResponse.InsertErrors ephemeralError =
      new TableDataInsertAllResponse.InsertErrors()
          .setErrors(ImmutableList.of(new ErrorProto().setReason("timeout")));
  TableDataInsertAllResponse.InsertErrors persistentError =
      new TableDataInsertAllResponse.InsertErrors()
          .setErrors(ImmutableList.of(new ErrorProto().setReason("invalidQuery")));
  // row1 fails twice with retryable timeouts; row2 fails twice and then hits a
  // non-retryable error; row3 never fails.
  fakeDatasetService.failOnInsert(
      ImmutableMap.of(
          row1, ImmutableList.of(ephemeralError, ephemeralError),
          row2, ImmutableList.of(ephemeralError, ephemeralError, persistentError)));
  WriteResult result =
      p.apply(Create.of(row1, row2, row3))
          .apply(
              BigQueryIO.writeTableRows()
                  .to("project-id:dataset-id.table-id")
                  .withCreateDisposition(CreateDisposition.CREATE_IF_NEEDED)
                  .withMethod(Method.STREAMING_INSERTS)
                  .withSchema(
                      new TableSchema()
                          .setFields(
                              ImmutableList.of(
                                  new TableFieldSchema().setName("name").setType("STRING"),
                                  new TableFieldSchema().setName("number").setType("INTEGER"))))
                  .withFailedInsertRetryPolicy(InsertRetryPolicy.retryTransientErrors())
                  .withTestServices(fakeBqServices)
                  .withoutValidation());
  PCollection<TableRow> failedRows = result.getFailedInserts();
  // row2 finally fails with a non-retryable error, so we expect to see it in the collection of
  // failed rows.
  PAssert.that(failedRows).containsInAnyOrder(row2);
  // getSuccessfulInserts is only supported for streaming inserts, which the early
  // return above guarantees here.
  if (!useStorageApi && useStreaming) {
    PAssert.that(result.getSuccessfulInserts()).containsInAnyOrder(row1, row3);
  }
  p.run();
  // Only row1 and row3 were successfully inserted.
  assertThat(
      fakeDatasetService.getAllRows("project-id", "dataset-id", "table-id"),
      containsInAnyOrder(row1, row3));
}
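InsertRetryPolicy.retryTransientErrors() is one of the built-in policies, alongside alwaysRetry() and neverRetry(). Since InsertRetryPolicy is an abstract class, a pipeline can also supply its own policy that inspects the error reasons; a sketch, assuming the Context API from the Beam javadoc (ErrorProto and List are already imported in this test file):

// Hedged sketch of a custom policy: retry only while every reported
// reason is "rateLimitExceeded".
InsertRetryPolicy retryRateLimitedOnly =
    new InsertRetryPolicy() {
      @Override
      public boolean shouldRetry(Context context) {
        List<ErrorProto> errors = context.getInsertErrors().getErrors();
        if (errors == null) {
          return true; // No reason reported; treat as transient.
        }
        for (ErrorProto error : errors) {
          if (!"rateLimitExceeded".equals(error.getReason())) {
            return false;
          }
        }
        return true;
      }
    };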
Use of com.google.api.services.bigquery.model.TableFieldSchema in project beam by apache.
In the class BigQueryIOReadTest, method testBigQueryQuerySourceInitSplit_NoReferencedTables.
/**
* This test simulates the scenario where the SQL text which is executed by the query job doesn't
* by itself refer to any tables (e.g. "SELECT 17 AS value"), and thus there are no referenced
* tables when the dry run of the query is performed.
*/
@Test
public void testBigQueryQuerySourceInitSplit_NoReferencedTables() throws Exception {
  PipelineOptions options = PipelineOptionsFactory.create();
  BigQueryOptions bqOptions = options.as(BigQueryOptions.class);
  bqOptions.setProject("project");
  Table queryResultTable =
      new Table()
          .setSchema(
              new TableSchema()
                  .setFields(
                      ImmutableList.of(
                          new TableFieldSchema().setName("name").setType("STRING"),
                          new TableFieldSchema().setName("number").setType("INTEGER"))));
  List<TableRow> expected =
      ImmutableList.of(
          new TableRow().set("name", "a").set("number", 1L),
          new TableRow().set("name", "b").set("number", 2L),
          new TableRow().set("name", "c").set("number", 3L),
          new TableRow().set("name", "d").set("number", 4L),
          new TableRow().set("name", "e").set("number", 5L),
          new TableRow().set("name", "f").set("number", 6L));
  String encodedQuery = FakeBigQueryServices.encodeQueryResult(queryResultTable, expected);
  String stepUuid = "testStepUuid";
  fakeJobService.expectDryRunQuery(
      bqOptions.getProject(),
      encodedQuery,
      new JobStatistics()
          .setQuery(
              new JobStatistics2()
                  .setTotalBytesProcessed(100L)
                  .setReferencedTables(ImmutableList.of())));
  BoundedSource<TableRow> bqSource =
      BigQueryQuerySourceDef.create(
              fakeBqServices,
              ValueProvider.StaticValueProvider.of(encodedQuery),
              true /* flattenResults */,
              true /* useLegacySql */,
              QueryPriority.BATCH,
              null,
              null,
              null)
          .toSource(stepUuid, TableRowJsonCoder.of(), BigQueryIO.TableRowParser.INSTANCE, false);
  options.setTempLocation(testFolder.getRoot().getAbsolutePath());
  List<TableRow> read =
      convertStringsToLong(
          SourceTestUtils.readFromSplitsOfSource(bqSource, 0L /* ignored */, options));
  assertThat(read, containsInAnyOrder(Iterables.toArray(expected, TableRow.class)));
  List<? extends BoundedSource<TableRow>> sources = bqSource.split(100, options);
  assertEquals(2, sources.size());
}
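At the pipeline level, the scenario under test corresponds to reading from a query whose SQL references no tables, so the dry run reports an empty referencedTables list. A minimal sketch (the test above exercises the legacy-SQL path; standard SQL is shown here only as an illustration):

// Sketch: a constant query has no referenced tables in its dry-run statistics.
PCollection<TableRow> rows =
    p.apply(
        "ReadConstant",
        BigQueryIO.readTableRows()
            .fromQuery("SELECT 17 AS value")
            .usingStandardSql());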