Use of org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write in project Beam by Apache.
The class BigQueryIOWriteTest, method testWriteAvroWithCustomWriter.
/**
 * Writes two {@code InputRecord}s through {@code withAvroWriter} using a custom
 * {@link DatumWriter} factory and checks the rows stored by the fake dataset service.
 *
 * <p>The custom writer appends {@code "_custom"} to every string it encodes, so the
 * {@code strVal} column read back must carry that suffix. The test body is skipped (plain
 * return, no assertions) whenever {@code useStorageApi} or {@code useStreaming} is set.
 */
@Test
public void testWriteAvroWithCustomWriter() throws Exception {
  boolean skip = useStorageApi || useStreaming;
  if (skip) {
    return;
  }

  // Copies each InputRecord field into a generic Avro record built from the request schema.
  SerializableFunction<AvroWriteRequest<InputRecord>, GenericRecord> formatFunction =
      request -> {
        GenericRecord avroRecord = new GenericData.Record(request.getSchema());
        InputRecord element = request.getElement();
        avroRecord.put("strVal", element.strVal());
        avroRecord.put("longVal", element.longVal());
        avroRecord.put("doubleVal", element.doubleVal());
        // Milliseconds scaled by 1000 (presumably microseconds for the TIMESTAMP column).
        avroRecord.put("instantVal", element.instantVal().getMillis() * 1000);
        return avroRecord;
      };

  // Writer factory whose writers tag every encoded string with "_custom".
  SerializableFunction<org.apache.avro.Schema, DatumWriter<GenericRecord>> customWriterFactory =
      avroSchema ->
          new GenericDatumWriter<GenericRecord>() {
            @Override
            protected void writeString(org.apache.avro.Schema schema, Object datum, Encoder out)
                throws IOException {
              String tagged = datum.toString() + "_custom";
              super.writeString(schema, tagged, out);
            }
          };

  PCollection<InputRecord> records =
      p.apply(
          Create.of(
                  InputRecord.create("test", 1, 1.0, Instant.parse("2019-01-01T00:00:00Z")),
                  InputRecord.create("test2", 2, 2.0, Instant.parse("2019-02-01T00:00:00Z")))
              .withCoder(INPUT_RECORD_CODER));

  TableSchema tableSchema =
      new TableSchema()
          .setFields(
              ImmutableList.of(
                  new TableFieldSchema().setName("strVal").setType("STRING"),
                  new TableFieldSchema().setName("longVal").setType("INTEGER"),
                  new TableFieldSchema().setName("doubleVal").setType("FLOAT"),
                  new TableFieldSchema().setName("instantVal").setType("TIMESTAMP")));

  records.apply(
      BigQueryIO.<InputRecord>write()
          .to("dataset-id.table-id")
          .withCreateDisposition(BigQueryIO.Write.CreateDisposition.CREATE_IF_NEEDED)
          .withSchema(tableSchema)
          .withTestServices(fakeBqServices)
          .withAvroWriter(formatFunction, customWriterFactory)
          .withoutValidation());
  p.run();

  TableRow firstExpected =
      new TableRow()
          .set("strVal", "test_custom")
          .set("longVal", "1")
          .set("doubleVal", 1.0D)
          .set("instantVal", "2019-01-01 00:00:00 UTC");
  TableRow secondExpected =
      new TableRow()
          .set("strVal", "test2_custom")
          .set("longVal", "2")
          .set("doubleVal", 2.0D)
          .set("instantVal", "2019-02-01 00:00:00 UTC");
  assertThat(
      fakeDatasetService.getAllRows("project-id", "dataset-id", "table-id"),
      containsInAnyOrder(firstExpected, secondExpected));
}
Use of org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write in project Beam by Apache.
The class BigQueryIOWriteTest, method testWriteValidateFailsWithAvroFormatAndStreamingInserts.
/**
 * Expects an {@link IllegalArgumentException} when an Avro format function is combined with
 * {@code Method.STREAMING_INSERTS}.
 *
 * <p>The test body is skipped (plain return) when neither {@code useStreaming} nor
 * {@code useStorageApi} is set.
 */
@Test
public void testWriteValidateFailsWithAvroFormatAndStreamingInserts() {
  if (!(useStreaming || useStorageApi)) {
    return;
  }

  // The pipeline is never run, so abandoned-node enforcement is switched off.
  p.enableAbandonedNodeEnforcement(false);

  thrown.expect(IllegalArgumentException.class);
  thrown.expectMessage("Writing avro formatted data is only supported for FILE_LOADS");

  PCollection<InputRecord> emptyInput = p.apply(Create.empty(INPUT_RECORD_CODER));
  BigQueryIO.Write<InputRecord> avroStreamingWrite =
      BigQueryIO.<InputRecord>write()
          .to("dataset.table")
          .withSchema(new TableSchema())
          .withAvroFormatFunction(request -> new GenericData.Record(request.getSchema()))
          .withMethod(Method.STREAMING_INSERTS)
          .withCreateDisposition(BigQueryIO.Write.CreateDisposition.CREATE_IF_NEEDED);
  emptyInput.apply(avroStreamingWrite);
}
Use of org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write in project Beam by Apache.
The class BigQueryIOWriteTest, method writeDynamicDestinations.
/**
 * Shared driver for the dynamic-destination write tests.
 *
 * <p>Generates user strings of the form {@code <name><id>}, writes them with a
 * {@code StringLongDestinations} that routes each element to a table named after its numeric
 * id (with a partition decorator appended), and then checks that the fake dataset service
 * holds exactly the expected rows for every id.
 *
 * @param schemas when true, a Beam schema is attached to the input and the write uses
 *     {@code useBeamSchema()}; otherwise an explicit format function builds the table rows
 * @param autoSharding when true, {@code withAutoSharding()} is enabled on the write
 * @throws Exception if running the pipeline or reading from the fake service fails
 */
public void writeDynamicDestinations(boolean schemas, boolean autoSharding) throws Exception {
  final Schema schema =
      Schema.builder().addField("name", FieldType.STRING).addField("id", FieldType.INT32).build();
  // A user string is a run of lowercase letters (name) followed by a run of digits (id).
  final Pattern userPattern = Pattern.compile("([a-z]+)([0-9]+)");

  final PCollectionView<List<String>> sideInput1 =
      p.apply("Create SideInput 1", Create.of("a", "b", "c").withCoder(StringUtf8Coder.of()))
          .apply("asList", View.asList());
  final PCollectionView<Map<String, String>> sideInput2 =
      p.apply("Create SideInput2", Create.of(KV.of("a", "a"), KV.of("b", "b"), KV.of("c", "c")))
          .apply("AsMap", View.asMap());

  final List<String> allUsernames = ImmutableList.of("bill", "bob", "randolph");
  List<String> userList = Lists.newArrayList();
  // One id per outer iteration, DEFAULT_MAX_NUM_WRITERS_PER_BUNDLE * 10 ids in total
  // (presumably enough distinct destinations to exceed the per-bundle writer limit).
  for (int id = 0; id < BatchLoads.DEFAULT_MAX_NUM_WRITERS_PER_BUNDLE * 10; id++) {
    // Each id gets ten entries, each with a randomly picked name.
    for (int entry = 0; entry < 10; entry++) {
      int pick = ThreadLocalRandom.current().nextInt(allUsernames.size());
      userList.add(allUsernames.get(pick) + id);
    }
  }

  PCollection<String> users =
      p.apply("CreateUsers", Create.of(userList))
          .apply(Window.into(new PartitionedGlobalWindows<>(arg -> arg)));
  if (useStreaming) {
    users = users.setIsBoundedInternal(PCollection.IsBounded.UNBOUNDED);
  }
  if (schemas) {
    // Converts "<name><id>" strings to rows and back again.
    users =
        users.setSchema(
            schema,
            TypeDescriptors.strings(),
            text -> {
              Matcher parsed = userPattern.matcher(text);
              checkState(parsed.matches());
              return Row.withSchema(schema)
                  .addValue(parsed.group(1))
                  .addValue(Integer.valueOf(parsed.group(2)))
                  .build();
            },
            row -> row.getString(0) + row.getInt32(1));
  }

  // Use a partition decorator to verify that partition decorators are supported.
  final String partitionDecorator = "20171127";

  StringLongDestinations destinations =
      new StringLongDestinations() {
        @Override
        public List<PCollectionView<?>> getSideInputs() {
          return ImmutableList.of(sideInput1, sideInput2);
        }

        @Override
        public Long getDestination(ValueInSingleWindow<String> element) {
          assertThat(element.getWindow(), Matchers.instanceOf(PartitionedGlobalWindow.class));
          Matcher parsed = userPattern.matcher(element.getValue());
          checkState(parsed.matches());
          // The numeric id alone identifies the destination table.
          return Long.valueOf(parsed.group(2));
        }

        @Override
        public TableDestination getTable(Long userId) {
          checkSideInputs();
          // Each user id gets its own table.
          String tableSpec = "dataset-id.userid-" + userId + "$" + partitionDecorator;
          return new TableDestination(tableSpec, "table for userid " + userId);
        }

        @Override
        public TableSchema getSchema(Long userId) {
          checkSideInputs();
          return new TableSchema()
              .setFields(
                  ImmutableList.of(
                      new TableFieldSchema().setName("name").setType("STRING"),
                      new TableFieldSchema().setName("id").setType("INTEGER")));
        }

        /** Asserts that both side inputs are readable and hold the values created above. */
        private void checkSideInputs() {
          assertThat(sideInput(sideInput1), containsInAnyOrder("a", "b", "c"));
          Map<String, String> mapSideInput = sideInput(sideInput2);
          assertEquals(3, mapSideInput.size());
          assertThat(
              mapSideInput,
              allOf(hasEntry("a", "a"), hasEntry("b", "b"), hasEntry("c", "c")));
        }
      };

  BigQueryIO.Write<String> write =
      BigQueryIO.<String>write()
          .withTestServices(fakeBqServices)
          .withMaxFilesPerBundle(5)
          .withMaxFileSize(10)
          .withCreateDisposition(BigQueryIO.Write.CreateDisposition.CREATE_IF_NEEDED)
          .to(destinations)
          .withoutValidation();

  if (!schemas) {
    write =
        write.withFormatFunction(
            text -> {
              Matcher parsed = userPattern.matcher(text);
              checkState(parsed.matches());
              return new TableRow().set("name", parsed.group(1)).set("id", parsed.group(2));
            });
  } else {
    write = write.useBeamSchema();
  }
  if (autoSharding) {
    write = write.withAutoSharding();
  }

  WriteResult results = users.apply("WriteBigQuery", write);

  boolean batchLoads = !(useStreaming || useStorageApi);
  if (batchLoads) {
    // Only checked when neither streaming nor the storage API is in use: every distinct
    // destination must be reported as a successful table load.
    PCollection<TableDestination> successfulBatchInserts = results.getSuccessfulTableLoads();
    TableDestination[] expectedTables =
        userList.stream()
            .map(userPattern::matcher)
            .map(
                parsed -> {
                  checkState(parsed.matches());
                  String idText = parsed.group(2);
                  return new TableDestination(
                      String.format("project-id:dataset-id.userid-%s$20171127", idText),
                      String.format("table for userid %s", idText));
                })
            .distinct()
            .toArray(TableDestination[]::new);
    PAssert.that(successfulBatchInserts.apply(Distinct.create()))
        .containsInAnyOrder(expectedTables);
  }

  p.run();

  // Group the expected rows by user id, mirroring the per-id table layout.
  Map<Long, List<TableRow>> expectedTableRows = Maps.newHashMap();
  for (String user : userList) {
    Matcher parsed = userPattern.matcher(user);
    checkState(parsed.matches());
    String name = parsed.group(1);
    Long idKey = Long.valueOf(parsed.group(2));
    TableRow row = new TableRow().set("name", name).set("id", idKey.toString());
    expectedTableRows.computeIfAbsent(idKey, unused -> Lists.newArrayList()).add(row);
  }

  for (Map.Entry<Long, List<TableRow>> perTable : expectedTableRows.entrySet()) {
    String tableId = "userid-" + perTable.getKey();
    TableRow[] expectedRows = Iterables.toArray(perTable.getValue(), TableRow.class);
    assertThat(
        fakeDatasetService.getAllRows("project-id", "dataset-id", tableId),
        containsInAnyOrder(expectedRows));
  }
}
Use of org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write in project Beam by Apache.
The class BigQueryIOWriteTest, method testWriteWithDynamicTables.
/**
 * Writes the integers 0..9 to five tables chosen by a table function and verifies both the
 * schema and the contents of each table.
 *
 * <p>Elements are windowed by {@code value % 5}; the table function maps that same bucket to
 * {@code table-id-<bucket>}. Per-table schemas are supplied through a map side input, and each
 * schema contains a column unique to its table so that a mix-up would be detected.
 */
@Test
public void testWriteWithDynamicTables() throws Exception {
  List<Integer> inserts = new ArrayList<>();
  for (int value = 0; value < 10; value++) {
    inserts.add(value);
  }

  // Create a windowing strategy that puts the input into five different windows depending on
  // record value.
  WindowFn<Integer, PartitionedGlobalWindow> windowFn =
      new PartitionedGlobalWindows<>(value -> Integer.toString(value % 5));

  final Map<Integer, TableDestination> targetTables = Maps.newHashMap();
  Map<String, String> schemas = Maps.newHashMap();
  for (int bucket = 0; bucket < 5; bucket++) {
    TableDestination destination =
        new TableDestination("project-id:dataset-id" + ".table-id-" + bucket, "");
    targetTables.put(bucket, destination);
    // Give every target table its own schema by adding a column named after the bucket.
    TableSchema tableSchema =
        new TableSchema()
            .setFields(
                ImmutableList.of(
                    new TableFieldSchema().setName("name").setType("STRING"),
                    new TableFieldSchema().setName("number").setType("INTEGER"),
                    new TableFieldSchema().setName("custom_" + bucket).setType("STRING")));
    schemas.put(destination.getTableSpec(), toJsonString(tableSchema));
  }

  SerializableFunction<ValueInSingleWindow<Integer>, TableDestination> tableFunction =
      element -> {
        PartitionedGlobalWindow window = (PartitionedGlobalWindow) element.getWindow();
        int bucket = element.getValue() % 5;
        // Check that we can access the element as well here and that it matches the window.
        checkArgument(window.value.equals(Integer.toString(bucket)), "Incorrect element");
        return targetTables.get(bucket);
      };

  PCollection<Integer> input = p.apply("CreateSource", Create.of(inserts));
  if (useStreaming) {
    input = input.setIsBoundedInternal(PCollection.IsBounded.UNBOUNDED);
  }

  PCollectionView<Map<String, String>> schemasView =
      p.apply("CreateSchemaMap", Create.of(schemas)).apply("ViewSchemaAsMap", View.asMap());

  input
      .apply(Window.into(windowFn))
      .apply(
          BigQueryIO.<Integer>write()
              .to(tableFunction)
              .withFormatFunction(
                  value ->
                      new TableRow()
                          .set("name", "number" + value)
                          .set("number", Integer.toString(value)))
              .withCreateDisposition(BigQueryIO.Write.CreateDisposition.CREATE_IF_NEEDED)
              .withSchemaFromView(schemasView)
              .withTestServices(fakeBqServices)
              .withoutValidation());
  p.run();

  for (int bucket = 0; bucket < 5; ++bucket) {
    String tableId = String.format("table-id-%d", bucket);
    String tableSpec = String.format("project-id:dataset-id.%s", tableId);

    // Verify that table was created with the correct schema.
    TableReference tableRef =
        new TableReference()
            .setProjectId("project-id")
            .setDatasetId("dataset-id")
            .setTableId(tableId);
    assertThat(
        toJsonString(fakeDatasetService.getTable(tableRef).getSchema()),
        equalTo(schemas.get(tableSpec)));

    // Verify that the table has the expected contents: the two inputs that share this bucket.
    TableRow lowRow =
        new TableRow()
            .set("name", String.format("number%d", bucket))
            .set("number", Integer.toString(bucket));
    TableRow highRow =
        new TableRow()
            .set("name", String.format("number%d", bucket + 5))
            .set("number", Integer.toString(bucket + 5));
    assertThat(
        fakeDatasetService.getAllRows("project-id", "dataset-id", tableId),
        containsInAnyOrder(lowRow, highRow));
  }
}
Use of org.apache.beam.sdk.io.gcp.bigquery.BigQueryIO.Write in project Beam by Apache.
The class BigQueryIOWriteTest, method testSchemaWriteLoads.
/**
 * Writes four {@code SchemaPojo} elements with {@code useBeamSchema()} and verifies the rows
 * stored by the fake dataset service.
 *
 * <p>The write method is chosen from the test flags: {@code FILE_LOADS} unless
 * {@code useStorageApi} is set, in which case {@code useStorageApiApproximate} selects between
 * {@code STORAGE_API_AT_LEAST_ONCE} and {@code STORAGE_WRITE_API}.
 */
@Test
public void testSchemaWriteLoads() throws Exception {
  // withMethod overrides the pipeline option, so we need to explicitly request
  // STORAGE_API_WRITES.
  final BigQueryIO.Write.Method method;
  if (!useStorageApi) {
    method = Method.FILE_LOADS;
  } else if (useStorageApiApproximate) {
    method = Method.STORAGE_API_AT_LEAST_ONCE;
  } else {
    method = Method.STORAGE_WRITE_API;
  }

  PCollection<SchemaPojo> pojos =
      p.apply(
          Create.of(
              new SchemaPojo("a", 1),
              new SchemaPojo("b", 2),
              new SchemaPojo("c", 3),
              new SchemaPojo("d", 4)));
  pojos.apply(
      BigQueryIO.<SchemaPojo>write()
          .to("project-id:dataset-id.table-id")
          .withCreateDisposition(BigQueryIO.Write.CreateDisposition.CREATE_IF_NEEDED)
          .withMethod(method)
          .useBeamSchema()
          .withTestServices(fakeBqServices)
          .withoutValidation());
  p.run();

  // Diagnostic dump of what was written, to help when the assertion below fails.
  System.err.println(
      "Wrote: " + fakeDatasetService.getAllRows("project-id", "dataset-id", "table-id"));

  assertThat(
      fakeDatasetService.getAllRows("project-id", "dataset-id", "table-id"),
      containsInAnyOrder(
          new TableRow().set("name", "a").set("number", "1"),
          new TableRow().set("name", "b").set("number", "2"),
          new TableRow().set("name", "c").set("number", "3"),
          new TableRow().set("name", "d").set("number", "4")));
}
Aggregations