Use of org.apache.beam.sdk.transforms.Distinct in the Apache Beam project.
Class BigQueryIOWriteTest, method writeDynamicDestinations.
/**
 * Writes generated user strings (a lowercase name followed by a numeric id) through
 * {@code BigQueryIO.write()} with a dynamic destination per numeric id, then checks the rows
 * returned by {@code fakeDatasetService} for every expected table.
 *
 * <p>When neither {@code useStreaming} nor {@code useStorageApi} is set, the distinct
 * destinations reported by {@code getSuccessfulTableLoads()} are also asserted.
 *
 * @param schemas when true, a Beam schema is attached to the input collection and the write uses
 *     {@code useBeamSchema()}; otherwise a format function builds each {@code TableRow}
 * @param autoSharding when true, {@code withAutoSharding()} is applied to the write
 * @throws Exception propagated from any call made in the body
 */
public void writeDynamicDestinations(boolean schemas, boolean autoSharding) throws Exception {
  final Schema schema =
      Schema.builder().addField("name", FieldType.STRING).addField("id", FieldType.INT32).build();
  // Group 1 is the lowercase name, group 2 is the numeric user id.
  final Pattern nameAndId = Pattern.compile("([a-z]+)([0-9]+)");

  final PCollectionView<List<String>> listView =
      p.apply("Create SideInput 1", Create.of("a", "b", "c").withCoder(StringUtf8Coder.of()))
          .apply("asList", View.asList());
  final PCollectionView<Map<String, String>> mapView =
      p.apply("Create SideInput2", Create.of(KV.of("a", "a"), KV.of("b", "b"), KV.of("c", "c")))
          .apply("AsMap", View.asMap());

  final List<String> baseNames = ImmutableList.of("bill", "bob", "randolph");
  List<String> generatedUsers = Lists.newArrayList();
  // Generate ten entries for each of DEFAULT_MAX_NUM_WRITERS_PER_BUNDLE * 10 user ids.
  // NOTE(review): presumably sized so that the write has to fall back to
  // WriteGroupedRecordsToFiles -- confirm against BatchLoads.
  for (int id = 0; id < BatchLoads.DEFAULT_MAX_NUM_WRITERS_PER_BUNDLE * 10; id++) {
    // Each id receives 10 entries, each with a randomly picked base name.
    for (int n = 0; n < 10; n++) {
      int pick = ThreadLocalRandom.current().nextInt(baseNames.size());
      generatedUsers.add(baseNames.get(pick) + id);
    }
  }

  PCollection<String> users =
      p.apply("CreateUsers", Create.of(generatedUsers))
          .apply(Window.into(new PartitionedGlobalWindows<>(arg -> arg)));
  if (useStreaming) {
    users = users.setIsBoundedInternal(PCollection.IsBounded.UNBOUNDED);
  }
  if (schemas) {
    users =
        users.setSchema(
            schema,
            TypeDescriptors.strings(),
            user -> {
              Matcher m = nameAndId.matcher(user);
              checkState(m.matches());
              return Row.withSchema(schema)
                  .addValue(m.group(1))
                  .addValue(Integer.valueOf(m.group(2)))
                  .build();
            },
            row -> row.getString(0) + row.getInt32(1));
  }

  // A partition decorator is appended to every table spec to verify that decorators are
  // supported.
  final String partitionDecorator = "20171127";

  StringLongDestinations destinations =
      new StringLongDestinations() {
        @Override
        public Long getDestination(ValueInSingleWindow<String> element) {
          assertThat(element.getWindow(), Matchers.instanceOf(PartitionedGlobalWindow.class));
          Matcher m = nameAndId.matcher(element.getValue());
          checkState(m.matches());
          // The numeric user id alone identifies the destination table.
          return Long.valueOf(m.group(2));
        }

        @Override
        public TableDestination getTable(Long userId) {
          verifySideInputs();
          // Each user id gets its own table.
          return new TableDestination(
              "dataset-id.userid-" + userId + "$" + partitionDecorator,
              "table for userid " + userId);
        }

        @Override
        public TableSchema getSchema(Long userId) {
          verifySideInputs();
          return new TableSchema()
              .setFields(
                  ImmutableList.of(
                      new TableFieldSchema().setName("name").setType("STRING"),
                      new TableFieldSchema().setName("id").setType("INTEGER")));
        }

        @Override
        public List<PCollectionView<?>> getSideInputs() {
          return ImmutableList.of(listView, mapView);
        }

        /** Asserts that both side inputs expose exactly the values created above. */
        private void verifySideInputs() {
          assertThat(sideInput(listView), containsInAnyOrder("a", "b", "c"));
          Map<String, String> mapContents = sideInput(mapView);
          assertEquals(3, mapContents.size());
          assertThat(
              mapContents, allOf(hasEntry("a", "a"), hasEntry("b", "b"), hasEntry("c", "c")));
        }
      };

  BigQueryIO.Write<String> write =
      BigQueryIO.<String>write()
          .withTestServices(fakeBqServices)
          .withMaxFilesPerBundle(5)
          .withMaxFileSize(10)
          .withCreateDisposition(BigQueryIO.Write.CreateDisposition.CREATE_IF_NEEDED)
          .to(destinations)
          .withoutValidation();

  if (schemas) {
    write = write.useBeamSchema();
  } else {
    write =
        write.withFormatFunction(
            user -> {
              Matcher m = nameAndId.matcher(user);
              checkState(m.matches());
              return new TableRow().set("name", m.group(1)).set("id", m.group(2));
            });
  }
  if (autoSharding) {
    write = write.withAutoSharding();
  }

  WriteResult results = users.apply("WriteBigQuery", write);

  // Successful table loads are only asserted when neither streaming nor the storage API is used.
  if (!(useStreaming || useStorageApi)) {
    PCollection<TableDestination> loadedTables = results.getSuccessfulTableLoads();
    TableDestination[] expectedTables =
        generatedUsers.stream()
            .map(
                user -> {
                  Matcher m = nameAndId.matcher(user);
                  checkState(m.matches());
                  String userId = m.group(2);
                  return new TableDestination(
                      String.format("project-id:dataset-id.userid-%s$20171127", userId),
                      String.format("table for userid %s", userId));
                })
            .distinct()
            .toArray(TableDestination[]::new);
    PAssert.that(loadedTables.apply(Distinct.create())).containsInAnyOrder(expectedTables);
  }

  p.run();

  // Group the rows expected in each table by numeric user id.
  Map<Long, List<TableRow>> expectedRowsByUserId = Maps.newHashMap();
  for (String user : generatedUsers) {
    Matcher m = nameAndId.matcher(user);
    checkState(m.matches());
    Long userId = Long.valueOf(m.group(2));
    TableRow row = new TableRow().set("name", m.group(1)).set("id", userId.toString());
    expectedRowsByUserId.computeIfAbsent(userId, k -> Lists.newArrayList()).add(row);
  }

  // Every per-user table must hold exactly its expected rows, in any order.
  for (Map.Entry<Long, List<TableRow>> entry : expectedRowsByUserId.entrySet()) {
    String tableId = "userid-" + entry.getKey();
    TableRow[] expectedRows = Iterables.toArray(entry.getValue(), TableRow.class);
    assertThat(
        fakeDatasetService.getAllRows("project-id", "dataset-id", tableId),
        containsInAnyOrder(expectedRows));
  }
}
Aggregations