Use of org.apache.beam.sdk.transforms.SerializableFunction in the Apache Beam project.
Class ParquetIOIT, method writeThenReadAll.
/**
 * Writes generated Avro records to Parquet files, reads the written files back, and asserts that
 * the combined hash of each record's {@code "row"} field equals {@code expectedHash}.
 *
 * <p>{@code TimeMonitor} steps labelled {@code writeStart}, {@code writeEnd}, {@code readStart}
 * and {@code readEnd} are placed around the write and read transforms. The delete step takes the
 * computed hash as a side input, and metrics are collected and published after the pipeline has
 * finished.
 */
@Test
public void writeThenReadAll() {
  // Write phase, part 1: produce the Avro records and pin their coder.
  PCollection<GenericRecord> generatedRecords =
      pipeline
          .apply("Generate sequence", GenerateSequence.from(0).to(numberOfTextLines))
          .apply(
              "Produce text lines",
              ParDo.of(new FileBasedIOITHelper.DeterministicallyConstructTestTextLineFn()))
          .apply("Produce Avro records", ParDo.of(new DeterministicallyConstructAvroRecordsFn()))
          .setCoder(AvroCoder.of(SCHEMA));

  // Write phase, part 2: write the records as Parquet and keep the resulting file names.
  PCollection<String> testFiles =
      generatedRecords
          .apply(
              "Gather write start times",
              ParDo.of(new TimeMonitor<>(PARQUET_NAMESPACE, "writeStart")))
          .apply(
              "Write Parquet files",
              FileIO.<GenericRecord>write().via(ParquetIO.sink(SCHEMA)).to(filenamePrefix))
          .getPerDestinationOutputFilenames()
          .apply(
              "Gather write end times",
              ParDo.of(new TimeMonitor<>(PARQUET_NAMESPACE, "writeEnd")))
          .apply("Get file names", Values.create());

  // Read phase, part 1: match the written files and read them back as records.
  PCollection<GenericRecord> recordsReadBack =
      testFiles
          .apply("Find files", FileIO.matchAll())
          .apply("Read matched files", FileIO.readMatches())
          .apply(
              "Gather read start time",
              ParDo.of(new TimeMonitor<>(PARQUET_NAMESPACE, "readStart")))
          .apply("Read parquet files", ParquetIO.readFiles(SCHEMA))
          .apply(
              "Gather read end time",
              ParDo.of(new TimeMonitor<>(PARQUET_NAMESPACE, "readEnd")));

  // Read phase, part 2: reduce every record to its "row" value and fold them into one hash.
  PCollection<String> consolidatedHashcode =
      recordsReadBack
          .apply(
              "Map records to strings",
              MapElements.into(strings())
                  .via(
                      (SerializableFunction<GenericRecord, String>)
                          record -> String.valueOf(record.get("row"))))
          .apply("Calculate hashcode", Combine.globally(new HashingFn()));

  PAssert.thatSingleton(consolidatedHashcode).isEqualTo(expectedHash);

  // The hash is passed to the delete step as a side input.
  testFiles.apply(
      "Delete test files",
      ParDo.of(new FileBasedIOITHelper.DeleteFileFn())
          .withSideInputs(consolidatedHashcode.apply(View.asSingleton())));

  PipelineResult result = pipeline.run();
  result.waitUntilFinish();
  collectAndPublishMetrics(result);
}
Use of org.apache.beam.sdk.transforms.SerializableFunction in the Apache Beam project.
Class BigQueryIOWriteTest, method testWriteWithDynamicTables.
/**
 * Checks that a BigQuery write can pick its destination table per element through a table
 * function, with each table's schema looked up from a side-input view.
 *
 * <p>The integers 0 through 9 are windowed by their value modulo 5 and routed to
 * {@code table-id-0} .. {@code table-id-4}. Afterwards each table is expected to exist with its
 * own schema and to hold exactly the rows for {@code i} and {@code i + 5}.
 */
@Test
public void testWriteWithDynamicTables() throws Exception {
  List<Integer> elements = new ArrayList<>();
  for (int value = 0; value < 10; value++) {
    elements.add(value);
  }

  // Window the input into five windows keyed by the element's value modulo 5.
  WindowFn<Integer, PartitionedGlobalWindow> windowFn =
      new PartitionedGlobalWindows<>(element -> Integer.toString(element % 5));

  final Map<Integer, TableDestination> targetTables = Maps.newHashMap();
  Map<String, String> schemaJsonBySpec = Maps.newHashMap();
  for (int tableIndex = 0; tableIndex < 5; tableIndex++) {
    TableDestination destination =
        new TableDestination("project-id:dataset-id" + ".table-id-" + tableIndex, "");
    targetTables.put(tableIndex, destination);

    // Each table gets a distinct schema: two shared fields plus one field named after its index.
    TableSchema schema =
        new TableSchema()
            .setFields(
                ImmutableList.of(
                    new TableFieldSchema().setName("name").setType("STRING"),
                    new TableFieldSchema().setName("number").setType("INTEGER"),
                    new TableFieldSchema().setName("custom_" + tableIndex).setType("STRING")));
    schemaJsonBySpec.put(destination.getTableSpec(), toJsonString(schema));
  }

  SerializableFunction<ValueInSingleWindow<Integer>, TableDestination> tableFunction =
      windowedValue -> {
        PartitionedGlobalWindow window = (PartitionedGlobalWindow) windowedValue.getWindow();
        int bucket = windowedValue.getValue() % 5;
        // The element itself is also available here; it must agree with the window it is in.
        checkArgument(window.value.equals(Integer.toString(bucket)), "Incorrect element");
        return targetTables.get(bucket);
      };

  PCollection<Integer> numbers = p.apply("CreateSource", Create.of(elements));
  if (useStreaming) {
    numbers = numbers.setIsBoundedInternal(PCollection.IsBounded.UNBOUNDED);
  }

  PCollectionView<Map<String, String>> schemasView =
      p.apply("CreateSchemaMap", Create.of(schemaJsonBySpec))
          .apply("ViewSchemaAsMap", View.asMap());

  numbers
      .apply(Window.into(windowFn))
      .apply(
          BigQueryIO.<Integer>write()
              .to(tableFunction)
              .withFormatFunction(
                  i -> new TableRow().set("name", "number" + i).set("number", Integer.toString(i)))
              .withCreateDisposition(BigQueryIO.Write.CreateDisposition.CREATE_IF_NEEDED)
              .withSchemaFromView(schemasView)
              .withTestServices(fakeBqServices)
              .withoutValidation());
  p.run();

  for (int tableIndex = 0; tableIndex < 5; tableIndex++) {
    String tableId = String.format("table-id-%d", tableIndex);
    String tableSpec = String.format("project-id:dataset-id.%s", tableId);

    // The table must have been created with the schema registered for its spec.
    TableReference tableReference =
        new TableReference()
            .setProjectId("project-id")
            .setDatasetId("dataset-id")
            .setTableId(tableId);
    assertThat(
        toJsonString(fakeDatasetService.getTable(tableReference).getSchema()),
        equalTo(schemaJsonBySpec.get(tableSpec)));

    // The table must hold exactly the two elements whose value modulo 5 equals its index.
    TableRow lowerRow =
        new TableRow()
            .set("name", String.format("number%d", tableIndex))
            .set("number", Integer.toString(tableIndex));
    TableRow upperRow =
        new TableRow()
            .set("name", String.format("number%d", tableIndex + 5))
            .set("number", Integer.toString(tableIndex + 5));
    assertThat(
        fakeDatasetService.getAllRows("project-id", "dataset-id", tableId),
        containsInAnyOrder(lowerRow, upperRow));
  }
}
Aggregations