use of org.apache.beam.sdk.testutils.metrics.TimeMonitor in project beam by apache.
the class JdbcIOIT method runRead.
/**
 * Read the test dataset from postgres and validate its contents.
 *
 * <p>When doing the validation, we wish to: 1. Ensure that *all* the rows are correct. 2. Provide
 * enough information in assertions so that it is easy to spot obvious errors (e.g. all elements
 * have a similar mistake, or "only 5 elements were generated" and the user wants to see what the
 * problem was).
 *
 * <p>We do not wish to generate and compare all of the expected values, so this method uses
 * hashing to ensure that all expected data is present. However, hashing does not provide easy
 * debugging information (failures like "every element was the empty string" are hard to see), so
 * we also: 1. Generate expected values for the first and last 500 rows. 2. Use containsInAnyOrder
 * to verify that their values are correct. The first/last 500 rows can be determined because we
 * know all rows have a unique id, so we can use the natural ordering of that key.
 */
private PipelineResult runRead() {
  PCollection<TestRow> namesAndIds =
      pipelineRead
          .apply(
              JdbcIO.<TestRow>read()
                  .withDataSourceConfiguration(JdbcIO.DataSourceConfiguration.create(dataSource))
                  .withQuery(String.format("select name,id from %s;", tableName))
                  .withRowMapper(new JdbcTestHelper.CreateTestRowOfNameAndId()))
          .apply(ParDo.of(new TimeMonitor<>(NAMESPACE, "read_time")));

  PAssert.thatSingleton(namesAndIds.apply("Count All", Count.globally()))
      .isEqualTo((long) numberOfRows);

  PCollection<String> consolidatedHashcode =
      namesAndIds
          .apply(ParDo.of(new TestRow.SelectNameFn()))
          .apply("Hash row contents", Combine.globally(new HashingFn()).withoutDefaults());
  PAssert.that(consolidatedHashcode)
      .containsInAnyOrder(TestRow.getExpectedHashForRowCount(numberOfRows));

  PCollection<List<TestRow>> frontOfList = namesAndIds.apply(Top.smallest(500));
  Iterable<TestRow> expectedFrontOfList = TestRow.getExpectedValues(0, 500);
  PAssert.thatSingletonIterable(frontOfList).containsInAnyOrder(expectedFrontOfList);

  PCollection<List<TestRow>> backOfList = namesAndIds.apply(Top.largest(500));
  Iterable<TestRow> expectedBackOfList =
      TestRow.getExpectedValues(numberOfRows - 500, numberOfRows);
  PAssert.thatSingletonIterable(backOfList).containsInAnyOrder(expectedBackOfList);

  return pipelineRead.run();
}
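Each of these tests wraps a pass-through TimeMonitor ParDo around the read and write steps so that timestamps are recorded as pipeline metrics under a namespace and metric name. As a rough idea of how such a DoFn can be built, the sketch below records the current wall-clock time into a Distribution metric and emits each element unchanged; the class name and exact behaviour here are assumptions for illustration, not Beam's actual TimeMonitor implementation.

// A minimal sketch of a pass-through timing DoFn in the spirit of TimeMonitor.
// Names and exact behaviour are assumptions, not the Beam implementation.
import org.apache.beam.sdk.metrics.Distribution;
import org.apache.beam.sdk.metrics.Metrics;
import org.apache.beam.sdk.transforms.DoFn;

public class TimeMonitorSketch<T> extends DoFn<T, T> {
  private final Distribution timeDistribution;

  public TimeMonitorSketch(String namespace, String name) {
    this.timeDistribution = Metrics.distribution(namespace, name);
  }

  @ProcessElement
  public void processElement(ProcessContext c) {
    // Record the wall-clock time at which this element was seen, then pass it through unchanged.
    timeDistribution.update(System.currentTimeMillis());
    c.output(c.element());
  }
}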
use of org.apache.beam.sdk.testutils.metrics.TimeMonitor in project beam by apache.
the class KafkaIOIT method testKafkaIOReadsAndWritesCorrectlyInBatch.
@Test
public void testKafkaIOReadsAndWritesCorrectlyInBatch() throws IOException {
// Map of expected hashes for collections of a given size, built from 100-byte records
// (10-byte key, 90-byte value).
Map<Long, String> expectedHashes =
    ImmutableMap.of(
        1000L, "4507649971ee7c51abbb446e65a5c660",
        100_000_000L, "0f12c27c9a7672e14775594be66cad9a");
expectedHashcode = getHashForRecordCount(sourceOptions.numRecords, expectedHashes);

writePipeline
    .apply("Generate records", Read.from(new SyntheticBoundedSource(sourceOptions)))
    .apply("Measure write time", ParDo.of(new TimeMonitor<>(NAMESPACE, WRITE_TIME_METRIC_NAME)))
    .apply("Write to Kafka", writeToKafka());

PCollection<String> hashcode =
    readPipeline
        .apply("Read from bounded Kafka", readFromBoundedKafka())
        .apply("Measure read time", ParDo.of(new TimeMonitor<>(NAMESPACE, READ_TIME_METRIC_NAME)))
        .apply("Map records to strings", MapElements.via(new MapKafkaRecordsToStrings()))
        .apply("Calculate hashcode", Combine.globally(new HashingFn()).withoutDefaults());
PAssert.thatSingleton(hashcode).isEqualTo(expectedHashcode);
PipelineResult writeResult = writePipeline.run();
writeResult.waitUntilFinish();
PipelineResult readResult = readPipeline.run();
PipelineResult.State readState = readResult.waitUntilFinish(Duration.standardSeconds(options.getReadTimeout()));
cancelIfTimeouted(readResult, readState);
if (!options.isWithTestcontainers()) {
Set<NamedTestResult> metrics = readMetrics(writeResult, readResult);
IOITMetrics.publishToInflux(TEST_ID, TIMESTAMP, metrics, settings);
}
}
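The test looks up its expected hash by record count via getHashForRecordCount. A hedged sketch of how such a helper could look is shown below; the real implementation in KafkaIOIT may differ, and the exception type chosen here is an assumption.

// A sketch of a lookup helper like getHashForRecordCount: return the precomputed hash
// for the given record count, or fail if no hash is known for that count.
private static String getHashForRecordCount(long recordCount, Map<Long, String> hashes) {
  String hash = hashes.get(recordCount);
  if (hash == null) {
    // Only record counts with a precomputed expected hash can be verified.
    throw new UnsupportedOperationException(
        String.format("No expected hash for record count: %s", recordCount));
  }
  return hash;
}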
use of org.apache.beam.sdk.testutils.metrics.TimeMonitor in project beam by apache.
the class XmlIOIT method writeThenReadAll.
@Test
public void writeThenReadAll() {
PCollection<String> testFileNames =
    pipeline
        .apply("Generate sequence", GenerateSequence.from(0).to(numberOfTextLines))
        .apply("Create xml records", MapElements.via(new LongToBird()))
        .apply("Gather write start time", ParDo.of(new TimeMonitor<>(XMLIOIT_NAMESPACE, "writeStart")))
        .apply(
            "Write xml files",
            FileIO.<Bird>write()
                .via(XmlIO.sink(Bird.class).withRootElement("birds").withCharset(charset))
                .to(filenamePrefix)
                .withPrefix("birds")
                .withSuffix(".xml"))
        .getPerDestinationOutputFilenames()
        .apply("Gather write end time", ParDo.of(new TimeMonitor<>(XMLIOIT_NAMESPACE, "writeEnd")))
        .apply("Get file names", Values.create());

PCollection<Bird> birds =
    testFileNames
        .apply("Find files", FileIO.matchAll())
        .apply("Read matched files", FileIO.readMatches())
        .apply("Gather read start time", ParDo.of(new TimeMonitor<>(XMLIOIT_NAMESPACE, "readStart")))
        .apply(
            "Read xml files",
            XmlIO.<Bird>readFiles()
                .withRecordClass(Bird.class)
                .withRootElement("birds")
                .withRecordElement("bird")
                .withCharset(charset))
        .apply("Gather read end time", ParDo.of(new TimeMonitor<>(XMLIOIT_NAMESPACE, "readEnd")));

PCollection<String> consolidatedHashcode =
    birds
        .apply("Map xml records to strings", MapElements.via(new BirdToString()))
        .apply("Calculate hashcode", Combine.globally(new HashingFn()));
PAssert.thatSingleton(consolidatedHashcode).isEqualTo(expectedHash);
testFileNames.apply(
    "Delete test files",
    ParDo.of(new FileBasedIOITHelper.DeleteFileFn())
        .withSideInputs(consolidatedHashcode.apply(View.asSingleton())));
PipelineResult result = pipeline.run();
result.waitUntilFinish();
collectAndPublishResults(result);
}
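XmlIO's sink and readFiles both operate on a JAXB-annotated record class (Bird here). As a rough illustration of what such a class needs, the sketch below shows a minimal JAXB element type; the field names and element name are assumptions, not the actual Bird class used by XmlIOIT.

// A minimal sketch of a JAXB-annotated element class usable with XmlIO.
// Field names and the element name are assumptions for illustration only.
import javax.xml.bind.annotation.XmlRootElement;

@XmlRootElement(name = "bird")
public class Bird {
  private long id;
  private String name;

  // JAXB requires a public no-arg constructor.
  public Bird() {}

  public Bird(long id, String name) {
    this.id = id;
    this.name = name;
  }

  public long getId() { return id; }
  public void setId(long id) { this.id = id; }
  public String getName() { return name; }
  public void setName(String name) { this.name = name; }
}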
use of org.apache.beam.sdk.testutils.metrics.TimeMonitor in project beam by apache.
the class ParquetIOIT method writeThenReadAll.
@Test
public void writeThenReadAll() {
PCollection<String> testFiles =
    pipeline
        .apply("Generate sequence", GenerateSequence.from(0).to(numberOfTextLines))
        .apply(
            "Produce text lines",
            ParDo.of(new FileBasedIOITHelper.DeterministicallyConstructTestTextLineFn()))
        .apply("Produce Avro records", ParDo.of(new DeterministicallyConstructAvroRecordsFn()))
        .setCoder(AvroCoder.of(SCHEMA))
        .apply("Gather write start times", ParDo.of(new TimeMonitor<>(PARQUET_NAMESPACE, "writeStart")))
        .apply(
            "Write Parquet files",
            FileIO.<GenericRecord>write().via(ParquetIO.sink(SCHEMA)).to(filenamePrefix))
        .getPerDestinationOutputFilenames()
        .apply("Gather write end times", ParDo.of(new TimeMonitor<>(PARQUET_NAMESPACE, "writeEnd")))
        .apply("Get file names", Values.create());

PCollection<String> consolidatedHashcode =
    testFiles
        .apply("Find files", FileIO.matchAll())
        .apply("Read matched files", FileIO.readMatches())
        .apply("Gather read start time", ParDo.of(new TimeMonitor<>(PARQUET_NAMESPACE, "readStart")))
        .apply("Read parquet files", ParquetIO.readFiles(SCHEMA))
        .apply("Gather read end time", ParDo.of(new TimeMonitor<>(PARQUET_NAMESPACE, "readEnd")))
        .apply(
            "Map records to strings",
            MapElements.into(strings())
                .via(
                    (SerializableFunction<GenericRecord, String>)
                        record -> String.valueOf(record.get("row"))))
        .apply("Calculate hashcode", Combine.globally(new HashingFn()));
PAssert.thatSingleton(consolidatedHashcode).isEqualTo(expectedHash);
testFiles.apply(
    "Delete test files",
    ParDo.of(new FileBasedIOITHelper.DeleteFileFn())
        .withSideInputs(consolidatedHashcode.apply(View.asSingleton())));
PipelineResult result = pipeline.run();
result.waitUntilFinish();
collectAndPublishMetrics(result);
}
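The write side converts each generated text line into an Avro GenericRecord via DeterministicallyConstructAvroRecordsFn, and the read side maps records back to strings through record.get("row"). The sketch below shows one plausible shape for such a DoFn, using a schema with a single string field "row"; the field name comes from the read side above, but the full schema and class structure are assumptions, not the actual ParquetIOIT code.

// A hedged sketch of a DoFn that wraps each text line into a GenericRecord.
// The schema here (single string field "row") is an assumption for illustration.
import org.apache.avro.Schema;
import org.apache.avro.SchemaBuilder;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericRecord;
import org.apache.beam.sdk.transforms.DoFn;

class ConstructAvroRecordsFnSketch extends DoFn<String, GenericRecord> {
  private static final Schema SCHEMA =
      SchemaBuilder.record("TestAvroLine").fields().requiredString("row").endRecord();

  @ProcessElement
  public void processElement(ProcessContext c) {
    // Put the generated text line into the "row" field and emit the record.
    GenericRecord record = new GenericData.Record(SCHEMA);
    record.put("row", c.element());
    c.output(record);
  }
}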