
Example 11 with TimeMonitor

use of org.apache.beam.sdk.testutils.metrics.TimeMonitor in project beam by apache.

the class JdbcIOIT method runRead.

/**
 * Read the test dataset from postgres and validate its contents.
 *
 * <p>When doing the validation, we wish to: 1. Ensure that *all* the rows are correct. 2. Provide
 * enough information in assertions so that it is easy to spot obvious errors (e.g. all elements
 * share the same mistake, or "only 5 elements were generated" and the user wants to see what the
 * problem was).
 *
 * <p>We do not wish to generate and compare all of the expected values, so this method uses
 * hashing to ensure that all expected data is present. However, hashing does not provide easy
 * debugging information (failures like "every element was the empty string" are hard to see), so
 * we also: 1. Generate expected values for the first and last 500 rows. 2. Use containsInAnyOrder
 * to verify that their values are correct. The first/last 500 rows are well-defined because every
 * row has a unique id, so we can use the natural ordering of that key.
 */
private PipelineResult runRead() {
    PCollection<TestRow> namesAndIds =
        pipelineRead
            .apply(
                JdbcIO.<TestRow>read()
                    .withDataSourceConfiguration(JdbcIO.DataSourceConfiguration.create(dataSource))
                    .withQuery(String.format("select name,id from %s;", tableName))
                    .withRowMapper(new JdbcTestHelper.CreateTestRowOfNameAndId()))
            // Record a timestamp per element so the read runtime can be derived later.
            .apply(ParDo.of(new TimeMonitor<>(NAMESPACE, "read_time")));

    // Assert that the total row count matches the number of rows written.
    PAssert.thatSingleton(namesAndIds.apply("Count All", Count.globally()))
        .isEqualTo((long) numberOfRows);

    // Hash all row contents and compare against the precomputed expected hash.
    PCollection<String> consolidatedHashcode =
        namesAndIds
            .apply(ParDo.of(new TestRow.SelectNameFn()))
            .apply("Hash row contents", Combine.globally(new HashingFn()).withoutDefaults());
    PAssert.that(consolidatedHashcode)
        .containsInAnyOrder(TestRow.getExpectedHashForRowCount(numberOfRows));

    // Spot-check the first and last 500 rows (by unique id) against generated expected values.
    PCollection<List<TestRow>> frontOfList = namesAndIds.apply(Top.smallest(500));
    Iterable<TestRow> expectedFrontOfList = TestRow.getExpectedValues(0, 500);
    PAssert.thatSingletonIterable(frontOfList).containsInAnyOrder(expectedFrontOfList);

    PCollection<List<TestRow>> backOfList = namesAndIds.apply(Top.largest(500));
    Iterable<TestRow> expectedBackOfList =
        TestRow.getExpectedValues(numberOfRows - 500, numberOfRows);
    PAssert.thatSingletonIterable(backOfList).containsInAnyOrder(expectedBackOfList);

    return pipelineRead.run();
}
Also used : TimeMonitor(org.apache.beam.sdk.testutils.metrics.TimeMonitor) TestRow(org.apache.beam.sdk.io.common.TestRow) ArrayList(java.util.ArrayList) List(java.util.List) HashingFn(org.apache.beam.sdk.io.common.HashingFn)
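For reference, TimeMonitor itself is a simple pass-through DoFn: it records the wall-clock time at which each element flows past into a Distribution metric and emits the element unchanged, so the distribution's minimum and maximum can later be read back to compute read/write runtimes. A minimal sketch in that spirit (the class and field names here are illustrative, not the actual Beam source):

import org.apache.beam.sdk.metrics.Distribution;
import org.apache.beam.sdk.metrics.Metrics;
import org.apache.beam.sdk.transforms.DoFn;

// Illustrative pass-through DoFn in the style of TimeMonitor.
public class ElementTimeMonitorFn<T> extends DoFn<T, T> {

    private final Distribution timeDistribution;

    public ElementTimeMonitorFn(String namespace, String name) {
        // Metrics.distribution creates a named Distribution metric.
        this.timeDistribution = Metrics.distribution(namespace, name);
    }

    @ProcessElement
    public void processElement(ProcessContext c) {
        // Record when this element was seen, then pass it through unchanged.
        timeDistribution.update(System.currentTimeMillis());
        c.output(c.element());
    }
}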

Example 12 with TimeMonitor

use of org.apache.beam.sdk.testutils.metrics.TimeMonitor in project beam by apache.

the class KafkaIOIT method testKafkaIOReadsAndWritesCorrectlyInBatch.

@Test
public void testKafkaIOReadsAndWritesCorrectlyInBatch() throws IOException {
    // Expected hashes for fixed-size collections of 100-byte records (10-byte key, 90-byte
    // value), keyed by record count.
    Map<Long, String> expectedHashes =
        ImmutableMap.of(
            1000L, "4507649971ee7c51abbb446e65a5c660",
            100_000_000L, "0f12c27c9a7672e14775594be66cad9a");
    expectedHashcode = getHashForRecordCount(sourceOptions.numRecords, expectedHashes);

    writePipeline
        .apply("Generate records", Read.from(new SyntheticBoundedSource(sourceOptions)))
        .apply("Measure write time", ParDo.of(new TimeMonitor<>(NAMESPACE, WRITE_TIME_METRIC_NAME)))
        .apply("Write to Kafka", writeToKafka());

    PCollection<String> hashcode =
        readPipeline
            .apply("Read from bounded Kafka", readFromBoundedKafka())
            .apply("Measure read time", ParDo.of(new TimeMonitor<>(NAMESPACE, READ_TIME_METRIC_NAME)))
            .apply("Map records to strings", MapElements.via(new MapKafkaRecordsToStrings()))
            .apply("Calculate hashcode", Combine.globally(new HashingFn()).withoutDefaults());
    PAssert.thatSingleton(hashcode).isEqualTo(expectedHashcode);

    PipelineResult writeResult = writePipeline.run();
    writeResult.waitUntilFinish();

    PipelineResult readResult = readPipeline.run();
    PipelineResult.State readState =
        readResult.waitUntilFinish(Duration.standardSeconds(options.getReadTimeout()));
    // Cancel the read pipeline if it did not finish within the configured timeout.
    cancelIfTimeouted(readResult, readState);
    if (!options.isWithTestcontainers()) {
        Set<NamedTestResult> metrics = readMetrics(writeResult, readResult);
        IOITMetrics.publishToInflux(TEST_ID, TIMESTAMP, metrics, settings);
    }
}
Also used : TimeMonitor(org.apache.beam.sdk.testutils.metrics.TimeMonitor) PipelineResult(org.apache.beam.sdk.PipelineResult) SyntheticOptions.fromJsonString(org.apache.beam.sdk.io.synthetic.SyntheticOptions.fromJsonString) HashingFn(org.apache.beam.sdk.io.common.HashingFn) SyntheticBoundedSource(org.apache.beam.sdk.io.synthetic.SyntheticBoundedSource) NamedTestResult(org.apache.beam.sdk.testutils.NamedTestResult) Test(org.junit.Test)
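The getHashForRecordCount helper called above is not shown in this snippet. A plausible sketch, assuming it simply looks up the expected hash for the configured record count and fails fast for unsupported dataset sizes (the exception choice is an assumption):

import java.util.Map;

// Hypothetical sketch of the helper: resolve the expected hash by record count.
private static String getHashForRecordCount(long recordCount, Map<Long, String> hashes) {
    String hash = hashes.get(recordCount);
    if (hash == null) {
        // Only dataset sizes with precomputed hashes can be verified.
        throw new UnsupportedOperationException(
            String.format("No expected hash is known for %d records", recordCount));
    }
    return hash;
}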

Example 13 with TimeMonitor

use of org.apache.beam.sdk.testutils.metrics.TimeMonitor in project beam by apache.

the class XmlIOIT method writeThenReadAll.

@Test
public void writeThenReadAll() {
    PCollection<String> testFileNames =
        pipeline
            .apply("Generate sequence", GenerateSequence.from(0).to(numberOfTextLines))
            .apply("Create xml records", MapElements.via(new LongToBird()))
            .apply("Gather write start time", ParDo.of(new TimeMonitor<>(XMLIOIT_NAMESPACE, "writeStart")))
            .apply(
                "Write xml files",
                FileIO.<Bird>write()
                    .via(XmlIO.sink(Bird.class).withRootElement("birds").withCharset(charset))
                    .to(filenamePrefix)
                    .withPrefix("birds")
                    .withSuffix(".xml"))
            .getPerDestinationOutputFilenames()
            .apply("Gather write end time", ParDo.of(new TimeMonitor<>(XMLIOIT_NAMESPACE, "writeEnd")))
            .apply("Get file names", Values.create());

    PCollection<Bird> birds =
        testFileNames
            .apply("Find files", FileIO.matchAll())
            .apply("Read matched files", FileIO.readMatches())
            .apply("Gather read start time", ParDo.of(new TimeMonitor<>(XMLIOIT_NAMESPACE, "readStart")))
            .apply(
                "Read xml files",
                XmlIO.<Bird>readFiles()
                    .withRecordClass(Bird.class)
                    .withRootElement("birds")
                    .withRecordElement("bird")
                    .withCharset(charset))
            .apply("Gather read end time", ParDo.of(new TimeMonitor<>(XMLIOIT_NAMESPACE, "readEnd")));

    PCollection<String> consolidatedHashcode =
        birds
            .apply("Map xml records to strings", MapElements.via(new BirdToString()))
            .apply("Calculate hashcode", Combine.globally(new HashingFn()));
    PAssert.thatSingleton(consolidatedHashcode).isEqualTo(expectedHash);

    testFileNames.apply(
        "Delete test files",
        ParDo.of(new FileBasedIOITHelper.DeleteFileFn())
            .withSideInputs(consolidatedHashcode.apply(View.asSingleton())));

    PipelineResult result = pipeline.run();
    result.waitUntilFinish();
    collectAndPublishResults(result);
}
Also used : TimeMonitor(org.apache.beam.sdk.testutils.metrics.TimeMonitor) FileBasedIOITHelper(org.apache.beam.sdk.io.common.FileBasedIOITHelper) PipelineResult(org.apache.beam.sdk.PipelineResult) HashingFn(org.apache.beam.sdk.io.common.HashingFn) Test(org.junit.Test)
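XmlIO maps records to and from XML via JAXB, so the Bird class used above (not shown) must be a JAXB-annotated POJO with a no-argument constructor, whose root element name matches withRecordElement("bird"). A hypothetical minimal version with a single field (the real Bird in the Beam test suite may carry more):

import javax.xml.bind.annotation.XmlRootElement;

// Hypothetical minimal JAXB record class for XmlIO.
@XmlRootElement(name = "bird")
public class Bird {

    private String name;

    // JAXB requires a no-argument constructor.
    public Bird() {}

    public Bird(String name) {
        this.name = name;
    }

    public String getName() {
        return name;
    }

    public void setName(String name) {
        this.name = name;
    }
}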

Example 14 with TimeMonitor

use of org.apache.beam.sdk.testutils.metrics.TimeMonitor in project beam by apache.

the class ParquetIOIT method writeThenReadAll.

@Test
public void writeThenReadAll() {
    PCollection<String> testFiles =
        pipeline
            .apply("Generate sequence", GenerateSequence.from(0).to(numberOfTextLines))
            .apply(
                "Produce text lines",
                ParDo.of(new FileBasedIOITHelper.DeterministicallyConstructTestTextLineFn()))
            .apply("Produce Avro records", ParDo.of(new DeterministicallyConstructAvroRecordsFn()))
            .setCoder(AvroCoder.of(SCHEMA))
            .apply("Gather write start times", ParDo.of(new TimeMonitor<>(PARQUET_NAMESPACE, "writeStart")))
            .apply(
                "Write Parquet files",
                FileIO.<GenericRecord>write().via(ParquetIO.sink(SCHEMA)).to(filenamePrefix))
            .getPerDestinationOutputFilenames()
            .apply("Gather write end times", ParDo.of(new TimeMonitor<>(PARQUET_NAMESPACE, "writeEnd")))
            .apply("Get file names", Values.create());

    PCollection<String> consolidatedHashcode =
        testFiles
            .apply("Find files", FileIO.matchAll())
            .apply("Read matched files", FileIO.readMatches())
            .apply("Gather read start time", ParDo.of(new TimeMonitor<>(PARQUET_NAMESPACE, "readStart")))
            .apply("Read parquet files", ParquetIO.readFiles(SCHEMA))
            .apply("Gather read end time", ParDo.of(new TimeMonitor<>(PARQUET_NAMESPACE, "readEnd")))
            .apply(
                "Map records to strings",
                MapElements.into(strings())
                    .via(
                        (SerializableFunction<GenericRecord, String>)
                            record -> String.valueOf(record.get("row"))))
            .apply("Calculate hashcode", Combine.globally(new HashingFn()));
    PAssert.thatSingleton(consolidatedHashcode).isEqualTo(expectedHash);

    testFiles.apply(
        "Delete test files",
        ParDo.of(new FileBasedIOITHelper.DeleteFileFn())
            .withSideInputs(consolidatedHashcode.apply(View.asSingleton())));

    PipelineResult result = pipeline.run();
    result.waitUntilFinish();
    collectAndPublishMetrics(result);
}
Also used : TimeMonitor(org.apache.beam.sdk.testutils.metrics.TimeMonitor) GenericRecordBuilder(org.apache.avro.generic.GenericRecordBuilder) FileIO(org.apache.beam.sdk.io.FileIO) BeforeClass(org.junit.BeforeClass) PipelineResult(org.apache.beam.sdk.PipelineResult) MetricsReader(org.apache.beam.sdk.testutils.metrics.MetricsReader) Combine(org.apache.beam.sdk.transforms.Combine) RunWith(org.junit.runner.RunWith) SerializableFunction(org.apache.beam.sdk.transforms.SerializableFunction) View(org.apache.beam.sdk.transforms.View) Timestamp(com.google.cloud.Timestamp) FileBasedIOITHelper(org.apache.beam.sdk.io.common.FileBasedIOITHelper) Function(java.util.function.Function) HashSet(java.util.HashSet) InfluxDBSettings(org.apache.beam.sdk.testutils.publishing.InfluxDBSettings) TestPipeline(org.apache.beam.sdk.testing.TestPipeline) TypeDescriptors.strings(org.apache.beam.sdk.values.TypeDescriptors.strings) NamedTestResult(org.apache.beam.sdk.testutils.NamedTestResult) DoFn(org.apache.beam.sdk.transforms.DoFn) MapElements(org.apache.beam.sdk.transforms.MapElements) GenericRecord(org.apache.avro.generic.GenericRecord) HashingFn(org.apache.beam.sdk.io.common.HashingFn) Schema(org.apache.avro.Schema) FileBasedIOITHelper.readFileBasedIOITPipelineOptions(org.apache.beam.sdk.io.common.FileBasedIOITHelper.readFileBasedIOITPipelineOptions) PAssert(org.apache.beam.sdk.testing.PAssert) FileBasedIOITHelper.appendTimestampSuffix(org.apache.beam.sdk.io.common.FileBasedIOITHelper.appendTimestampSuffix) Set(java.util.Set) GenerateSequence(org.apache.beam.sdk.io.GenerateSequence) Test(org.junit.Test) UUID(java.util.UUID) JUnit4(org.junit.runners.JUnit4) PCollection(org.apache.beam.sdk.values.PCollection) AvroCoder(org.apache.beam.sdk.coders.AvroCoder) IOITMetrics(org.apache.beam.sdk.testutils.metrics.IOITMetrics) Rule(org.junit.Rule) ParDo(org.apache.beam.sdk.transforms.ParDo) FileBasedIOTestPipelineOptions(org.apache.beam.sdk.io.common.FileBasedIOTestPipelineOptions) Values(org.apache.beam.sdk.transforms.Values)
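The SCHEMA constant is not shown in this snippet. Since the read side only extracts a "row" field from each GenericRecord, a single-field Avro record schema would be sufficient; a sketch under that assumption (the actual schema in ParquetIOIT may differ):

import org.apache.avro.Schema;

// Assumed single-field Avro schema matching the record.get("row") access above.
private static final Schema SCHEMA =
    new Schema.Parser()
        .parse(
            "{\"type\":\"record\",\"name\":\"TestAvroLine\","
                + "\"fields\":[{\"name\":\"row\",\"type\":\"string\"}]}");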

Aggregations

TimeMonitor (org.apache.beam.sdk.testutils.metrics.TimeMonitor): 14
PipelineResult (org.apache.beam.sdk.PipelineResult): 13
Test (org.junit.Test): 11
HashingFn (org.apache.beam.sdk.io.common.HashingFn): 8
FileBasedIOITHelper (org.apache.beam.sdk.io.common.FileBasedIOITHelper): 4
BeamRelNode (org.apache.beam.sdk.extensions.sql.impl.rel.BeamRelNode): 3
SyntheticBoundedSource (org.apache.beam.sdk.io.synthetic.SyntheticBoundedSource): 3
NamedTestResult (org.apache.beam.sdk.testutils.NamedTestResult): 3
ArrayList (java.util.ArrayList): 2
Pipeline (org.apache.beam.sdk.Pipeline): 2
DeleteFileFn (org.apache.beam.sdk.io.common.FileBasedIOITHelper.DeleteFileFn): 2
TestRow (org.apache.beam.sdk.io.common.TestRow): 2
TableFieldSchema (com.google.api.services.bigquery.model.TableFieldSchema): 1
TableSchema (com.google.api.services.bigquery.model.TableSchema): 1
Timestamp (com.google.cloud.Timestamp): 1
HashSet (java.util.HashSet): 1
List (java.util.List): 1
Set (java.util.Set): 1
UUID (java.util.UUID): 1
Function (java.util.function.Function): 1