Use of org.apache.beam.sdk.io.common.TestRow in project beam by apache: the class DynamoDBIOIT, method runWrite.
/**
* Write test dataset to DynamoDB.
*/
private void runWrite() {
int rows = env.options().getNumberOfRows();
pipelineWrite
    .apply("Generate Sequence", GenerateSequence.from(0).to(rows))
    .apply("Prepare TestRows", ParDo.of(new DeterministicallyConstructTestRowFn()))
    .apply(
        "Write to DynamoDB",
        DynamoDBIO.<TestRow>write()
            .withAwsClientsProvider(clientProvider())
            .withWriteRequestMapperFn(row -> buildWriteRequest(row)));
pipelineWrite.run().waitUntilFinish();
}
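The mapper above delegates to a buildWriteRequest helper that is not shown in this snippet. Below is a minimal sketch of what such a helper might look like with the AWS SDK v1 model classes, assuming the mapper returns a KV of table name to WriteRequest; the attribute names and the table name are placeholders, not the actual test constants.
// Hypothetical sketch (not the actual Beam helper). Uses
// com.amazonaws.services.dynamodbv2.model.AttributeValue/PutRequest/WriteRequest
// and org.apache.beam.sdk.values.KV; attribute names and table name are placeholders.
private static KV<String, WriteRequest> buildWriteRequest(TestRow row) {
  AttributeValue id = new AttributeValue().withN(row.id().toString());
  AttributeValue name = new AttributeValue().withS(row.name());
  PutRequest put = new PutRequest().addItemEntry("rowId", id).addItemEntry("rowName", name);
  return KV.of("beam-dynamodb-it-table", new WriteRequest().withPutRequest(put));
}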
Use of org.apache.beam.sdk.io.common.TestRow in project beam by apache: the class DynamoDBIOIT, method runWrite (variant without an explicit AWS clients provider).
/**
* Write test dataset to DynamoDB.
*/
private void runWrite() {
int rows = env.options().getNumberOfRows();
pipelineWrite
    .apply("Generate Sequence", GenerateSequence.from(0).to(rows))
    .apply("Prepare TestRows", ParDo.of(new DeterministicallyConstructTestRowFn()))
    .apply(
        "Write to DynamoDB",
        DynamoDBIO.<TestRow>write().withWriteRequestMapperFn(row -> buildWriteRequest(row)));
pipelineWrite.run().waitUntilFinish();
}
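This second variant omits the explicit client provider. If it targets the AWS SDK v2 based module, the same kind of mapper could be sketched with the v2 builders; the attribute names and table name below are again placeholders rather than the actual test constants.
// Hypothetical sketch using software.amazon.awssdk.services.dynamodb.model builders
// (java.util.Map/HashMap imports assumed); names below are placeholders.
private static KV<String, WriteRequest> buildWriteRequest(TestRow row) {
  Map<String, AttributeValue> item = new HashMap<>();
  item.put("rowId", AttributeValue.builder().n(row.id().toString()).build());
  item.put("rowName", AttributeValue.builder().s(row.name()).build());
  PutRequest put = PutRequest.builder().item(item).build();
  return KV.of("beam-dynamodb-it-table", WriteRequest.builder().putRequest(put).build());
}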
Use of org.apache.beam.sdk.io.common.TestRow in project beam by apache: the class StreamingSnowflakeIOIT, method readDataFromStream.
private Set<TestRow> readDataFromStream() throws SQLException {
  // try-with-resources ensures the connection, statement and result set are closed
  // even if the query or the row conversion throws.
  try (Connection connection = dc.buildDatasource().getConnection();
      PreparedStatement statement =
          connection.prepareStatement(String.format("SELECT * FROM %s", TABLE));
      ResultSet resultSet = statement.executeQuery()) {
    return resultSetToJavaSet(resultSet);
  }
}
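The resultSetToJavaSet helper used above is not part of this snippet. A minimal sketch of what it might do, assuming the first two columns of the table are the TestRow id and name (the column order is an assumption):
// Hypothetical sketch: collect each JDBC result row into a TestRow.
private static Set<TestRow> resultSetToJavaSet(ResultSet resultSet) throws SQLException {
  Set<TestRow> rows = new HashSet<>();
  while (resultSet.next()) {
    rows.add(TestRow.create(resultSet.getInt(1), resultSet.getString(2)));
  }
  return rows;
}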
Use of org.apache.beam.sdk.io.common.TestRow in project beam by apache: the class SnsIOIT, method testWriteThenRead.
@Test
public void testWriteThenRead() {
ITOptions opts = env.options();
int rows = opts.getNumberOfRows();
// Write test dataset to SNS
pipelineWrite
    .apply("Generate Sequence", GenerateSequence.from(0).to(rows))
    .apply("Prepare TestRows", ParDo.of(new DeterministicallyConstructTestRowFn()))
    .apply(
        "Write to SNS",
        SnsIO.<TestRow>write()
            .withTopicArn(resources.snsTopic)
            .withPublishRequestBuilder(r -> PublishRequest.builder().message(r.name())));
// Read test dataset from SQS.
PCollection<String> output =
    pipelineRead
        .apply(
            "Read from SQS",
            SqsIO.read().withQueueUrl(resources.sqsQueue).withMaxNumRecords(rows))
        .apply("Extract message", MapElements.into(strings()).via(SnsIOIT::extractMessage));
PAssert.thatSingleton(output.apply("Count All", Count.globally())).isEqualTo((long) rows);
PAssert.that(output.apply(Combine.globally(new HashingFn()).withoutDefaults()))
    .containsInAnyOrder(getExpectedHashForRowCount(rows));
pipelineWrite.run();
pipelineRead.run();
}
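The extractMessage method referenced above is not shown here. Since SQS delivers SNS notifications wrapped in a JSON envelope, a plausible sketch is to pull the Message field out of the body with Jackson; the SqsMessage element type and the envelope handling are assumptions rather than the actual test code.
// Hypothetical sketch: unwrap the SNS notification envelope delivered via SQS.
private static String extractMessage(SqsMessage sqsMessage) {
  try {
    return new ObjectMapper().readTree(sqsMessage.getBody()).get("Message").asText();
  } catch (IOException e) {
    throw new RuntimeException("Unable to parse SQS message body", e);
  }
}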
Use of org.apache.beam.sdk.io.common.TestRow in project beam by apache: the class JdbcIOIT, method runRead.
/**
 * Read the test dataset from postgres and validate its contents.
 *
 * <p>When doing the validation, we wish to: 1. Ensure that *all* the rows are correct. 2. Provide
 * enough information in assertions so that it is easy to spot obvious errors (e.g. all elements
 * have a similar mistake, or "only 5 elements were generated" and the user wants to see what the
 * problem was).
 *
 * <p>We do not wish to generate and compare all of the expected values, so this method uses
 * hashing to ensure that all expected data is present. However, hashing does not provide easy
 * debugging information (failures like "every element was the empty string" are hard to see), so
 * we also: 1. Generate expected values for the first and last 500 rows. 2. Use containsInAnyOrder
 * to verify that their values are correct. The first/last 500 rows can be determined because we
 * know all rows have a unique id, so we can use the natural ordering of that key.
 */
private PipelineResult runRead() {
PCollection<TestRow> namesAndIds =
    pipelineRead
        .apply(
            JdbcIO.<TestRow>read()
                .withDataSourceConfiguration(JdbcIO.DataSourceConfiguration.create(dataSource))
                .withQuery(String.format("select name,id from %s;", tableName))
                .withRowMapper(new JdbcTestHelper.CreateTestRowOfNameAndId()))
        .apply(ParDo.of(new TimeMonitor<>(NAMESPACE, "read_time")));
PAssert.thatSingleton(namesAndIds.apply("Count All", Count.globally())).isEqualTo((long) numberOfRows);
PCollection<String> consolidatedHashcode =
    namesAndIds
        .apply(ParDo.of(new TestRow.SelectNameFn()))
        .apply("Hash row contents", Combine.globally(new HashingFn()).withoutDefaults());
PAssert.that(consolidatedHashcode).containsInAnyOrder(TestRow.getExpectedHashForRowCount(numberOfRows));
PCollection<List<TestRow>> frontOfList = namesAndIds.apply(Top.smallest(500));
Iterable<TestRow> expectedFrontOfList = TestRow.getExpectedValues(0, 500);
PAssert.thatSingletonIterable(frontOfList).containsInAnyOrder(expectedFrontOfList);
PCollection<List<TestRow>> backOfList = namesAndIds.apply(Top.largest(500));
Iterable<TestRow> expectedBackOfList = TestRow.getExpectedValues(numberOfRows - 500, numberOfRows);
PAssert.thatSingletonIterable(backOfList).containsInAnyOrder(expectedBackOfList);
return pipelineRead.run();
}
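The row mapper used above lives in JdbcTestHelper and is not part of this snippet. A minimal sketch of a JdbcIO.RowMapper that builds a TestRow from the name and id columns selected by the query (the actual helper may differ):
// Hypothetical sketch of a JdbcIO.RowMapper producing TestRows from the query result.
static class CreateTestRowOfNameAndId implements JdbcIO.RowMapper<TestRow> {
  @Override
  public TestRow mapRow(ResultSet resultSet) throws Exception {
    return TestRow.create(resultSet.getInt("id"), resultSet.getString("name"));
  }
}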