Use of org.apache.beam.sdk.transforms.SerializableFunction in project beam by apache.
From the class JdbcIOTest, the method testReadRowsWithoutStatementPreparator:
@Test
public void testReadRowsWithoutStatementPreparator() {
  SerializableFunction<Void, DataSource> dataSourceProvider = ignored -> DATA_SOURCE;
  String name = TestRow.getNameForSeed(1);

  PCollection<Row> rows =
      pipeline.apply(
          JdbcIO.readRows()
              .withDataSourceProviderFn(dataSourceProvider)
              .withQuery(
                  String.format(
                      "select name,id from %s where name = '%s'", READ_TABLE_NAME, name)));

  Schema expectedSchema =
      Schema.of(
          Schema.Field.of("NAME", LogicalTypes.variableLengthString(JDBCType.VARCHAR, 500))
              .withNullable(true),
          Schema.Field.of("ID", Schema.FieldType.INT32).withNullable(true));

  assertEquals(expectedSchema, rows.getSchema());

  PCollection<Row> output = rows.apply(Select.fieldNames("NAME", "ID"));
  PAssert.that(output)
      .containsInAnyOrder(
          ImmutableList.of(Row.withSchema(expectedSchema).addValues(name, 1).build()));

  pipeline.run();
}
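For context, SerializableFunction is a single-method function interface that extends java.io.Serializable, which is what lets the ignored -> DATA_SOURCE lambda be serialized and re-evaluated on each worker. Below is a minimal standalone sketch of the provider idiom; the JDBC URL is a hypothetical placeholder, and a String stands in for the test's DataSource:

import org.apache.beam.sdk.transforms.SerializableFunction;

public class DataSourceProviderSketch {
  public static void main(String[] args) {
    // The explicit SerializableFunction target type is the whole point:
    // a plain java.util.function.Function could not be shipped to workers.
    SerializableFunction<Void, String> jdbcUrlProvider =
        ignored -> "jdbc:derby:memory:testDB"; // hypothetical in-memory Derby URL
    System.out.println(jdbcUrlProvider.apply(null));
  }
}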
Use of org.apache.beam.sdk.transforms.SerializableFunction in project beam by apache.
From the class BigQueryHllSketchCompatibilityIT, the method readSketchFromBigQuery:
private void readSketchFromBigQuery(String tableId, Long expectedCount) {
  String tableSpec = String.format("%s.%s", DATASET_ID, tableId);
  String query =
      String.format(
          "SELECT HLL_COUNT.INIT(%s) AS %s FROM %s",
          DATA_FIELD_NAME, QUERY_RESULT_FIELD_NAME, tableSpec);

  SerializableFunction<SchemaAndRecord, byte[]> parseQueryResultToByteArray =
      input ->
          HllCount.getSketchFromByteBuffer(
              (ByteBuffer) input.getRecord().get(QUERY_RESULT_FIELD_NAME));

  TestPipelineOptions options =
      TestPipeline.testingPipelineOptions().as(TestPipelineOptions.class);
  Pipeline p = Pipeline.create(options);

  PCollection<Long> result =
      p.apply(
              BigQueryIO.read(parseQueryResultToByteArray)
                  .withFormat(DataFormat.AVRO)
                  .fromQuery(query)
                  .usingStandardSql()
                  .withMethod(Method.DIRECT_READ)
                  .withCoder(ByteArrayCoder.of()))
          .apply(HllCount.MergePartial.globally()) // no-op, only for testing MergePartial
          .apply(HllCount.Extract.globally());

  PAssert.thatSingleton(result).isEqualTo(expectedCount);
  p.run().waitUntilFinish();
}
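The parse function is the only piece of this pipeline that touches the Avro payload. Below is a minimal sketch of applying such a parseFn by hand, with a hypothetical one-column schema; passing null for the TableSchema is tolerable here only because this particular function never reads it:

import org.apache.avro.Schema;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericRecord;
import org.apache.beam.sdk.io.gcp.bigquery.SchemaAndRecord;
import org.apache.beam.sdk.transforms.SerializableFunction;

public class ParseFnSketch {
  public static void main(String[] args) {
    Schema avroSchema = new Schema.Parser().parse(
        "{\"type\":\"record\",\"name\":\"r\","
            + "\"fields\":[{\"name\":\"name\",\"type\":\"string\"}]}");
    GenericRecord record = new GenericData.Record(avroSchema);
    record.put("name", "alice");

    // Same shape as parseQueryResultToByteArray above: pull one field
    // out of the GenericRecord wrapped in a SchemaAndRecord.
    SerializableFunction<SchemaAndRecord, String> parseName =
        input -> input.getRecord().get("name").toString();

    System.out.println(parseName.apply(new SchemaAndRecord(record, null))); // alice
  }
}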
Use of org.apache.beam.sdk.transforms.SerializableFunction in project beam by apache.
From the class WatermarkPolicyTest, the method shouldAdvanceWatermarkWithCustomTimePolicy:
@Test
public void shouldAdvanceWatermarkWithCustomTimePolicy() {
  SerializableFunction<KinesisRecord, Instant> timestampFn =
      record -> record.getApproximateArrivalTimestamp().plus(Duration.standardMinutes(1));

  WatermarkPolicy policy =
      WatermarkPolicyFactory.withCustomWatermarkPolicy(
              WatermarkParameters.create().withTimestampFn(timestampFn))
          .createWatermarkPolicy();

  KinesisRecord a = mock(KinesisRecord.class);
  KinesisRecord b = mock(KinesisRecord.class);

  Instant time1 = NOW.minus(standardSeconds(30L));
  Instant time2 = NOW.minus(standardSeconds(20L));

  when(a.getApproximateArrivalTimestamp()).thenReturn(time1);
  when(b.getApproximateArrivalTimestamp()).thenReturn(time2);

  policy.update(a);
  assertThat(policy.getWatermark()).isEqualTo(time1.plus(Duration.standardMinutes(1)));

  policy.update(b);
  assertThat(policy.getWatermark()).isEqualTo(time2.plus(Duration.standardMinutes(1)));
}
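The behavior this test pins down is that the watermark tracks the maximum of timestampFn(record) over all records seen so far. A toy model of that contract follows (an illustration only, not Beam's WatermarkPolicy implementation; the arrival times are made up):

import org.apache.beam.sdk.transforms.SerializableFunction;
import org.joda.time.Duration;
import org.joda.time.Instant;

public class WatermarkContractSketch {
  public static void main(String[] args) {
    SerializableFunction<Instant, Instant> timestampFn =
        arrival -> arrival.plus(Duration.standardMinutes(1));

    Instant watermark = new Instant(0);
    Instant[] arrivals = {
      Instant.parse("2024-01-01T00:00:00Z"), Instant.parse("2024-01-01T00:00:30Z")
    };
    for (Instant arrival : arrivals) {
      Instant candidate = timestampFn.apply(arrival);
      // Watermarks only advance: keep the maximum candidate seen so far.
      if (candidate.isAfter(watermark)) {
        watermark = candidate;
      }
    }
    System.out.println(watermark); // 2024-01-01T00:01:30.000Z
  }
}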
Use of org.apache.beam.sdk.transforms.SerializableFunction in project beam by apache.
From the class ElasticsearchIOTestCommon, the method testMaxParallelRequestsPerWindow:
void testMaxParallelRequestsPerWindow() throws Exception {
  List<Document> data =
      ElasticsearchIOTestUtils.createDocuments(
              numDocs, ElasticsearchIOTestUtils.InjectionMode.DO_NOT_INJECT_INVALID_DOCS)
          .stream()
          .map(doc -> Document.create().withInputDoc(doc).withTimestamp(Instant.now()))
          .collect(Collectors.toList());

  Write write =
      ElasticsearchIO.write()
          .withConnectionConfiguration(connectionConfiguration)
          .withMaxParallelRequestsPerWindow(1);

  PCollection<KV<Integer, Iterable<Document>>> batches =
      pipeline.apply(Create.of(data)).apply(StatefulBatching.fromSpec(write.getBulkIO()));

  PCollection<Integer> keyValues =
      batches.apply(
          MapElements.into(integers())
              .via((SerializableFunction<KV<Integer, Iterable<Document>>, Integer>) KV::getKey));

  // The number of unique keys produced should equal
  // maxParallelRequestsPerWindow * numWindows. Here there is one request (key)
  // per window and a single global window, i.e. one key in total, whose value is 0.
  PAssert.that(keyValues).containsInAnyOrder(0);
  PAssert.that(batches).satisfies(new AssertThatHasExpectedContents(0, data));

  pipeline.run();
}
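The cast in the MapElements step deserves a note: via(...) is overloaded (it also accepts ProcessFunction, among others), so a bare KV::getKey would be ambiguous; the cast pins down both the overload and the serializable functional interface. A minimal sketch of the same disambiguation, outside any pipeline:

import org.apache.beam.sdk.transforms.SerializableFunction;
import org.apache.beam.sdk.values.KV;

public class KeyExtractorSketch {
  public static void main(String[] args) {
    // An explicitly typed variable plays the same role as the inline cast
    // in the test: it fixes KV::getKey to the SerializableFunction interface.
    SerializableFunction<KV<Integer, String>, Integer> getKey = KV::getKey;
    System.out.println(getKey.apply(KV.of(7, "doc"))); // prints 7
  }
}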
Use of org.apache.beam.sdk.transforms.SerializableFunction in project beam by apache.
From the class BigQuerySourceBase, the method createSources:
List<BoundedSource<T>> createSources(
    List<ResourceId> files, TableSchema schema, List<MatchResult.Metadata> metadata)
    throws IOException, InterruptedException {
  final String jsonSchema = BigQueryIO.JSON_FACTORY.toString(schema);

  SerializableFunction<GenericRecord, T> fnWrapper =
      new SerializableFunction<GenericRecord, T>() {
        // Memoized so the JSON schema is deserialized at most once per instance.
        private Supplier<TableSchema> schema =
            Suppliers.memoize(
                Suppliers.compose(new TableSchemaFunction(), Suppliers.ofInstance(jsonSchema)));

        @Override
        public T apply(GenericRecord input) {
          return parseFn.apply(new SchemaAndRecord(input, schema.get()));
        }
      };

  List<BoundedSource<T>> avroSources = Lists.newArrayList();
  // Use the matched file metadata when it is available; otherwise fall back
  // to resolving each file by path.
  if (metadata != null) {
    for (MatchResult.Metadata file : metadata) {
      avroSources.add(AvroSource.from(file).withParseFn(fnWrapper, getOutputCoder()));
    }
  } else {
    for (ResourceId file : files) {
      avroSources.add(AvroSource.from(file.toString()).withParseFn(fnWrapper, getOutputCoder()));
    }
  }
  return ImmutableList.copyOf(avroSources);
}
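The anonymous SerializableFunction with a memoized Supplier is a common per-worker initialization trick: only the JSON string is serialized, and the TableSchema is rebuilt lazily, at most once, after deserialization. A minimal sketch of the memoization half, using plain Guava rather than Beam's vendored copy:

import com.google.common.base.Supplier;
import com.google.common.base.Suppliers;

public class MemoizeSketch {
  public static void main(String[] args) {
    Supplier<String> schema = Suppliers.memoize(() -> {
      // In the real code this is the JSON-to-TableSchema parse;
      // memoize guarantees the body runs at most once per instance.
      System.out.println("parsing schema once");
      return "parsed-schema";
    });
    schema.get(); // triggers the parse
    schema.get(); // returns the cached value, no second parse
  }
}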