Use of org.apache.beam.sdk.transforms.SerializableFunction in project beam by apache.
From the class JdbcIOTest, the method testReadRowsWithoutStatementPreparator:
@Test
public void testReadRowsWithoutStatementPreparator() {
  SerializableFunction<Void, DataSource> dataSourceProvider = ignored -> DATA_SOURCE;
  String name = TestRow.getNameForSeed(1);

  PCollection<Row> rows =
      pipeline.apply(
          JdbcIO.readRows()
              .withDataSourceProviderFn(dataSourceProvider)
              .withQuery(
                  String.format(
                      "select name,id from %s where name = '%s'", READ_TABLE_NAME, name)));

  Schema expectedSchema =
      Schema.of(
          Schema.Field.of("NAME", LogicalTypes.variableLengthString(JDBCType.VARCHAR, 500))
              .withNullable(true),
          Schema.Field.of("ID", Schema.FieldType.INT32).withNullable(true));

  assertEquals(expectedSchema, rows.getSchema());

  PCollection<Row> output = rows.apply(Select.fieldNames("NAME", "ID"));
  PAssert.that(output)
      .containsInAnyOrder(
          ImmutableList.of(Row.withSchema(expectedSchema).addValues(name, 1).build()));

  pipeline.run();
}
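For context, SerializableFunction is a single-method function interface that extends java.io.Serializable, which is what lets the ignored -> DATA_SOURCE lambda be serialized and re-evaluated on each worker. Below is a minimal standalone sketch of the provider idiom; the JDBC URL is a hypothetical placeholder, and a String stands in for the test's DataSource:

import org.apache.beam.sdk.transforms.SerializableFunction;

public class DataSourceProviderSketch {
  public static void main(String[] args) {
    // The explicit SerializableFunction target type is the whole point:
    // a plain java.util.function.Function could not be shipped to workers.
    SerializableFunction<Void, String> jdbcUrlProvider =
        ignored -> "jdbc:derby:memory:testDB"; // hypothetical in-memory Derby URL
    System.out.println(jdbcUrlProvider.apply(null));
  }
}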
Use of org.apache.beam.sdk.transforms.SerializableFunction in project beam by apache.
From the class BigQueryHllSketchCompatibilityIT, the method readSketchFromBigQuery:
private void readSketchFromBigQuery(String tableId, Long expectedCount) {
  String tableSpec = String.format("%s.%s", DATASET_ID, tableId);
  String query =
      String.format(
          "SELECT HLL_COUNT.INIT(%s) AS %s FROM %s",
          DATA_FIELD_NAME, QUERY_RESULT_FIELD_NAME, tableSpec);

  SerializableFunction<SchemaAndRecord, byte[]> parseQueryResultToByteArray =
      input ->
          HllCount.getSketchFromByteBuffer(
              (ByteBuffer) input.getRecord().get(QUERY_RESULT_FIELD_NAME));

  TestPipelineOptions options =
      TestPipeline.testingPipelineOptions().as(TestPipelineOptions.class);
  Pipeline p = Pipeline.create(options);

  PCollection<Long> result =
      p.apply(
              BigQueryIO.read(parseQueryResultToByteArray)
                  .withFormat(DataFormat.AVRO)
                  .fromQuery(query)
                  .usingStandardSql()
                  .withMethod(Method.DIRECT_READ)
                  .withCoder(ByteArrayCoder.of()))
          .apply(HllCount.MergePartial.globally()) // no-op, only for testing MergePartial
          .apply(HllCount.Extract.globally());

  PAssert.thatSingleton(result).isEqualTo(expectedCount);
  p.run().waitUntilFinish();
}
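The parse function is the only piece of this pipeline that touches the Avro payload. Below is a minimal sketch of applying such a parseFn by hand, with a hypothetical one-column schema; passing null for the TableSchema is tolerable here only because this particular function never reads it:

import org.apache.avro.Schema;
import org.apache.avro.generic.GenericData;
import org.apache.avro.generic.GenericRecord;
import org.apache.beam.sdk.io.gcp.bigquery.SchemaAndRecord;
import org.apache.beam.sdk.transforms.SerializableFunction;

public class ParseFnSketch {
  public static void main(String[] args) {
    Schema avroSchema = new Schema.Parser().parse(
        "{\"type\":\"record\",\"name\":\"r\","
            + "\"fields\":[{\"name\":\"name\",\"type\":\"string\"}]}");
    GenericRecord record = new GenericData.Record(avroSchema);
    record.put("name", "alice");

    // Same shape as parseQueryResultToByteArray above: pull one field
    // out of the GenericRecord wrapped in a SchemaAndRecord.
    SerializableFunction<SchemaAndRecord, String> parseName =
        input -> input.getRecord().get("name").toString();

    System.out.println(parseName.apply(new SchemaAndRecord(record, null))); // alice
  }
}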
Use of org.apache.beam.sdk.transforms.SerializableFunction in project beam by apache.
From the class WatermarkPolicyTest, the method shouldAdvanceWatermarkWithCustomTimePolicy:
@Test
public void shouldAdvanceWatermarkWithCustomTimePolicy() {
  SerializableFunction<KinesisRecord, Instant> timestampFn =
      record -> record.getApproximateArrivalTimestamp().plus(Duration.standardMinutes(1));

  WatermarkPolicy policy =
      WatermarkPolicyFactory.withCustomWatermarkPolicy(
              WatermarkParameters.create().withTimestampFn(timestampFn))
          .createWatermarkPolicy();

  KinesisRecord a = mock(KinesisRecord.class);
  KinesisRecord b = mock(KinesisRecord.class);

  Instant time1 = NOW.minus(standardSeconds(30L));
  Instant time2 = NOW.minus(standardSeconds(20L));

  when(a.getApproximateArrivalTimestamp()).thenReturn(time1);
  when(b.getApproximateArrivalTimestamp()).thenReturn(time2);

  policy.update(a);
  assertThat(policy.getWatermark()).isEqualTo(time1.plus(Duration.standardMinutes(1)));

  policy.update(b);
  assertThat(policy.getWatermark()).isEqualTo(time2.plus(Duration.standardMinutes(1)));
}
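The behavior this test pins down is that the watermark tracks the maximum of timestampFn(record) over all records seen so far. A toy model of that contract follows (an illustration only, not Beam's WatermarkPolicy implementation; the arrival times are made up):

import org.apache.beam.sdk.transforms.SerializableFunction;
import org.joda.time.Duration;
import org.joda.time.Instant;

public class WatermarkContractSketch {
  public static void main(String[] args) {
    SerializableFunction<Instant, Instant> timestampFn =
        arrival -> arrival.plus(Duration.standardMinutes(1));

    Instant watermark = new Instant(0);
    Instant[] arrivals = {
      Instant.parse("2024-01-01T00:00:00Z"), Instant.parse("2024-01-01T00:00:30Z")
    };
    for (Instant arrival : arrivals) {
      Instant candidate = timestampFn.apply(arrival);
      // Watermarks only advance: keep the maximum candidate seen so far.
      if (candidate.isAfter(watermark)) {
        watermark = candidate;
      }
    }
    System.out.println(watermark); // 2024-01-01T00:01:30.000Z
  }
}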
Use of org.apache.beam.sdk.transforms.SerializableFunction in project beam by apache.
From the class ElasticsearchIOTestCommon, the method testMaxParallelRequestsPerWindow:
void testMaxParallelRequestsPerWindow() throws Exception {
  List<Document> data =
      ElasticsearchIOTestUtils.createDocuments(
              numDocs, ElasticsearchIOTestUtils.InjectionMode.DO_NOT_INJECT_INVALID_DOCS)
          .stream()
          .map(doc -> Document.create().withInputDoc(doc).withTimestamp(Instant.now()))
          .collect(Collectors.toList());

  Write write =
      ElasticsearchIO.write()
          .withConnectionConfiguration(connectionConfiguration)
          .withMaxParallelRequestsPerWindow(1);

  PCollection<KV<Integer, Iterable<Document>>> batches =
      pipeline.apply(Create.of(data)).apply(StatefulBatching.fromSpec(write.getBulkIO()));

  PCollection<Integer> keyValues =
      batches.apply(
          MapElements.into(integers())
              .via((SerializableFunction<KV<Integer, Iterable<Document>>, Integer>) KV::getKey));

  // The number of unique keys produced should equal
  // maxParallelRequestsPerWindow * numWindows. Here there is one request (key)
  // per window and a single global window, i.e. one key in total, whose value is 0.
  PAssert.that(keyValues).containsInAnyOrder(0);
  PAssert.that(batches).satisfies(new AssertThatHasExpectedContents(0, data));

  pipeline.run();
}
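The cast in the MapElements step deserves a note: via(...) is overloaded (it also accepts ProcessFunction, among others), so a bare KV::getKey would be ambiguous; the cast pins down both the overload and the serializable functional interface. A minimal sketch of the same disambiguation, outside any pipeline:

import org.apache.beam.sdk.transforms.SerializableFunction;
import org.apache.beam.sdk.values.KV;

public class KeyExtractorSketch {
  public static void main(String[] args) {
    // An explicitly typed variable plays the same role as the inline cast
    // in the test: it fixes KV::getKey to the SerializableFunction interface.
    SerializableFunction<KV<Integer, String>, Integer> getKey = KV::getKey;
    System.out.println(getKey.apply(KV.of(7, "doc"))); // prints 7
  }
}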
Use of org.apache.beam.sdk.transforms.SerializableFunction in project beam by apache.
From the class BigQuerySourceBase, the method createSources:
List<BoundedSource<T>> createSources(
    List<ResourceId> files, TableSchema schema, List<MatchResult.Metadata> metadata)
    throws IOException, InterruptedException {
  final String jsonSchema = BigQueryIO.JSON_FACTORY.toString(schema);

  SerializableFunction<GenericRecord, T> fnWrapper =
      new SerializableFunction<GenericRecord, T>() {
        // Memoized so the JSON schema is deserialized at most once per instance.
        private Supplier<TableSchema> schema =
            Suppliers.memoize(
                Suppliers.compose(new TableSchemaFunction(), Suppliers.ofInstance(jsonSchema)));

        @Override
        public T apply(GenericRecord input) {
          return parseFn.apply(new SchemaAndRecord(input, schema.get()));
        }
      };

  List<BoundedSource<T>> avroSources = Lists.newArrayList();
  // Use the matched file metadata when it is available; otherwise fall back
  // to resolving each file by path.
  if (metadata != null) {
    for (MatchResult.Metadata file : metadata) {
      avroSources.add(AvroSource.from(file).withParseFn(fnWrapper, getOutputCoder()));
    }
  } else {
    for (ResourceId file : files) {
      avroSources.add(AvroSource.from(file.toString()).withParseFn(fnWrapper, getOutputCoder()));
    }
  }
  return ImmutableList.copyOf(avroSources);
}
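The anonymous SerializableFunction with a memoized Supplier is a common per-worker initialization trick: only the JSON string is serialized, and the TableSchema is rebuilt lazily, at most once, after deserialization. A minimal sketch of the memoization half, using plain Guava rather than Beam's vendored copy:

import com.google.common.base.Supplier;
import com.google.common.base.Suppliers;

public class MemoizeSketch {
  public static void main(String[] args) {
    Supplier<String> schema = Suppliers.memoize(() -> {
      // In the real code this is the JSON-to-TableSchema parse;
      // memoize guarantees the body runs at most once per instance.
      System.out.println("parsing schema once");
      return "parsed-schema";
    });
    schema.get(); // triggers the parse
    schema.get(); // returns the cached value, no second parse
  }
}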