Search in sources :

Example 41 with BeamSqlEnv

use of org.apache.beam.sdk.extensions.sql.impl.BeamSqlEnv in project beam by apache.

In class SqlTransform, method expand:

/**
 * Builds a {@link BeamSqlEnv} around the tables derived from {@code input}, executes the
 * configured DDL statements, and converts the parsed query into a {@code PCollection<Row>}.
 *
 * @param input the input whose tables are exposed to the query under {@code PCOLLECTION_NAME}
 * @return the rows produced by the planned query
 */
@Override
public PCollection<Row> expand(PInput input) {
    // The input is wrapped in a read-only provider and registered with an in-memory meta store.
    TableProvider readOnlyInputs = new ReadOnlyTableProvider(PCOLLECTION_NAME, toTableMap(input));
    InMemoryMetaStore metaStore = new InMemoryMetaStore();
    metaStore.registerProvider(readOnlyInputs);

    BeamSqlEnvBuilder envBuilder = BeamSqlEnv.builder(metaStore);

    // TODO: validate duplicate functions.
    registerFunctions(envBuilder);

    // Auto-loaded providers are registered ahead of the user-supplied schemas added below.
    // NOTE(review): presumably so that a reused name surfaces as a conflict on the user-supplied
    // side -- confirm against the upstream comment, which is truncated here.
    if (autoLoading()) {
        envBuilder.autoLoadUserDefinedFunctions();
        for (TableProvider discovered : ServiceLoader.load(TableProvider.class)) {
            metaStore.registerProvider(discovered);
        }
    }
    tableProviderMap().forEach((schemaName, provider) -> envBuilder.addSchema(schemaName, provider));

    // Only switch the current schema when a default provider was configured.
    @Nullable final String currentSchema = defaultTableProvider();
    if (currentSchema != null) {
        envBuilder.setCurrentSchema(currentSchema);
    }

    // An explicitly configured planner takes precedence over the one named in pipeline options.
    envBuilder.setQueryPlannerClassName(
        MoreObjects.firstNonNull(
            queryPlannerClassName(),
            input.getPipeline().getOptions().as(BeamSqlPipelineOptions.class).getPlannerName()));
    envBuilder.setPipelineOptions(input.getPipeline().getOptions());

    BeamSqlEnv env = envBuilder.build();
    ddlStrings().forEach(ddl -> env.executeDdl(ddl));

    return BeamSqlRelUtils.toPCollection(
        input.getPipeline(),
        env.parseQuery(queryString(), queryParameters()),
        errorsTransformer());
}
Also used : BeamSqlPipelineOptions(org.apache.beam.sdk.extensions.sql.impl.BeamSqlPipelineOptions) ReadOnlyTableProvider(org.apache.beam.sdk.extensions.sql.meta.provider.ReadOnlyTableProvider) BeamSqlEnvBuilder(org.apache.beam.sdk.extensions.sql.impl.BeamSqlEnv.BeamSqlEnvBuilder) BeamSqlEnv(org.apache.beam.sdk.extensions.sql.impl.BeamSqlEnv) TableProvider(org.apache.beam.sdk.extensions.sql.meta.provider.TableProvider) InMemoryMetaStore(org.apache.beam.sdk.extensions.sql.meta.store.InMemoryMetaStore) Nullable(org.checkerframework.checker.nullness.qual.Nullable)

Example 42 with BeamSqlEnv

use of org.apache.beam.sdk.extensions.sql.impl.BeamSqlEnv in project beam by apache.

In class PubsubTableProviderIT, method testSQLReadAndWriteWithSameFlatTableDefinition:

/**
 * Checks that one pubsub table definition can be used for both writing and reading.
 *
 * <ul>
 *   <li>{@code pipeline}: uses SQL to insert data into {@code people}
 *   <li>{@code filterPipeline}: uses SQL to read from {@code people}, filter the rows, and write
 *       them to {@code javascript_people}
 * </ul>
 */
@Test
@SuppressWarnings("unchecked")
public void testSQLReadAndWriteWithSameFlatTableDefinition() throws Exception {
    // TBLPROPERTIES is only emitted when the objects provider reports a payload format.
    String peopleProperties;
    if (objectsProvider.getPayloadFormat() == null) {
        peopleProperties = "";
    } else {
        peopleProperties =
            String.format(
                "TBLPROPERTIES '{ \"protoClass\" : \"%s\", \"format\": \"%s\" }'",
                PayloadMessages.NameHeightKnowsJSMessage.class.getName(),
                objectsProvider.getPayloadFormat());
    }
    String peopleDdl =
        String.format(
            "CREATE EXTERNAL TABLE people (\n"
                + "event_timestamp TIMESTAMP, \n"
                + "name VARCHAR, \n"
                + "height INTEGER, \n"
                + "knows_javascript BOOLEAN \n"
                + ") \n"
                + "TYPE '%s' \n"
                + "LOCATION '%s' \n"
                + "%s",
            tableProvider.getTableType(),
            eventsTopic.topicPath(),
            peopleProperties);

    String filteredProperties;
    if (objectsProvider.getPayloadFormat() == null) {
        filteredProperties = "";
    } else {
        filteredProperties =
            String.format(
                "TBLPROPERTIES '{ \"protoClass\" : \"%s\", \"format\": \"%s\" }'",
                PayloadMessages.NameHeightMessage.class.getName(),
                objectsProvider.getPayloadFormat());
    }
    String filteredDdl =
        String.format(
            "CREATE EXTERNAL TABLE javascript_people (\n"
                + "event_timestamp TIMESTAMP, \n"
                + "name VARCHAR, \n"
                + "height INTEGER \n"
                + ") \n"
                + "TYPE '%s' \n"
                + "LOCATION '%s' \n"
                + "%s",
            tableProvider.getTableType(),
            filteredEventsTopic.topicPath(),
            filteredProperties);

    // Set up the SQL environment and declare both pubsub tables in it.
    BeamSqlEnv env = BeamSqlEnv.inMemory(new PubsubTableProvider());
    env.executeDdl(peopleDdl);
    env.executeDdl(filteredDdl);

    // TODO(BEAM-8741): Ideally these queries would not need an explicit column list, because it
    // shouldn't be possible to write to event_timestamp when it's mapped to publish time.
    String filterSql =
        "INSERT INTO javascript_people (name, height) (\n"
            + "  SELECT \n"
            + "    name, \n"
            + "    height \n"
            + "  FROM people \n"
            + "  WHERE knows_javascript \n"
            + ")";
    String injectSql =
        "INSERT INTO people (name, height, knows_javascript) VALUES \n"
            + "('person1', 80, TRUE),  \n"
            + "('person2', 70, FALSE), \n"
            + "('person3', 60, TRUE),  \n"
            + "('person4', 50, FALSE), \n"
            + "('person5', 40, TRUE)";

    // Attach the filtering query to filterPipeline and the injecting query to pipeline.
    query(env, filterPipeline, filterSql);
    query(env, pipeline, injectSql);

    // The filter pipeline is started first.
    filterPipeline.run();

    // Block until a subscription for the events topic exists ...
    Duration fiveMinutes = Duration.standardMinutes(5);
    String project = pipeline.getOptions().as(GcpOptions.class).getProject();
    eventsTopic.assertSubscriptionEventuallyCreated(project, fiveMinutes);

    // ... and only then run the injector pipeline.
    pipeline.run().waitUntilFinish(fiveMinutes);

    // Only the three rows with knows_javascript = TRUE are expected on the filtered topic.
    filteredEventsTopic
        .assertThatTopicEventuallyReceives(
            objectsProvider.matcherNameHeight("person1", 80),
            objectsProvider.matcherNameHeight("person3", 60),
            objectsProvider.matcherNameHeight("person5", 40))
        .waitForUpTo(fiveMinutes);
}
Also used : PayloadMessages(org.apache.beam.sdk.extensions.protobuf.PayloadMessages) BeamSqlEnv(org.apache.beam.sdk.extensions.sql.impl.BeamSqlEnv) Test(org.junit.Test)

Example 43 with BeamSqlEnv

use of org.apache.beam.sdk.extensions.sql.impl.BeamSqlEnv in project beam by apache.

In class PubsubTableProviderIT, method testSQLSelectsPayloadContent:

/**
 * Declares a pubsub table with a nested {@code payload} row, selects the payload's {@code id} and
 * {@code name}, publishes three messages, and waits for the success signal.
 */
@Test
public void testSQLSelectsPayloadContent() throws Exception {
    String ddl =
        String.format(
            "CREATE EXTERNAL TABLE message (\n"
                + "event_timestamp TIMESTAMP, \n"
                + "attributes MAP<VARCHAR, VARCHAR>, \n"
                + "payload ROW< \n"
                + "             id INTEGER, \n"
                + "             name VARCHAR \n"
                + "           > \n"
                + ") \n"
                + "TYPE '%s' \n"
                + "LOCATION '%s' \n"
                + "TBLPROPERTIES '{ "
                + "%s"
                + "\"protoClass\" : \"%s\", "
                + "\"timestampAttributeKey\" : \"ts\" }'",
            tableProvider.getTableType(),
            eventsTopic.topicPath(),
            payloadFormatParam(),
            PayloadMessages.SimpleMessage.class.getName());
    String sql = "SELECT message.payload.id, message.payload.name from message";

    // Set up the SQL environment and declare the pubsub table in it.
    BeamSqlEnv env = BeamSqlEnv.inMemory(new PubsubTableProvider());
    env.executeDdl(ddl);

    // Query the pubsub topic through SQL.
    PCollection<Row> rows = query(env, pipeline, sql);

    // Signal success once the observed rows equal exactly the three expected ones.
    rows.apply(
        "waitForSuccess",
        resultSignal.signalSuccessWhen(
            SchemaCoder.of(PAYLOAD_SCHEMA),
            seen -> {
                return seen.equals(
                    ImmutableSet.of(
                        row(PAYLOAD_SCHEMA, 3, "foo"),
                        row(PAYLOAD_SCHEMA, 5, "bar"),
                        row(PAYLOAD_SCHEMA, 7, "baz")));
            }));

    // Start the pipeline.
    pipeline.run();

    // Block until a subscription for the events topic exists.
    String project = pipeline.getOptions().as(GcpOptions.class).getProject();
    eventsTopic.assertSubscriptionEventuallyCreated(project, Duration.standardMinutes(5));

    // Publish the messages now that the main pipeline is started and the topic is ready.
    eventsTopic.publish(
        ImmutableList.of(
            objectsProvider.messageIdName(ts(1), 3, "foo"),
            objectsProvider.messageIdName(ts(2), 5, "bar"),
            objectsProvider.messageIdName(ts(3), 7, "baz")));

    // Poll the signaling topic for the success message.
    resultSignal.waitForSuccess(timeout);
}
Also used : Arrays(java.util.Arrays) LoggerFactory(org.slf4j.LoggerFactory) TimeoutException(java.util.concurrent.TimeoutException) PubsubMessage(org.apache.beam.sdk.io.gcp.pubsub.PubsubMessage) Future(java.util.concurrent.Future) TestPubsub(org.apache.beam.sdk.io.gcp.pubsub.TestPubsub) ResultSet(java.sql.ResultSet) Map(java.util.Map) TestPubsubSignal(org.apache.beam.sdk.io.gcp.pubsub.TestPubsubSignal) Parameterized(org.junit.runners.Parameterized) ImmutableMap(org.apache.beam.vendor.calcite.v1_28_0.com.google.common.collect.ImmutableMap) GcpOptions(org.apache.beam.sdk.extensions.gcp.options.GcpOptions) Matchers.allOf(org.hamcrest.Matchers.allOf) Collection(java.util.Collection) SchemaCoder(org.apache.beam.sdk.schemas.SchemaCoder) Set(java.util.Set) FieldType(org.apache.beam.sdk.schemas.Schema.FieldType) SchemaIOTableProviderWrapper(org.apache.beam.sdk.extensions.sql.meta.provider.SchemaIOTableProviderWrapper) Collectors(java.util.stream.Collectors) StandardCharsets(java.nio.charset.StandardCharsets) Executors(java.util.concurrent.Executors) ImmutableSet(org.apache.beam.vendor.calcite.v1_28_0.com.google.common.collect.ImmutableSet) Serializable(java.io.Serializable) List(java.util.List) Matchers.equalTo(org.hamcrest.Matchers.equalTo) JdbcDriver(org.apache.beam.sdk.extensions.sql.impl.JdbcDriver) ReflectHelpers(org.apache.beam.sdk.util.common.ReflectHelpers) ImmutableList(org.apache.beam.vendor.calcite.v1_28_0.com.google.common.collect.ImmutableList) GenericRecordBuilder(org.apache.avro.generic.GenericRecordBuilder) JsonMatcher.jsonBytesLike(org.apache.beam.sdk.testing.JsonMatcher.jsonBytesLike) ByteArrayOutputStream(java.io.ByteArrayOutputStream) InMemoryMetaStore(org.apache.beam.sdk.extensions.sql.meta.store.InMemoryMetaStore) Duration(org.joda.time.Duration) RunWith(org.junit.runner.RunWith) Parameters(org.junit.runners.Parameterized.Parameters) HashMap(java.util.HashMap) Callable(java.util.concurrent.Callable) 
Matchers.hasProperty(org.hamcrest.Matchers.hasProperty) BeamSqlRelUtils(org.apache.beam.sdk.extensions.sql.impl.rel.BeamSqlRelUtils) TestPipeline(org.apache.beam.sdk.testing.TestPipeline) MatcherAssert.assertThat(org.hamcrest.MatcherAssert.assertThat) BeamSqlEnv(org.apache.beam.sdk.extensions.sql.impl.BeamSqlEnv) Row(org.apache.beam.sdk.values.Row) PipelineOptions(org.apache.beam.sdk.options.PipelineOptions) ExecutorService(java.util.concurrent.ExecutorService) AvroUtils(org.apache.beam.sdk.schemas.utils.AvroUtils) Matchers.hasEntry(org.hamcrest.Matchers.hasEntry) GenericRecord(org.apache.avro.generic.GenericRecord) Logger(org.slf4j.Logger) UTF_8(java.nio.charset.StandardCharsets.UTF_8) Parameter(org.junit.runners.Parameterized.Parameter) ObjectMapper(com.fasterxml.jackson.databind.ObjectMapper) JdbcConnection(org.apache.beam.sdk.extensions.sql.impl.JdbcConnection) JsonProcessingException(com.fasterxml.jackson.core.JsonProcessingException) IOException(java.io.IOException) Test(org.junit.Test) PCollection(org.apache.beam.sdk.values.PCollection) AvroCoder(org.apache.beam.sdk.coders.AvroCoder) Schema(org.apache.beam.sdk.schemas.Schema) ExecutionException(java.util.concurrent.ExecutionException) TimeUnit(java.util.concurrent.TimeUnit) PayloadMessages(org.apache.beam.sdk.extensions.protobuf.PayloadMessages) Rule(org.junit.Rule) Ignore(org.junit.Ignore) Matcher(org.hamcrest.Matcher) Instant(org.joda.time.Instant) Statement(java.sql.Statement) TableProvider(org.apache.beam.sdk.extensions.sql.meta.provider.TableProvider) CalciteConnection(org.apache.beam.vendor.calcite.v1_28_0.org.apache.calcite.jdbc.CalciteConnection) BeamSqlEnv(org.apache.beam.sdk.extensions.sql.impl.BeamSqlEnv) Row(org.apache.beam.sdk.values.Row) Test(org.junit.Test)

Example 44 with BeamSqlEnv

use of org.apache.beam.sdk.extensions.sql.impl.BeamSqlEnv in project beam by apache.

In class PubsubTableProviderIT, method testSQLSelectsPayloadContentFlat:

/**
 * Declares a pubsub table with flat {@code id} and {@code name} columns, selects both, publishes
 * three messages, and waits for the success signal.
 */
@Test
public void testSQLSelectsPayloadContentFlat() throws Exception {
    String flatTableDdl =
        String.format(
            "CREATE EXTERNAL TABLE message (\n"
                + "event_timestamp TIMESTAMP, \n"
                + "id INTEGER, \n"
                + "name VARCHAR \n"
                + ") \n"
                + "TYPE '%s' \n"
                + "LOCATION '%s' \n"
                + "TBLPROPERTIES "
                + "    '{ "
                + "       %s"
                + "       \"protoClass\" : \"%s\", "
                + "       \"timestampAttributeKey\" : \"ts\" "
                + "     }'",
            tableProvider.getTableType(),
            eventsTopic.topicPath(),
            payloadFormatParam(),
            PayloadMessages.SimpleMessage.class.getName());
    String selectSql = "SELECT message.id, message.name from message";

    // Set up the SQL environment and declare the pubsub table in it.
    BeamSqlEnv env = BeamSqlEnv.inMemory(new PubsubTableProvider());
    env.executeDdl(flatTableDdl);

    // Query the pubsub topic through SQL.
    PCollection<Row> selected = query(env, pipeline, selectSql);

    // Signal success once the observed rows equal exactly the three expected ones.
    selected.apply(
        "waitForSuccess",
        resultSignal.signalSuccessWhen(
            SchemaCoder.of(PAYLOAD_SCHEMA),
            seen -> {
                return seen.equals(
                    ImmutableSet.of(
                        row(PAYLOAD_SCHEMA, 3, "foo"),
                        row(PAYLOAD_SCHEMA, 5, "bar"),
                        row(PAYLOAD_SCHEMA, 7, "baz")));
            }));

    // Start the pipeline.
    pipeline.run();

    // Block until a subscription for the events topic exists.
    String project = pipeline.getOptions().as(GcpOptions.class).getProject();
    eventsTopic.assertSubscriptionEventuallyCreated(project, Duration.standardMinutes(5));

    // Publish the messages now that the main pipeline is started and the topic is ready.
    eventsTopic.publish(
        ImmutableList.of(
            objectsProvider.messageIdName(ts(1), 3, "foo"),
            objectsProvider.messageIdName(ts(2), 5, "bar"),
            objectsProvider.messageIdName(ts(3), 7, "baz")));

    // Poll the signaling topic for the success message.
    resultSignal.waitForSuccess(timeout);
}
Also used : Arrays(java.util.Arrays) LoggerFactory(org.slf4j.LoggerFactory) TimeoutException(java.util.concurrent.TimeoutException) PubsubMessage(org.apache.beam.sdk.io.gcp.pubsub.PubsubMessage) Future(java.util.concurrent.Future) TestPubsub(org.apache.beam.sdk.io.gcp.pubsub.TestPubsub) ResultSet(java.sql.ResultSet) Map(java.util.Map) TestPubsubSignal(org.apache.beam.sdk.io.gcp.pubsub.TestPubsubSignal) Parameterized(org.junit.runners.Parameterized) ImmutableMap(org.apache.beam.vendor.calcite.v1_28_0.com.google.common.collect.ImmutableMap) GcpOptions(org.apache.beam.sdk.extensions.gcp.options.GcpOptions) Matchers.allOf(org.hamcrest.Matchers.allOf) Collection(java.util.Collection) SchemaCoder(org.apache.beam.sdk.schemas.SchemaCoder) Set(java.util.Set) FieldType(org.apache.beam.sdk.schemas.Schema.FieldType) SchemaIOTableProviderWrapper(org.apache.beam.sdk.extensions.sql.meta.provider.SchemaIOTableProviderWrapper) Collectors(java.util.stream.Collectors) StandardCharsets(java.nio.charset.StandardCharsets) Executors(java.util.concurrent.Executors) ImmutableSet(org.apache.beam.vendor.calcite.v1_28_0.com.google.common.collect.ImmutableSet) Serializable(java.io.Serializable) List(java.util.List) Matchers.equalTo(org.hamcrest.Matchers.equalTo) JdbcDriver(org.apache.beam.sdk.extensions.sql.impl.JdbcDriver) ReflectHelpers(org.apache.beam.sdk.util.common.ReflectHelpers) ImmutableList(org.apache.beam.vendor.calcite.v1_28_0.com.google.common.collect.ImmutableList) GenericRecordBuilder(org.apache.avro.generic.GenericRecordBuilder) JsonMatcher.jsonBytesLike(org.apache.beam.sdk.testing.JsonMatcher.jsonBytesLike) ByteArrayOutputStream(java.io.ByteArrayOutputStream) InMemoryMetaStore(org.apache.beam.sdk.extensions.sql.meta.store.InMemoryMetaStore) Duration(org.joda.time.Duration) RunWith(org.junit.runner.RunWith) Parameters(org.junit.runners.Parameterized.Parameters) HashMap(java.util.HashMap) Callable(java.util.concurrent.Callable) 
Matchers.hasProperty(org.hamcrest.Matchers.hasProperty) BeamSqlRelUtils(org.apache.beam.sdk.extensions.sql.impl.rel.BeamSqlRelUtils) TestPipeline(org.apache.beam.sdk.testing.TestPipeline) MatcherAssert.assertThat(org.hamcrest.MatcherAssert.assertThat) BeamSqlEnv(org.apache.beam.sdk.extensions.sql.impl.BeamSqlEnv) Row(org.apache.beam.sdk.values.Row) PipelineOptions(org.apache.beam.sdk.options.PipelineOptions) ExecutorService(java.util.concurrent.ExecutorService) AvroUtils(org.apache.beam.sdk.schemas.utils.AvroUtils) Matchers.hasEntry(org.hamcrest.Matchers.hasEntry) GenericRecord(org.apache.avro.generic.GenericRecord) Logger(org.slf4j.Logger) UTF_8(java.nio.charset.StandardCharsets.UTF_8) Parameter(org.junit.runners.Parameterized.Parameter) ObjectMapper(com.fasterxml.jackson.databind.ObjectMapper) JdbcConnection(org.apache.beam.sdk.extensions.sql.impl.JdbcConnection) JsonProcessingException(com.fasterxml.jackson.core.JsonProcessingException) IOException(java.io.IOException) Test(org.junit.Test) PCollection(org.apache.beam.sdk.values.PCollection) AvroCoder(org.apache.beam.sdk.coders.AvroCoder) Schema(org.apache.beam.sdk.schemas.Schema) ExecutionException(java.util.concurrent.ExecutionException) TimeUnit(java.util.concurrent.TimeUnit) PayloadMessages(org.apache.beam.sdk.extensions.protobuf.PayloadMessages) Rule(org.junit.Rule) Ignore(org.junit.Ignore) Matcher(org.hamcrest.Matcher) Instant(org.joda.time.Instant) Statement(java.sql.Statement) TableProvider(org.apache.beam.sdk.extensions.sql.meta.provider.TableProvider) CalciteConnection(org.apache.beam.vendor.calcite.v1_28_0.org.apache.calcite.jdbc.CalciteConnection) BeamSqlEnv(org.apache.beam.sdk.extensions.sql.impl.BeamSqlEnv) Row(org.apache.beam.sdk.values.Row) Test(org.junit.Test)

Example 45 with BeamSqlEnv

use of org.apache.beam.sdk.extensions.sql.impl.BeamSqlEnv in project beam by apache.

In class ThreeTablesSchema, method createThreeTables:

/**
 * Declares {@code small_table}, {@code medium_table} and {@code large_table} through DDL on the
 * given provider and fills them with 1, 3 and 100 rows respectively.
 *
 * @param tableProvider the provider that receives both the table definitions and the rows
 */
private void createThreeTables(TestTableProvider tableProvider) {
    BeamSqlEnv sqlEnv = BeamSqlEnv.withTableProvider(tableProvider);
    sqlEnv.executeDdl("CREATE EXTERNAL TABLE small_table (id INTEGER, medium_key INTEGER) TYPE text");
    sqlEnv.executeDdl(
        "CREATE EXTERNAL TABLE medium_table (id INTEGER,small_key INTEGER,large_key INTEGER) TYPE text");
    sqlEnv.executeDdl("CREATE EXTERNAL TABLE large_table (id INTEGER,medium_key INTEGER) TYPE text");

    // small_table: a single row (id = 1, medium_key = 1).
    tableProvider.addRows(
        "small_table",
        Row.withSchema(tableProvider.getTable("small_table").getSchema()).addValues(1, 1).build());

    // medium_table: ids 0..2, every row with small_key = 1 and large_key = 2.
    for (int mediumId = 0; mediumId < 3; mediumId++) {
        tableProvider.addRows(
            "medium_table",
            Row.withSchema(tableProvider.getTable("medium_table").getSchema())
                .addValues(mediumId, 1, 2)
                .build());
    }

    // large_table: ids 0..99, every row with medium_key = 2.
    for (int largeId = 0; largeId < 100; largeId++) {
        tableProvider.addRows(
            "large_table",
            Row.withSchema(tableProvider.getTable("large_table").getSchema())
                .addValues(largeId, 2)
                .build());
    }
}
Also used : BeamSqlEnv(org.apache.beam.sdk.extensions.sql.impl.BeamSqlEnv) Row(org.apache.beam.sdk.values.Row)

Aggregations

BeamSqlEnv (org.apache.beam.sdk.extensions.sql.impl.BeamSqlEnv)61 Test (org.junit.Test)54 Row (org.apache.beam.sdk.values.Row)36 TestTableProvider (org.apache.beam.sdk.extensions.sql.meta.provider.test.TestTableProvider)16 PipelineResult (org.apache.beam.sdk.PipelineResult)10 State (org.apache.beam.sdk.PipelineResult.State)10 Schema (org.apache.beam.sdk.schemas.Schema)10 TableProvider (org.apache.beam.sdk.extensions.sql.meta.provider.TableProvider)8 Arrays (java.util.Arrays)6 List (java.util.List)6 Map (java.util.Map)6 ExecutorService (java.util.concurrent.ExecutorService)6 Collectors (java.util.stream.Collectors)6 PayloadMessages (org.apache.beam.sdk.extensions.protobuf.PayloadMessages)6 JsonProcessingException (com.fasterxml.jackson.core.JsonProcessingException)5 ObjectMapper (com.fasterxml.jackson.databind.ObjectMapper)5 ByteArrayOutputStream (java.io.ByteArrayOutputStream)5 IOException (java.io.IOException)5 Serializable (java.io.Serializable)5 StandardCharsets (java.nio.charset.StandardCharsets)5