Use of org.apache.beam.sdk.extensions.sql.impl.BeamSqlEnv in project beam by apache.
The class SqlTransform, method expand.
@Override
public PCollection<Row> expand(PInput input) {
  TableProvider inputTableProvider =
      new ReadOnlyTableProvider(PCOLLECTION_NAME, toTableMap(input));
  InMemoryMetaStore metaTableProvider = new InMemoryMetaStore();
  metaTableProvider.registerProvider(inputTableProvider);
  BeamSqlEnvBuilder sqlEnvBuilder = BeamSqlEnv.builder(metaTableProvider);

  // TODO: validate duplicate functions.
  registerFunctions(sqlEnvBuilder);

  // Load automatic table providers before user ones so the user ones will cause a conflict if
  // the same names are reused.
  if (autoLoading()) {
    sqlEnvBuilder.autoLoadUserDefinedFunctions();
    ServiceLoader.load(TableProvider.class).forEach(metaTableProvider::registerProvider);
  }

  tableProviderMap().forEach(sqlEnvBuilder::addSchema);

  @Nullable final String defaultTableProvider = defaultTableProvider();
  if (defaultTableProvider != null) {
    sqlEnvBuilder.setCurrentSchema(defaultTableProvider);
  }

  sqlEnvBuilder.setQueryPlannerClassName(
      MoreObjects.firstNonNull(
          queryPlannerClassName(),
          input.getPipeline().getOptions().as(BeamSqlPipelineOptions.class).getPlannerName()));

  sqlEnvBuilder.setPipelineOptions(input.getPipeline().getOptions());

  BeamSqlEnv sqlEnv = sqlEnvBuilder.build();
  ddlStrings().forEach(sqlEnv::executeDdl);

  return BeamSqlRelUtils.toPCollection(
      input.getPipeline(),
      sqlEnv.parseQuery(queryString(), queryParameters()),
      errorsTransformer());
}
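For context, expand runs when a SqlTransform is applied in a user pipeline; the input rows become the implicit PCOLLECTION table registered through the ReadOnlyTableProvider above. A minimal sketch of a typical call site (the schema, values, and query are illustrative, not taken from the Beam sources):

void applySqlTransform(Pipeline p) {
  Schema schema = Schema.builder().addInt32Field("id").addStringField("name").build();
  PCollection<Row> rows =
      p.apply(
          Create.of(Row.withSchema(schema).addValues(1, "foo").build())
              .withRowSchema(schema));
  // Applying SqlTransform invokes expand(); the rows are queryable as PCOLLECTION.
  PCollection<Row> result =
      rows.apply(SqlTransform.query("SELECT id, name FROM PCOLLECTION"));
  p.run().waitUntilFinish();
}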
Use of org.apache.beam.sdk.extensions.sql.impl.BeamSqlEnv in project beam by apache.
The class PubsubTableProviderIT, method testSQLReadAndWriteWithSameFlatTableDefinition.
@Test
@SuppressWarnings("unchecked")
public void testSQLReadAndWriteWithSameFlatTableDefinition() throws Exception {
  // This test verifies that the same pubsub table definition can be used for both reading
  // and writing.
  // pipeline: Use SQL to insert data into `people`
  // filterPipeline: Use SQL to read from `people`, filter the rows, and write to
  // `javascript_people`
  String tblProperties =
      objectsProvider.getPayloadFormat() == null
          ? ""
          : String.format(
              "TBLPROPERTIES '{ \"protoClass\" : \"%s\", \"format\": \"%s\" }'",
              PayloadMessages.NameHeightKnowsJSMessage.class.getName(),
              objectsProvider.getPayloadFormat());
  String createTableString =
      String.format(
          "CREATE EXTERNAL TABLE people (\n"
              + "event_timestamp TIMESTAMP, \n"
              + "name VARCHAR, \n"
              + "height INTEGER, \n"
              + "knows_javascript BOOLEAN \n"
              + ") \n"
              + "TYPE '%s' \n"
              + "LOCATION '%s' \n"
              + "%s",
          tableProvider.getTableType(), eventsTopic.topicPath(), tblProperties);

  String filteredTblProperties =
      objectsProvider.getPayloadFormat() == null
          ? ""
          : String.format(
              "TBLPROPERTIES '{ \"protoClass\" : \"%s\", \"format\": \"%s\" }'",
              PayloadMessages.NameHeightMessage.class.getName(),
              objectsProvider.getPayloadFormat());
  String createFilteredTableString =
      String.format(
          "CREATE EXTERNAL TABLE javascript_people (\n"
              + "event_timestamp TIMESTAMP, \n"
              + "name VARCHAR, \n"
              + "height INTEGER \n"
              + ") \n"
              + "TYPE '%s' \n"
              + "LOCATION '%s' \n"
              + "%s",
          tableProvider.getTableType(), filteredEventsTopic.topicPath(), filteredTblProperties);

  // Initialize the SQL environment and create the pubsub tables
  BeamSqlEnv sqlEnv = BeamSqlEnv.inMemory(new PubsubTableProvider());
  sqlEnv.executeDdl(createTableString);
  sqlEnv.executeDdl(createFilteredTableString);

  // TODO(BEAM-8741): Ideally we could write these queries without specifying a column list,
  // because it shouldn't be possible to write to event_timestamp when it's mapped to
  // publish time.
  String filterQueryString =
      "INSERT INTO javascript_people (name, height) (\n"
          + "  SELECT \n"
          + "    name, \n"
          + "    height \n"
          + "  FROM people \n"
          + "  WHERE knows_javascript \n"
          + ")";
  String injectQueryString =
      "INSERT INTO people (name, height, knows_javascript) VALUES \n"
          + "('person1', 80, TRUE), \n"
          + "('person2', 70, FALSE), \n"
          + "('person3', 60, TRUE), \n"
          + "('person4', 50, FALSE), \n"
          + "('person5', 40, TRUE)";

  // Apply the PTransform to do the filtering
  query(sqlEnv, filterPipeline, filterQueryString);

  // Apply the PTransform to inject the input data
  query(sqlEnv, pipeline, injectQueryString);

  // Start the filter pipeline and wait until it has started.
  filterPipeline.run();

  // Block until a subscription for this topic exists
  eventsTopic.assertSubscriptionEventuallyCreated(
      pipeline.getOptions().as(GcpOptions.class).getProject(), Duration.standardMinutes(5));

  // ... then run the injector pipeline
  pipeline.run().waitUntilFinish(Duration.standardMinutes(5));

  filteredEventsTopic
      .assertThatTopicEventuallyReceives(
          objectsProvider.matcherNameHeight("person1", 80),
          objectsProvider.matcherNameHeight("person3", 60),
          objectsProvider.matcherNameHeight("person5", 40))
      .waitForUpTo(Duration.standardMinutes(5));
}
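The query(sqlEnv, pipeline, queryString) helper used by these tests is not shown on this page. A plausible sketch of what it does, mirroring the BeamSqlRelUtils call in SqlTransform.expand above (the actual helper in PubsubTableProviderIT may differ):

private PCollection<Row> query(BeamSqlEnv sqlEnv, TestPipeline pipeline, String queryString) {
  // Parse the SQL into a Beam relational plan and expand that plan onto the pipeline.
  return BeamSqlRelUtils.toPCollection(pipeline, sqlEnv.parseQuery(queryString));
}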
Use of org.apache.beam.sdk.extensions.sql.impl.BeamSqlEnv in project beam by apache.
The class PubsubTableProviderIT, method testSQLSelectsPayloadContent.
@Test
public void testSQLSelectsPayloadContent() throws Exception {
  String createTableString =
      String.format(
          "CREATE EXTERNAL TABLE message (\n"
              + "event_timestamp TIMESTAMP, \n"
              + "attributes MAP<VARCHAR, VARCHAR>, \n"
              + "payload ROW< \n"
              + "  id INTEGER, \n"
              + "  name VARCHAR \n"
              + "> \n"
              + ") \n"
              + "TYPE '%s' \n"
              + "LOCATION '%s' \n"
              + "TBLPROPERTIES '{ "
              + "%s"
              + "\"protoClass\" : \"%s\", "
              + "\"timestampAttributeKey\" : \"ts\" }'",
          tableProvider.getTableType(),
          eventsTopic.topicPath(),
          payloadFormatParam(),
          PayloadMessages.SimpleMessage.class.getName());
  String queryString = "SELECT message.payload.id, message.payload.name from message";

  // Initialize the SQL environment and create the pubsub table
  BeamSqlEnv sqlEnv = BeamSqlEnv.inMemory(new PubsubTableProvider());
  sqlEnv.executeDdl(createTableString);

  // Apply the PTransform to query the pubsub topic
  PCollection<Row> queryOutput = query(sqlEnv, pipeline, queryString);

  // Observe the query results and send the success signal after seeing the expected messages
  queryOutput.apply(
      "waitForSuccess",
      resultSignal.signalSuccessWhen(
          SchemaCoder.of(PAYLOAD_SCHEMA),
          observedRows ->
              observedRows.equals(
                  ImmutableSet.of(
                      row(PAYLOAD_SCHEMA, 3, "foo"),
                      row(PAYLOAD_SCHEMA, 5, "bar"),
                      row(PAYLOAD_SCHEMA, 7, "baz")))));

  // Start the pipeline
  pipeline.run();

  // Block until a subscription for this topic exists
  eventsTopic.assertSubscriptionEventuallyCreated(
      pipeline.getOptions().as(GcpOptions.class).getProject(), Duration.standardMinutes(5));

  // Publish the messages once the main pipeline is started and the signaling topic is ready
  eventsTopic.publish(
      ImmutableList.of(
          objectsProvider.messageIdName(ts(1), 3, "foo"),
          objectsProvider.messageIdName(ts(2), 5, "bar"),
          objectsProvider.messageIdName(ts(3), 7, "baz")));

  // Poll the signaling topic for the success message
  resultSignal.waitForSuccess(timeout);
}
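BeamSqlEnv.inMemory(...) used in these tests is a convenience factory over the builder seen in SqlTransform.expand. A rough sketch of what the call above is equivalent to (default pipeline options are an assumption):

InMemoryMetaStore metaStore = new InMemoryMetaStore();
metaStore.registerProvider(new PubsubTableProvider());
// Roughly what BeamSqlEnv.inMemory(new PubsubTableProvider()) builds.
BeamSqlEnv sqlEnv =
    BeamSqlEnv.builder(metaStore)
        .setPipelineOptions(PipelineOptionsFactory.create())
        .build();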
Use of org.apache.beam.sdk.extensions.sql.impl.BeamSqlEnv in project beam by apache.
The class PubsubTableProviderIT, method testSQLSelectsPayloadContentFlat.
@Test
public void testSQLSelectsPayloadContentFlat() throws Exception {
  String createTableString =
      String.format(
          "CREATE EXTERNAL TABLE message (\n"
              + "event_timestamp TIMESTAMP, \n"
              + "id INTEGER, \n"
              + "name VARCHAR \n"
              + ") \n"
              + "TYPE '%s' \n"
              + "LOCATION '%s' \n"
              + "TBLPROPERTIES "
              + "  '{ "
              + "     %s"
              + "     \"protoClass\" : \"%s\", "
              + "     \"timestampAttributeKey\" : \"ts\" "
              + "   }'",
          tableProvider.getTableType(),
          eventsTopic.topicPath(),
          payloadFormatParam(),
          PayloadMessages.SimpleMessage.class.getName());
  String queryString = "SELECT message.id, message.name from message";

  // Initialize the SQL environment and create the pubsub table
  BeamSqlEnv sqlEnv = BeamSqlEnv.inMemory(new PubsubTableProvider());
  sqlEnv.executeDdl(createTableString);

  // Apply the PTransform to query the pubsub topic
  PCollection<Row> queryOutput = query(sqlEnv, pipeline, queryString);

  // Observe the query results and send the success signal after seeing the expected messages
  queryOutput.apply(
      "waitForSuccess",
      resultSignal.signalSuccessWhen(
          SchemaCoder.of(PAYLOAD_SCHEMA),
          observedRows ->
              observedRows.equals(
                  ImmutableSet.of(
                      row(PAYLOAD_SCHEMA, 3, "foo"),
                      row(PAYLOAD_SCHEMA, 5, "bar"),
                      row(PAYLOAD_SCHEMA, 7, "baz")))));

  // Start the pipeline
  pipeline.run();

  // Block until a subscription for this topic exists
  eventsTopic.assertSubscriptionEventuallyCreated(
      pipeline.getOptions().as(GcpOptions.class).getProject(), Duration.standardMinutes(5));

  // Publish the messages once the main pipeline is started and the signaling topic is ready
  eventsTopic.publish(
      ImmutableList.of(
          objectsProvider.messageIdName(ts(1), 3, "foo"),
          objectsProvider.messageIdName(ts(2), 5, "bar"),
          objectsProvider.messageIdName(ts(3), 7, "baz")));

  // Poll the signaling topic for the success message
  resultSignal.waitForSuccess(timeout);
}
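Both payload tests build their expected rows with a row(...) helper over PAYLOAD_SCHEMA, neither of which appears on this page. A minimal sketch of what they presumably look like (the nullable field types are an assumption):

static final Schema PAYLOAD_SCHEMA =
    Schema.builder()
        .addNullableField("id", Schema.FieldType.INT32)
        .addNullableField("name", Schema.FieldType.STRING)
        .build();

static Row row(Schema schema, Object... values) {
  // Build a Row with the given schema and positional values.
  return Row.withSchema(schema).addValues(values).build();
}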
Use of org.apache.beam.sdk.extensions.sql.impl.BeamSqlEnv in project beam by apache.
The class ThreeTablesSchema, method createThreeTables.
private void createThreeTables(TestTableProvider tableProvider) {
  BeamSqlEnv env = BeamSqlEnv.withTableProvider(tableProvider);

  env.executeDdl("CREATE EXTERNAL TABLE small_table (id INTEGER, medium_key INTEGER) TYPE text");
  env.executeDdl(
      "CREATE EXTERNAL TABLE medium_table ("
          + "id INTEGER,"
          + "small_key INTEGER,"
          + "large_key INTEGER"
          + ") TYPE text");
  env.executeDdl(
      "CREATE EXTERNAL TABLE large_table ("
          + "id INTEGER,"
          + "medium_key INTEGER"
          + ") TYPE text");

  Row row =
      Row.withSchema(tableProvider.getTable("small_table").getSchema()).addValues(1, 1).build();
  tableProvider.addRows("small_table", row);

  for (int i = 0; i < 3; i++) {
    row =
        Row.withSchema(tableProvider.getTable("medium_table").getSchema())
            .addValues(i, 1, 2)
            .build();
    tableProvider.addRows("medium_table", row);
  }

  for (int i = 0; i < 100; i++) {
    row =
        Row.withSchema(tableProvider.getTable("large_table").getSchema())
            .addValues(i, 2)
            .build();
    tableProvider.addRows("large_table", row);
  }
}
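A short sketch of how the three tables could be exercised once created; the join conditions below are illustrative guesses based on the column names, not taken from the ThreeTablesSchema tests:

void queryThreeTables() {
  TestTableProvider tableProvider = new TestTableProvider();
  createThreeTables(tableProvider);
  BeamSqlEnv env = BeamSqlEnv.withTableProvider(tableProvider);

  Pipeline pipeline = Pipeline.create();
  // Parse a three-way join and expand its relational plan onto the pipeline.
  PCollection<Row> joined =
      BeamSqlRelUtils.toPCollection(
          pipeline,
          env.parseQuery(
              "SELECT small_table.id, large_table.id "
                  + "FROM small_table "
                  + "JOIN medium_table ON small_table.medium_key = medium_table.id "
                  + "JOIN large_table ON large_table.medium_key = medium_table.id"));
}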